hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a199__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
- hpcflow/_version.py +1 -1
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/environments.yaml +1 -1
- hpcflow/sdk/__init__.py +5 -0
- hpcflow/sdk/app.py +150 -89
- hpcflow/sdk/cli.py +263 -84
- hpcflow/sdk/cli_common.py +99 -5
- hpcflow/sdk/config/callbacks.py +38 -1
- hpcflow/sdk/config/config.py +102 -13
- hpcflow/sdk/config/errors.py +19 -5
- hpcflow/sdk/config/types.py +3 -0
- hpcflow/sdk/core/__init__.py +25 -1
- hpcflow/sdk/core/actions.py +914 -262
- hpcflow/sdk/core/cache.py +76 -34
- hpcflow/sdk/core/command_files.py +14 -128
- hpcflow/sdk/core/commands.py +35 -6
- hpcflow/sdk/core/element.py +122 -50
- hpcflow/sdk/core/errors.py +58 -2
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/loop.py +408 -50
- hpcflow/sdk/core/loop_cache.py +4 -4
- hpcflow/sdk/core/parameters.py +382 -37
- hpcflow/sdk/core/run_dir_files.py +13 -40
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +119 -30
- hpcflow/sdk/core/task_schema.py +68 -0
- hpcflow/sdk/core/test_utils.py +66 -27
- hpcflow/sdk/core/types.py +54 -1
- hpcflow/sdk/core/utils.py +78 -7
- hpcflow/sdk/core/workflow.py +1538 -336
- hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
- hpcflow/sdk/demo/cli.py +7 -0
- hpcflow/sdk/helper/cli.py +1 -0
- hpcflow/sdk/log.py +42 -15
- hpcflow/sdk/persistence/base.py +405 -53
- hpcflow/sdk/persistence/json.py +177 -52
- hpcflow/sdk/persistence/pending.py +237 -69
- hpcflow/sdk/persistence/store_resource.py +3 -2
- hpcflow/sdk/persistence/types.py +15 -4
- hpcflow/sdk/persistence/zarr.py +928 -81
- hpcflow/sdk/submission/jobscript.py +1408 -489
- hpcflow/sdk/submission/schedulers/__init__.py +40 -5
- hpcflow/sdk/submission/schedulers/direct.py +33 -19
- hpcflow/sdk/submission/schedulers/sge.py +51 -16
- hpcflow/sdk/submission/schedulers/slurm.py +44 -16
- hpcflow/sdk/submission/schedulers/utils.py +7 -2
- hpcflow/sdk/submission/shells/base.py +68 -20
- hpcflow/sdk/submission/shells/bash.py +222 -129
- hpcflow/sdk/submission/shells/powershell.py +200 -150
- hpcflow/sdk/submission/submission.py +852 -119
- hpcflow/sdk/submission/types.py +18 -21
- hpcflow/sdk/typing.py +24 -5
- hpcflow/sdk/utils/arrays.py +71 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +12 -0
- hpcflow/sdk/utils/strings.py +33 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +19 -0
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +821 -70
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
- hpcflow/tests/unit/test_action.py +176 -0
- hpcflow/tests/unit/test_app.py +20 -0
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +133 -0
- hpcflow/tests/unit/test_config.py +122 -1
- hpcflow/tests/unit/test_element_iteration.py +47 -0
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_loop.py +1332 -27
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
- hpcflow/tests/unit/test_parameter.py +13 -0
- hpcflow/tests/unit/test_persistence.py +190 -8
- hpcflow/tests/unit/test_run.py +109 -3
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_shell.py +20 -0
- hpcflow/tests/unit/test_submission.py +5 -76
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +332 -0
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +142 -2
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6587 -3
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/METADATA +7 -4
- hpcflow_new2-0.2.0a199.dist-info/RECORD +221 -0
- hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a199.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/persistence/zarr.py
CHANGED
@@ -24,6 +24,7 @@ from numcodecs import MsgPack, VLenArray, blosc, Blosc, Zstd # type: ignore
|
|
24
24
|
from reretry import retry # type: ignore
|
25
25
|
|
26
26
|
from hpcflow.sdk.typing import hydrate
|
27
|
+
from hpcflow.sdk.core import RUN_DIR_ARR_DTYPE, RUN_DIR_ARR_FILL
|
27
28
|
from hpcflow.sdk.core.errors import (
|
28
29
|
MissingParameterData,
|
29
30
|
MissingStoreEARError,
|
@@ -53,9 +54,22 @@ from hpcflow.sdk.persistence.utils import ask_pw_on_auth_exc
|
|
53
54
|
from hpcflow.sdk.persistence.pending import CommitResourceMap
|
54
55
|
from hpcflow.sdk.persistence.base import update_param_source_dict
|
55
56
|
from hpcflow.sdk.log import TimeIt
|
57
|
+
from hpcflow.sdk.submission.submission import (
|
58
|
+
JOBSCRIPT_SUBMIT_TIME_KEYS,
|
59
|
+
SUBMISSION_SUBMIT_TIME_KEYS,
|
60
|
+
)
|
61
|
+
from hpcflow.sdk.utils.arrays import get_2D_idx, split_arr
|
62
|
+
from hpcflow.sdk.utils.strings import shorten_list_str
|
56
63
|
|
57
64
|
if TYPE_CHECKING:
|
58
|
-
from collections.abc import
|
65
|
+
from collections.abc import (
|
66
|
+
Callable,
|
67
|
+
Iterable,
|
68
|
+
Iterator,
|
69
|
+
Mapping,
|
70
|
+
MutableMapping,
|
71
|
+
Sequence,
|
72
|
+
)
|
59
73
|
from datetime import datetime
|
60
74
|
from fsspec import AbstractFileSystem # type: ignore
|
61
75
|
from logging import Logger
|
@@ -65,16 +79,16 @@ if TYPE_CHECKING:
|
|
65
79
|
from zarr import Array, Group # type: ignore
|
66
80
|
from zarr.attrs import Attributes # type: ignore
|
67
81
|
from zarr.storage import Store # type: ignore
|
82
|
+
from ..submission.types import ResolvedJobscriptBlockDependencies
|
68
83
|
from .types import TypeLookup
|
69
84
|
from ..app import BaseApp
|
70
85
|
from ..core.json_like import JSONed, JSONDocument
|
71
|
-
from ..typing import ParamSource, PathLike
|
72
|
-
|
86
|
+
from ..typing import ParamSource, PathLike, DataIndex
|
73
87
|
|
74
88
|
#: List of any (Zarr-serializable) value.
|
75
89
|
ListAny: TypeAlias = "list[Any]"
|
76
90
|
#: Zarr attribute mapping context.
|
77
|
-
ZarrAttrs: TypeAlias = "dict[str,
|
91
|
+
ZarrAttrs: TypeAlias = "dict[str, Any]"
|
78
92
|
_JS: TypeAlias = "dict[str, list[dict[str, dict]]]"
|
79
93
|
|
80
94
|
|
@@ -329,6 +343,8 @@ class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
|
|
329
343
|
self.metadata,
|
330
344
|
self.run_hostname,
|
331
345
|
self.commands_idx,
|
346
|
+
self.port_number,
|
347
|
+
self.commands_file_ID,
|
332
348
|
]
|
333
349
|
|
334
350
|
@override
|
@@ -351,6 +367,8 @@ class ZarrStoreEAR(StoreEAR[ListAny, ZarrAttrs]):
|
|
351
367
|
"metadata": EAR_dat[12],
|
352
368
|
"run_hostname": EAR_dat[13],
|
353
369
|
"commands_idx": EAR_dat[14],
|
370
|
+
"port_number": EAR_dat[15],
|
371
|
+
"commands_file_ID": EAR_dat[16],
|
354
372
|
}
|
355
373
|
return cls(is_pending=False, **obj_dat)
|
356
374
|
|
@@ -420,10 +438,17 @@ class ZarrPersistentStore(
|
|
420
438
|
_param_sources_arr_name: ClassVar[str] = "sources"
|
421
439
|
_param_user_arr_grp_name: ClassVar[str] = "arrays"
|
422
440
|
_param_data_arr_grp_name: ClassVar = lambda _, param_idx: f"param_{param_idx}"
|
441
|
+
_subs_md_group_name: ClassVar[str] = "submissions"
|
423
442
|
_task_arr_name: ClassVar[str] = "tasks"
|
424
443
|
_elem_arr_name: ClassVar[str] = "elements"
|
425
444
|
_iter_arr_name: ClassVar[str] = "iters"
|
426
445
|
_EAR_arr_name: ClassVar[str] = "runs"
|
446
|
+
_run_dir_arr_name: ClassVar[str] = "run_dirs"
|
447
|
+
_js_at_submit_md_arr_name: ClassVar[str] = "js_at_submit_md"
|
448
|
+
_js_run_IDs_arr_name: ClassVar[str] = "js_run_IDs"
|
449
|
+
_js_task_elems_arr_name: ClassVar[str] = "js_task_elems"
|
450
|
+
_js_task_acts_arr_name: ClassVar[str] = "js_task_acts"
|
451
|
+
_js_deps_arr_name: ClassVar[str] = "js_deps"
|
427
452
|
_time_res: ClassVar[str] = "us" # microseconds; must not be smaller than micro!
|
428
453
|
|
429
454
|
_res_map: ClassVar[CommitResourceMap] = CommitResourceMap(
|
@@ -437,6 +462,26 @@ class ZarrPersistentStore(
|
|
437
462
|
app, name="attrs", open_call=self._get_root_group
|
438
463
|
),
|
439
464
|
}
|
465
|
+
self._jobscript_at_submit_metadata: dict[
|
466
|
+
int, dict[str, Any]
|
467
|
+
] = {} # this is a cache
|
468
|
+
|
469
|
+
# these are caches; keys are submission index and then tuples of
|
470
|
+
# (jobscript index, jobscript-block index):
|
471
|
+
self._jobscript_run_ID_arrays: dict[int, dict[tuple[int, int], NDArray]] = {}
|
472
|
+
self._jobscript_task_element_maps: dict[
|
473
|
+
int, dict[tuple[int, int], dict[int, list[int]]]
|
474
|
+
] = {}
|
475
|
+
self._jobscript_task_actions_arrays: dict[
|
476
|
+
int, dict[tuple[int, int], NDArray]
|
477
|
+
] = {}
|
478
|
+
self._jobscript_dependencies: dict[
|
479
|
+
int,
|
480
|
+
dict[
|
481
|
+
tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]
|
482
|
+
],
|
483
|
+
] = {}
|
484
|
+
|
440
485
|
super().__init__(app, workflow, path, fs)
|
441
486
|
|
442
487
|
@contextmanager
|
@@ -514,7 +559,11 @@ class ZarrPersistentStore(
|
|
514
559
|
root = zarr.group(store=store, overwrite=False)
|
515
560
|
root.attrs.update(attrs)
|
516
561
|
|
517
|
-
|
562
|
+
# use a nested directory store for the metadata group so the runs array
|
563
|
+
# can be stored as a 2D array in nested directories, thereby limiting the maximum
|
564
|
+
# number of files stored in a given directory:
|
565
|
+
md_store = zarr.NestedDirectoryStore(Path(root.store.path).joinpath("metadata"))
|
566
|
+
md = zarr.group(store=md_store)
|
518
567
|
|
519
568
|
compressor_lookup = {
|
520
569
|
"blosc": Blosc,
|
@@ -561,13 +610,24 @@ class ZarrPersistentStore(
|
|
561
610
|
|
562
611
|
EARs_arr = md.create_dataset(
|
563
612
|
name=cls._EAR_arr_name,
|
564
|
-
shape=0,
|
613
|
+
shape=(0, 1000),
|
565
614
|
dtype=object,
|
566
615
|
object_codec=cls._CODEC,
|
567
616
|
chunks=1, # single-chunk rows for multiprocess writing
|
568
617
|
compressor=cmp,
|
618
|
+
dimension_separator="/",
|
619
|
+
)
|
620
|
+
EARs_arr.attrs.update({"parameter_paths": [], "num_runs": 0})
|
621
|
+
|
622
|
+
# array for storing indices that can be used to reproduce run directory paths:
|
623
|
+
run_dir_arr = md.create_dataset(
|
624
|
+
name=cls._run_dir_arr_name,
|
625
|
+
shape=0,
|
626
|
+
chunks=10_000,
|
627
|
+
dtype=RUN_DIR_ARR_DTYPE,
|
628
|
+
fill_value=RUN_DIR_ARR_FILL,
|
629
|
+
write_empty_chunks=False,
|
569
630
|
)
|
570
|
-
EARs_arr.attrs["parameter_paths"] = []
|
571
631
|
|
572
632
|
parameter_data = root.create_group(name=cls._param_grp_name)
|
573
633
|
parameter_data.create_dataset(
|
@@ -590,6 +650,9 @@ class ZarrPersistentStore(
|
|
590
650
|
)
|
591
651
|
parameter_data.create_group(name=cls._param_user_arr_grp_name)
|
592
652
|
|
653
|
+
# for storing submission metadata that should not be stored in the root group:
|
654
|
+
md.create_group(name=cls._subs_md_group_name)
|
655
|
+
|
593
656
|
def _append_tasks(self, tasks: Iterable[ZarrStoreTask]):
|
594
657
|
elem_IDs_arr = self._get_tasks_arr(mode="r+")
|
595
658
|
elem_IDs: list[int] = []
|
@@ -614,12 +677,339 @@ class ZarrPersistentStore(
|
|
614
677
|
{
|
615
678
|
"num_added_iterations": loop["num_added_iterations"],
|
616
679
|
"iterable_parameters": loop["iterable_parameters"],
|
680
|
+
"output_parameters": loop["output_parameters"],
|
617
681
|
"parents": loop["parents"],
|
618
682
|
}
|
619
683
|
)
|
620
684
|
attrs["template"]["loops"].append(loop["loop_template"])
|
621
685
|
|
622
|
-
|
686
|
+
@staticmethod
|
687
|
+
def _extract_submission_run_IDs_array(
|
688
|
+
sub_js: Mapping[str, JSONed],
|
689
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
690
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
691
|
+
jobscript-block run ID lists into a single array with a fill value.
|
692
|
+
|
693
|
+
Notes
|
694
|
+
-----
|
695
|
+
This mutates `sub_js`, by setting `EAR_ID` jobscript-block keys to `None`.
|
696
|
+
|
697
|
+
Parameters
|
698
|
+
----------
|
699
|
+
sub_js
|
700
|
+
JSON-like representation of a `Submission` object.
|
701
|
+
|
702
|
+
Returns
|
703
|
+
-------
|
704
|
+
combined_run_IDs
|
705
|
+
Integer Numpy array that contains a concatenation of all 2D run ID arrays
|
706
|
+
from each jobscript-block. Technically a "jagged"/"ragged" array that is made
|
707
|
+
square with a large fill value.
|
708
|
+
block_shapes
|
709
|
+
List of length equal to the number of jobscripts in the submission. Each
|
710
|
+
sub-list contains a list of shapes (as a two-item list:
|
711
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
712
|
+
|
713
|
+
"""
|
714
|
+
arrs = []
|
715
|
+
max_acts, max_elems = 0, 0
|
716
|
+
|
717
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
718
|
+
block_shapes = []
|
719
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
720
|
+
block_shapes_js_i = []
|
721
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
722
|
+
run_IDs_i = np.array(blk["EAR_ID"])
|
723
|
+
blk["EAR_ID"] = None # TODO: how to type?
|
724
|
+
block_shapes_js_i.append(list(run_IDs_i.shape))
|
725
|
+
if run_IDs_i.shape[0] > max_acts:
|
726
|
+
max_acts = run_IDs_i.shape[0]
|
727
|
+
if run_IDs_i.shape[1] > max_elems:
|
728
|
+
max_elems = run_IDs_i.shape[1]
|
729
|
+
arrs.append(run_IDs_i)
|
730
|
+
block_shapes.append(block_shapes_js_i)
|
731
|
+
|
732
|
+
combined_run_IDs = np.full(
|
733
|
+
(len(arrs), max_acts, max_elems),
|
734
|
+
dtype=np.uint32,
|
735
|
+
fill_value=np.iinfo(np.uint32).max,
|
736
|
+
)
|
737
|
+
for arr_idx, arr in enumerate(arrs):
|
738
|
+
combined_run_IDs[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
739
|
+
|
740
|
+
return combined_run_IDs, block_shapes
|
741
|
+
|
742
|
+
@staticmethod
|
743
|
+
def _extract_submission_task_elements_array(
|
744
|
+
sub_js: Mapping[str, JSONed],
|
745
|
+
) -> tuple[np.ndarray, list[list[list[int]]]]:
|
746
|
+
"""For a JSON-like representation of a Submission object, remove and combine all
|
747
|
+
jobscript-block task-element mappings into a single array with a fill value.
|
748
|
+
|
749
|
+
Notes
|
750
|
+
-----
|
751
|
+
This mutates `sub_js`, by setting `task_elements` jobscript-block keys to `None`.
|
752
|
+
|
753
|
+
Parameters
|
754
|
+
----------
|
755
|
+
sub_js
|
756
|
+
JSON-like representation of a `Submission` object.
|
757
|
+
|
758
|
+
Returns
|
759
|
+
-------
|
760
|
+
combined_task_elems
|
761
|
+
Integer Numpy array that contains a concatenation of each task-element,
|
762
|
+
mapping, where each mapping is expressed as a 2D array whose first column
|
763
|
+
corresponds to the keys of the mappings, and whose remaining columns
|
764
|
+
correspond to the values of the mappings. Technically a "jagged"/"ragged"
|
765
|
+
array that is made square with a large fill value.
|
766
|
+
block_shapes
|
767
|
+
List of length equal to the number of jobscripts in the submission. Each
|
768
|
+
sub-list contains a list of shapes (as a two-item list:
|
769
|
+
`[num_actions, num_elements]`) of the constituent blocks of that jobscript.
|
770
|
+
|
771
|
+
"""
|
772
|
+
arrs = []
|
773
|
+
max_x, max_y = 0, 0
|
774
|
+
|
775
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
776
|
+
block_shapes = []
|
777
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
778
|
+
block_shapes_js_i = []
|
779
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
780
|
+
|
781
|
+
task_elems_lst = []
|
782
|
+
for k, v in cast("Mapping[int, list[int]]", blk["task_elements"]).items():
|
783
|
+
task_elems_lst.append([k] + v)
|
784
|
+
task_elems_i = np.array(task_elems_lst)
|
785
|
+
|
786
|
+
block_shape_j = [task_elems_i.shape[1] - 1, task_elems_i.shape[0]]
|
787
|
+
block_shapes_js_i.append(block_shape_j)
|
788
|
+
|
789
|
+
blk["task_elements"] = None # TODO: how to type?
|
790
|
+
if task_elems_i.shape[1] > max_x:
|
791
|
+
max_x = task_elems_i.shape[1]
|
792
|
+
if task_elems_i.shape[0] > max_y:
|
793
|
+
max_y = task_elems_i.shape[0]
|
794
|
+
arrs.append(task_elems_i)
|
795
|
+
block_shapes.append(block_shapes_js_i)
|
796
|
+
|
797
|
+
combined_task_elems = np.full(
|
798
|
+
(len(arrs), max_y, max_x),
|
799
|
+
dtype=np.uint32,
|
800
|
+
fill_value=np.iinfo(np.uint32).max,
|
801
|
+
)
|
802
|
+
for arr_idx, arr in enumerate(arrs):
|
803
|
+
combined_task_elems[arr_idx][: arr.shape[0], : arr.shape[1]] = arr
|
804
|
+
|
805
|
+
return combined_task_elems, block_shapes
|
806
|
+
|
807
|
+
@staticmethod
|
808
|
+
def _extract_submission_task_actions_array(
|
809
|
+
sub_js: Mapping[str, JSONed],
|
810
|
+
) -> tuple[np.ndarray, list[list[int]]]:
|
811
|
+
"""For a JSON-like representation of a Submission object, remove and concatenate
|
812
|
+
all jobscript-block task-action arrays into a single array.
|
813
|
+
|
814
|
+
Notes
|
815
|
+
-----
|
816
|
+
This mutates `sub_js`, by setting `task_actions` jobscript-block keys to `None`.
|
817
|
+
|
818
|
+
Parameters
|
819
|
+
----------
|
820
|
+
sub_js
|
821
|
+
JSON-like representation of a `Submission` object.
|
822
|
+
|
823
|
+
Returns
|
824
|
+
-------
|
825
|
+
combined_task_acts
|
826
|
+
Integer 2D Numpy array which is a concatenation along the first axis of
|
827
|
+
task-action actions from all jobscript blocks. The second dimension is of
|
828
|
+
length three.
|
829
|
+
block_num_acts
|
830
|
+
List of length equal to the number of jobscripts in the submission. Each
|
831
|
+
sub-list contains a list of `num_actions` of the constituent blocks of that
|
832
|
+
jobscript.
|
833
|
+
|
834
|
+
"""
|
835
|
+
arrs = []
|
836
|
+
|
837
|
+
# a list for each jobscript, containing shapes of run ID arrays in each block:
|
838
|
+
|
839
|
+
blk_num_acts = []
|
840
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
841
|
+
|
842
|
+
blk_num_acts_js_i = []
|
843
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
844
|
+
|
845
|
+
blk_acts = np.array(blk["task_actions"])
|
846
|
+
blk["task_actions"] = None # TODO: how to type?
|
847
|
+
blk_num_acts_js_i.append(blk_acts.shape[0])
|
848
|
+
arrs.append(blk_acts)
|
849
|
+
|
850
|
+
blk_num_acts.append(blk_num_acts_js_i)
|
851
|
+
|
852
|
+
combined_task_acts = np.vstack(arrs)
|
853
|
+
|
854
|
+
return combined_task_acts, blk_num_acts
|
855
|
+
|
856
|
+
@staticmethod
|
857
|
+
def _encode_jobscript_block_dependencies(sub_js: Mapping[str, JSONed]) -> np.ndarray:
|
858
|
+
"""For a JSON-like representation of a Submission object, remove jobscript-block
|
859
|
+
dependencies for all jobscripts and transform to a single 1D integer array, that
|
860
|
+
can be transformed back by `_decode_jobscript_block_dependencies`.
|
861
|
+
|
862
|
+
Notes
|
863
|
+
-----
|
864
|
+
This mutates `sub_js`, by setting `depdendencies` jobscript-block keys to `None`.
|
865
|
+
"""
|
866
|
+
|
867
|
+
# TODO: avoid this horrible mess of casts
|
868
|
+
|
869
|
+
all_deps_arr = []
|
870
|
+
assert sub_js["jobscripts"] is not None
|
871
|
+
for js in cast("Sequence[Mapping[str, JSONed]]", sub_js["jobscripts"]):
|
872
|
+
for blk in cast("Sequence[MutableMapping[str, JSONed]]", js["blocks"]):
|
873
|
+
all_deps_i: list[int] = []
|
874
|
+
assert blk["dependencies"] is not None
|
875
|
+
blk_deps = cast(
|
876
|
+
"list[tuple[tuple[int, int], Mapping[str, JSONed]]]",
|
877
|
+
blk["dependencies"],
|
878
|
+
)
|
879
|
+
for (dep_js_idx, dep_blk_idx), dep in blk_deps:
|
880
|
+
deps_arr: list[int] = []
|
881
|
+
for elem_i, elements_j in cast(
|
882
|
+
"Mapping[int, Sequence[int]]", dep["js_element_mapping"]
|
883
|
+
).items():
|
884
|
+
deps_arr.extend([len(elements_j) + 1, elem_i] + list(elements_j))
|
885
|
+
blk_arr = [
|
886
|
+
dep_js_idx,
|
887
|
+
dep_blk_idx,
|
888
|
+
int(cast("bool", dep["is_array"])),
|
889
|
+
] + deps_arr
|
890
|
+
blk_arr = [len(blk_arr)] + blk_arr
|
891
|
+
all_deps_i.extend(blk_arr)
|
892
|
+
all_deps_i = [
|
893
|
+
cast("int", js["index"]),
|
894
|
+
cast("int", blk["index"]),
|
895
|
+
] + all_deps_i
|
896
|
+
blk["dependencies"] = None # TODO: how to type?
|
897
|
+
all_deps_arr.extend([len(all_deps_i)] + all_deps_i)
|
898
|
+
|
899
|
+
return np.array(all_deps_arr)
|
900
|
+
|
901
|
+
@staticmethod
|
902
|
+
def _decode_jobscript_block_dependencies(
|
903
|
+
arr: np.ndarray,
|
904
|
+
) -> dict[tuple[int, int], dict[tuple[int, int], ResolvedJobscriptBlockDependencies]]:
|
905
|
+
"""Re-generate jobscript-block dependencies that have been transformed by
|
906
|
+
`_encode_jobscript_block_dependencies` into a single 1D integer array.
|
907
|
+
|
908
|
+
Parameters
|
909
|
+
----------
|
910
|
+
arr:
|
911
|
+
The 1D integer array to transform back to a verbose jobscript-block dependency
|
912
|
+
mapping.
|
913
|
+
"""
|
914
|
+
# metadata is js/blk_idx for which the dependencies are stored:
|
915
|
+
block_arrs = split_arr(arr, metadata_size=2)
|
916
|
+
block_deps = {}
|
917
|
+
for i in block_arrs:
|
918
|
+
|
919
|
+
js_idx: int
|
920
|
+
blk_idx: int
|
921
|
+
dep_js_idx: int
|
922
|
+
dep_blk_idx: int
|
923
|
+
is_array: int
|
924
|
+
|
925
|
+
js_idx, blk_idx = i[0]
|
926
|
+
# metadata is js/blk_idx that this block depends on, plus whether the
|
927
|
+
# dependency is an array dependency:
|
928
|
+
deps_arrs = split_arr(i[1], metadata_size=3)
|
929
|
+
all_deps_ij: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] = {}
|
930
|
+
for j in deps_arrs:
|
931
|
+
dep_js_idx, dep_blk_idx, is_array = j[0]
|
932
|
+
# no metadata:
|
933
|
+
elem_deps = split_arr(j[1], metadata_size=0)
|
934
|
+
all_deps_ij[(dep_js_idx, dep_blk_idx)] = {
|
935
|
+
"js_element_mapping": {},
|
936
|
+
"is_array": bool(is_array),
|
937
|
+
}
|
938
|
+
for k in elem_deps:
|
939
|
+
all_deps_ij[(dep_js_idx, dep_blk_idx)]["js_element_mapping"].update(
|
940
|
+
{k[1][0]: list(k[1][1:])}
|
941
|
+
)
|
942
|
+
|
943
|
+
block_deps[(js_idx, blk_idx)] = all_deps_ij
|
944
|
+
return block_deps
|
945
|
+
|
946
|
+
def _append_submissions(self, subs: dict[int, Mapping[str, JSONed]]):
|
947
|
+
|
948
|
+
for sub_idx, sub_i in subs.items():
|
949
|
+
|
950
|
+
# add a new metadata group for this submission:
|
951
|
+
sub_grp = self._get_all_submissions_metadata_group(mode="r+").create_group(
|
952
|
+
sub_idx
|
953
|
+
)
|
954
|
+
|
955
|
+
# add a new at-submit metadata array for jobscripts of this submission:
|
956
|
+
num_js = len(cast("list", sub_i["jobscripts"]))
|
957
|
+
sub_grp.create_dataset(
|
958
|
+
name=self._js_at_submit_md_arr_name,
|
959
|
+
shape=num_js,
|
960
|
+
dtype=object,
|
961
|
+
object_codec=MsgPack(),
|
962
|
+
chunks=1,
|
963
|
+
write_empty_chunks=False,
|
964
|
+
)
|
965
|
+
|
966
|
+
# add a new array to store run IDs for each jobscript:
|
967
|
+
combined_run_IDs, block_shapes = self._extract_submission_run_IDs_array(sub_i)
|
968
|
+
run_IDs_arr = sub_grp.create_dataset(
|
969
|
+
name=self._js_run_IDs_arr_name,
|
970
|
+
data=combined_run_IDs,
|
971
|
+
chunks=(None, None, None), # single chunk for the whole array
|
972
|
+
)
|
973
|
+
run_IDs_arr.attrs["block_shapes"] = block_shapes
|
974
|
+
|
975
|
+
# add a new array to store task-element map for each jobscript:
|
976
|
+
(
|
977
|
+
combined_task_elems,
|
978
|
+
block_shapes,
|
979
|
+
) = self._extract_submission_task_elements_array(sub_i)
|
980
|
+
task_elems_arr = sub_grp.create_dataset(
|
981
|
+
name=self._js_task_elems_arr_name,
|
982
|
+
data=combined_task_elems,
|
983
|
+
chunks=(None, None, None),
|
984
|
+
)
|
985
|
+
task_elems_arr.attrs["block_shapes"] = block_shapes
|
986
|
+
|
987
|
+
# add a new array to store task-actions for each jobscript:
|
988
|
+
(
|
989
|
+
combined_task_acts,
|
990
|
+
block_num_acts,
|
991
|
+
) = self._extract_submission_task_actions_array(sub_i)
|
992
|
+
task_acts_arr = sub_grp.create_dataset(
|
993
|
+
name=self._js_task_acts_arr_name,
|
994
|
+
data=combined_task_acts,
|
995
|
+
chunks=(None, None),
|
996
|
+
)
|
997
|
+
task_acts_arr.attrs["block_num_acts"] = block_num_acts
|
998
|
+
|
999
|
+
# add a new array to store jobscript-block dependencies for this submission:
|
1000
|
+
sub_grp.create_dataset(
|
1001
|
+
name=self._js_deps_arr_name,
|
1002
|
+
data=self._encode_jobscript_block_dependencies(sub_i),
|
1003
|
+
chunks=(None,),
|
1004
|
+
)
|
1005
|
+
|
1006
|
+
# TODO: store block shapes in `grp.attrs` since it is defined at the
|
1007
|
+
# submission level
|
1008
|
+
|
1009
|
+
# add attributes for at-submit-time submission metadata:
|
1010
|
+
grp = self._get_submission_metadata_group(sub_idx, mode="r+")
|
1011
|
+
grp.attrs["submission_parts"] = {}
|
1012
|
+
|
623
1013
|
with self.using_resource("attrs", action="update") as attrs:
|
624
1014
|
attrs["submissions"].extend(subs.values())
|
625
1015
|
|
@@ -694,20 +1084,29 @@ class ZarrPersistentStore(
|
|
694
1084
|
arr[iter_ID] = store_iter.encode(attrs)
|
695
1085
|
# attrs shouldn't be mutated (TODO: test!)
|
696
1086
|
|
697
|
-
def
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
1087
|
+
def _update_at_submit_metadata(
|
1088
|
+
self,
|
1089
|
+
at_submit_metadata: dict[int, dict[str, Any]],
|
1090
|
+
):
|
1091
|
+
for sub_idx, metadata_i in at_submit_metadata.items():
|
1092
|
+
grp = self._get_submission_metadata_group(sub_idx, mode="r+")
|
1093
|
+
attrs = self.__as_dict(grp.attrs)
|
1094
|
+
attrs["submission_parts"].update(metadata_i["submission_parts"])
|
1095
|
+
grp.attrs.put(attrs)
|
1096
|
+
|
1097
|
+
def _update_loop_index(self, loop_indices: dict[int, dict[str, int]]):
|
703
1098
|
|
704
|
-
def _update_loop_index(self, iter_ID: int, loop_idx: Mapping[str, int]):
|
705
1099
|
arr = self._get_iters_arr(mode="r+")
|
706
1100
|
attrs = self.__as_dict(arr.attrs)
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
1101
|
+
iter_IDs = list(loop_indices.keys())
|
1102
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
1103
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
1104
|
+
|
1105
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
1106
|
+
new_iter_i = store_iters[idx].update_loop_idx(loop_indices[iter_ID_i])
|
1107
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1108
|
+
# object array, so set one-by-one:
|
1109
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
711
1110
|
|
712
1111
|
def _update_loop_num_iters(self, index: int, num_iters: list[list[list[int] | int]]):
|
713
1112
|
with self.using_resource("attrs", action="update") as attrs:
|
@@ -717,73 +1116,165 @@ class ZarrPersistentStore(
|
|
717
1116
|
with self.using_resource("attrs", action="update") as attrs:
|
718
1117
|
attrs["loops"][index]["parents"] = parents
|
719
1118
|
|
1119
|
+
def _update_iter_data_indices(self, iter_data_indices: dict[int, DataIndex]):
|
1120
|
+
|
1121
|
+
arr = self._get_iters_arr(mode="r+")
|
1122
|
+
attrs = self.__as_dict(arr.attrs)
|
1123
|
+
iter_IDs = list(iter_data_indices.keys())
|
1124
|
+
iter_dat = arr.get_coordinate_selection(iter_IDs)
|
1125
|
+
store_iters = [ZarrStoreElementIter.decode(i, attrs) for i in iter_dat]
|
1126
|
+
|
1127
|
+
for idx, iter_ID_i in enumerate(iter_IDs):
|
1128
|
+
new_iter_i = store_iters[idx].update_data_idx(iter_data_indices[iter_ID_i])
|
1129
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1130
|
+
# object array, so set one-by-one:
|
1131
|
+
arr[iter_ID_i] = new_iter_i.encode(attrs)
|
1132
|
+
|
1133
|
+
def _update_run_data_indices(self, run_data_indices: dict[int, DataIndex]):
|
1134
|
+
self._update_runs(
|
1135
|
+
updates={k: {"data_idx": v} for k, v in run_data_indices.items()}
|
1136
|
+
)
|
1137
|
+
|
720
1138
|
def _append_EARs(self, EARs: Sequence[ZarrStoreEAR]):
|
721
1139
|
arr = self._get_EARs_arr(mode="r+")
|
722
1140
|
with self.__mutate_attrs(arr) as attrs:
|
723
|
-
|
724
|
-
|
725
|
-
|
1141
|
+
num_existing = attrs["num_runs"]
|
1142
|
+
num_add = len(EARs)
|
1143
|
+
num_tot = num_existing + num_add
|
1144
|
+
arr_add = np.empty(num_add, dtype=object)
|
1145
|
+
arr_add[:] = [i.encode(self.ts_fmt, attrs) for i in EARs]
|
1146
|
+
|
1147
|
+
# get new 1D indices:
|
1148
|
+
new_idx: NDArray = np.arange(num_existing, num_tot)
|
1149
|
+
|
1150
|
+
# transform to 2D indices:
|
1151
|
+
r_idx, c_idx = get_2D_idx(new_idx, num_cols=arr.shape[1])
|
1152
|
+
|
1153
|
+
# add rows to accomodate new runs:
|
1154
|
+
max_r_idx = np.max(r_idx)
|
1155
|
+
if max_r_idx + 1 > arr.shape[0]:
|
1156
|
+
arr.resize(max_r_idx + 1, arr.shape[1])
|
1157
|
+
|
1158
|
+
# fill in new data:
|
1159
|
+
for arr_add_idx_i, (r_idx_i, c_idx_i) in enumerate(zip(r_idx, c_idx)):
|
1160
|
+
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
1161
|
+
# object array, so set one-by-one:
|
1162
|
+
arr[r_idx_i, c_idx_i] = arr_add[arr_add_idx_i]
|
1163
|
+
|
1164
|
+
attrs["num_runs"] = num_tot
|
1165
|
+
|
1166
|
+
# add more rows to run dirs array:
|
1167
|
+
dirs_arr = self._get_dirs_arr(mode="r+")
|
1168
|
+
dirs_arr.resize(num_tot)
|
1169
|
+
|
1170
|
+
def _set_run_dirs(self, run_dir_arr: np.ndarray, run_idx: np.ndarray):
|
1171
|
+
dirs_arr = self._get_dirs_arr(mode="r+")
|
1172
|
+
dirs_arr[run_idx] = run_dir_arr
|
726
1173
|
|
727
1174
|
@TimeIt.decorator
|
728
|
-
def
|
729
|
-
|
730
|
-
|
1175
|
+
def _update_runs(self, updates: dict[int, dict[str, Any]]):
|
1176
|
+
"""Update the provided EAR attribute values in the specified existing runs."""
|
1177
|
+
run_IDs = list(updates.keys())
|
1178
|
+
runs = self._get_persistent_EARs(run_IDs)
|
731
1179
|
|
732
1180
|
arr = self._get_EARs_arr(mode="r+")
|
733
1181
|
with self.__mutate_attrs(arr) as attrs:
|
734
|
-
|
735
|
-
|
1182
|
+
# convert to 2D array indices:
|
1183
|
+
r_idx, c_idx = get_2D_idx(
|
1184
|
+
np.array(list(updates.keys())), num_cols=arr.shape[1]
|
1185
|
+
)
|
1186
|
+
for ri, ci, rID_i, upd_i in zip(
|
1187
|
+
r_idx, c_idx, updates.keys(), updates.values()
|
1188
|
+
):
|
1189
|
+
new_run_i = runs[rID_i].update(**upd_i)
|
736
1190
|
# seems to be a Zarr bug that prevents `set_coordinate_selection` with an
|
737
1191
|
# object array, so set one-by-one:
|
738
|
-
arr[
|
1192
|
+
arr[ri, ci] = new_run_i.encode(self.ts_fmt, attrs)
|
1193
|
+
|
1194
|
+
@TimeIt.decorator
|
1195
|
+
def _update_EAR_submission_data(self, sub_data: Mapping[int, tuple[int, int | None]]):
|
1196
|
+
self._update_runs(
|
1197
|
+
updates={
|
1198
|
+
k: {"submission_idx": v[0], "commands_file_ID": v[1]}
|
1199
|
+
for k, v in sub_data.items()
|
1200
|
+
}
|
1201
|
+
)
|
739
1202
|
|
740
1203
|
def _update_EAR_start(
|
741
|
-
self,
|
1204
|
+
self,
|
1205
|
+
run_starts: dict[int, tuple[datetime, dict[str, Any] | None, str, int | None]],
|
742
1206
|
):
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
1207
|
+
self._update_runs(
|
1208
|
+
updates={
|
1209
|
+
k: {
|
1210
|
+
"start_time": v[0],
|
1211
|
+
"snapshot_start": v[1],
|
1212
|
+
"run_hostname": v[2],
|
1213
|
+
"port_number": v[3],
|
1214
|
+
}
|
1215
|
+
for k, v in run_starts.items()
|
1216
|
+
}
|
1217
|
+
)
|
752
1218
|
|
753
1219
|
def _update_EAR_end(
|
754
|
-
self,
|
755
|
-
EAR_id: int,
|
756
|
-
e_time: datetime,
|
757
|
-
e_snap: dict[str, Any],
|
758
|
-
ext_code: int,
|
759
|
-
success: bool,
|
1220
|
+
self, run_ends: dict[int, tuple[datetime, dict[str, Any] | None, int, bool]]
|
760
1221
|
):
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
1222
|
+
self._update_runs(
|
1223
|
+
updates={
|
1224
|
+
k: {
|
1225
|
+
"end_time": v[0],
|
1226
|
+
"snapshot_end": v[1],
|
1227
|
+
"exit_code": v[2],
|
1228
|
+
"success": v[3],
|
1229
|
+
}
|
1230
|
+
for k, v in run_ends.items()
|
1231
|
+
}
|
1232
|
+
)
|
771
1233
|
|
772
|
-
def _update_EAR_skip(self,
|
773
|
-
|
774
|
-
with self.__mutate_attrs(arr) as attrs:
|
775
|
-
EAR_i = self._get_persistent_EARs([EAR_id])[EAR_id]
|
776
|
-
EAR_i = EAR_i.update(skip=True)
|
777
|
-
arr[EAR_id] = EAR_i.encode(self.ts_fmt, attrs)
|
1234
|
+
def _update_EAR_skip(self, skips: dict[int, int]):
|
1235
|
+
self._update_runs(updates={k: {"skip": v} for k, v in skips.items()})
|
778
1236
|
|
779
1237
|
def _update_js_metadata(self, js_meta: dict[int, dict[int, dict[str, Any]]]):
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
1238
|
+
|
1239
|
+
arr_keys = JOBSCRIPT_SUBMIT_TIME_KEYS # these items go to the Zarr array
|
1240
|
+
|
1241
|
+
# split into attributes to save to the root group metadata, and those to save to
|
1242
|
+
# the submit-time jobscript metadata array
|
1243
|
+
|
1244
|
+
grp_dat = {} # keys are tuples of (sub_idx, js_idx), values are metadata dicts
|
1245
|
+
|
1246
|
+
for sub_idx, all_js_md in js_meta.items():
|
1247
|
+
js_arr = None
|
1248
|
+
for js_idx, js_meta_i in all_js_md.items():
|
1249
|
+
|
1250
|
+
grp_dat_i = {k: v for k, v in js_meta_i.items() if k not in arr_keys}
|
1251
|
+
if grp_dat_i:
|
1252
|
+
grp_dat[(sub_idx, js_idx)] = grp_dat_i
|
1253
|
+
arr_dat = [js_meta_i.get(k) for k in arr_keys]
|
1254
|
+
|
1255
|
+
if any(arr_dat):
|
1256
|
+
# we are updating the at-submit metadata, so clear the cache:
|
1257
|
+
self.clear_jobscript_at_submit_metadata_cache()
|
1258
|
+
|
1259
|
+
js_arr = js_arr or self._get_jobscripts_at_submit_metadata_arr(
|
1260
|
+
mode="r+", sub_idx=sub_idx
|
1261
|
+
)
|
1262
|
+
self.logger.info(
|
1263
|
+
f"updating submit-time jobscript metadata array: {arr_dat!r}."
|
1264
|
+
)
|
1265
|
+
js_arr[js_idx] = arr_dat
|
1266
|
+
|
1267
|
+
if grp_dat:
|
1268
|
+
with self.using_resource("attrs", action="update") as attrs:
|
1269
|
+
for (sub_idx, js_idx), js_meta_i in grp_dat.items():
|
1270
|
+
self.logger.info(
|
1271
|
+
f"updating jobscript metadata in the root group for "
|
1272
|
+
f"(sub={sub_idx}, js={js_idx}): {js_meta_i!r}."
|
1273
|
+
)
|
1274
|
+
sub = cast(
|
1275
|
+
"dict[str, list[dict[str, Any]]]", attrs["submissions"][sub_idx]
|
1276
|
+
)
|
1277
|
+
sub["jobscripts"][js_idx].update(js_meta_i)
|
787
1278
|
|
788
1279
|
def _append_parameters(self, params: Sequence[StoreParameter]):
|
789
1280
|
"""Add new persistent parameters."""
|
@@ -887,7 +1378,7 @@ class ZarrPersistentStore(
|
|
887
1378
|
if self.use_cache and self.num_EARs_cache is not None:
|
888
1379
|
num = self.num_EARs_cache
|
889
1380
|
else:
|
890
|
-
num =
|
1381
|
+
num = self._get_EARs_arr().attrs["num_runs"]
|
891
1382
|
if self.use_cache and self.num_EARs_cache is None:
|
892
1383
|
self.num_EARs_cache = num
|
893
1384
|
return num
|
@@ -910,6 +1401,13 @@ class ZarrPersistentStore(
|
|
910
1401
|
return self._zarr_store
|
911
1402
|
|
912
1403
|
def _get_root_group(self, mode: str = "r", **kwargs) -> Group:
|
1404
|
+
# TODO: investigate if there are inefficiencies in how we retrieve zarr groups
|
1405
|
+
# and arrays, e.g. opening sub groups sequentially would open the root group
|
1406
|
+
# multiple times, and so read the root group attrs file multiple times?
|
1407
|
+
# it might make sense to define a ZarrAttrsStoreResource for each zarr group and
|
1408
|
+
# array (or at least non-parameter groups/arrays?), there could be some built-in
|
1409
|
+
# understanding of the hierarchy (e.g. via a `path` attribute) which would then
|
1410
|
+
# avoid reading parent groups multiple times --- if that is happening currently.
|
913
1411
|
return zarr.open(self.zarr_store, mode=mode, **kwargs)
|
914
1412
|
|
915
1413
|
def _get_parameter_group(self, mode: str = "r", **kwargs) -> Group:
|
@@ -952,7 +1450,55 @@ class ZarrPersistentStore(
|
|
952
1450
|
return group, f"arr_{arr_idx}"
|
953
1451
|
|
954
1452
|
def _get_metadata_group(self, mode: str = "r") -> Group:
|
955
|
-
|
1453
|
+
try:
|
1454
|
+
path = Path(self.workflow.url).joinpath("metadata")
|
1455
|
+
md_store = zarr.NestedDirectoryStore(path)
|
1456
|
+
return zarr.open_group(store=md_store, mode=mode)
|
1457
|
+
except (FileNotFoundError, zarr.errors.GroupNotFoundError):
|
1458
|
+
# zip store?
|
1459
|
+
return zarr.open_group(self.zarr_store, path="metadata", mode=mode)
|
1460
|
+
|
1461
|
+
def _get_all_submissions_metadata_group(self, mode: str = "r") -> Group:
|
1462
|
+
return self._get_metadata_group(mode=mode).get(self._subs_md_group_name)
|
1463
|
+
|
1464
|
+
def _get_submission_metadata_group(self, sub_idx: int, mode: str = "r") -> Group:
|
1465
|
+
return self._get_all_submissions_metadata_group(mode=mode).get(sub_idx)
|
1466
|
+
|
1467
|
+
def _get_submission_metadata_group_path(self, sub_idx: int) -> Path:
|
1468
|
+
grp = self._get_submission_metadata_group(sub_idx)
|
1469
|
+
return Path(grp.store.path).joinpath(grp.path)
|
1470
|
+
|
1471
|
+
def _get_jobscripts_at_submit_metadata_arr(
|
1472
|
+
self, sub_idx: int, mode: str = "r"
|
1473
|
+
) -> Array:
|
1474
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1475
|
+
self._js_at_submit_md_arr_name
|
1476
|
+
)
|
1477
|
+
|
1478
|
+
def _get_jobscripts_at_submit_metadata_arr_path(self, sub_idx: int) -> Path:
|
1479
|
+
arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
|
1480
|
+
return Path(arr.store.path).joinpath(arr.path)
|
1481
|
+
|
1482
|
+
@TimeIt.decorator
|
1483
|
+
def _get_jobscripts_run_ID_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1484
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1485
|
+
self._js_run_IDs_arr_name
|
1486
|
+
)
|
1487
|
+
|
1488
|
+
def _get_jobscripts_task_elements_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1489
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1490
|
+
self._js_task_elems_arr_name
|
1491
|
+
)
|
1492
|
+
|
1493
|
+
def _get_jobscripts_task_actions_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1494
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1495
|
+
self._js_task_acts_arr_name
|
1496
|
+
)
|
1497
|
+
|
1498
|
+
def _get_jobscripts_dependencies_arr(self, sub_idx: int, mode: str = "r") -> Array:
|
1499
|
+
return self._get_submission_metadata_group(sub_idx=sub_idx, mode=mode).get(
|
1500
|
+
self._js_deps_arr_name
|
1501
|
+
)
|
956
1502
|
|
957
1503
|
def _get_tasks_arr(self, mode: str = "r") -> Array:
|
958
1504
|
return self._get_metadata_group(mode=mode).get(self._task_arr_name)
|
@@ -966,6 +1512,9 @@ class ZarrPersistentStore(
|
|
966
1512
|
def _get_EARs_arr(self, mode: str = "r") -> Array:
|
967
1513
|
return self._get_metadata_group(mode=mode).get(self._EAR_arr_name)
|
968
1514
|
|
1515
|
+
def _get_dirs_arr(self, mode: str = "r") -> zarr.Array:
|
1516
|
+
return self._get_metadata_group(mode=mode).get(self._run_dir_arr_name)
|
1517
|
+
|
969
1518
|
@classmethod
|
970
1519
|
def make_test_store_from_spec(
|
971
1520
|
cls,
|
@@ -1091,7 +1640,9 @@ class ZarrPersistentStore(
|
|
1091
1640
|
}
|
1092
1641
|
|
1093
1642
|
@TimeIt.decorator
|
1094
|
-
def _get_persistent_submissions(
|
1643
|
+
def _get_persistent_submissions(
|
1644
|
+
self, id_lst: Iterable[int] | None = None
|
1645
|
+
) -> dict[int, Mapping[str, JSONed]]:
|
1095
1646
|
self.logger.debug("loading persistent submissions from the zarr store")
|
1096
1647
|
ids = set(id_lst or ())
|
1097
1648
|
with self.using_resource("attrs", "read") as attrs:
|
@@ -1102,12 +1653,6 @@ class ZarrPersistentStore(
|
|
1102
1653
|
if id_lst is None or idx in ids
|
1103
1654
|
}
|
1104
1655
|
)
|
1105
|
-
# cast jobscript submit-times and jobscript `task_elements` keys:
|
1106
|
-
for sub in subs_dat.values():
|
1107
|
-
for js in cast("_JS", sub)["jobscripts"]:
|
1108
|
-
task_elements = js["task_elements"]
|
1109
|
-
for key in list(task_elements):
|
1110
|
-
task_elements[int(key)] = task_elements.pop(key)
|
1111
1656
|
|
1112
1657
|
return subs_dat
|
1113
1658
|
|
@@ -1117,6 +1662,10 @@ class ZarrPersistentStore(
|
|
1117
1662
|
) -> dict[int, ZarrStoreElement]:
|
1118
1663
|
elems, id_lst = self._get_cached_persistent_elements(id_lst)
|
1119
1664
|
if id_lst:
|
1665
|
+
self.logger.debug(
|
1666
|
+
f"loading {len(id_lst)} persistent element(s) from disk: "
|
1667
|
+
f"{shorten_list_str(id_lst)}."
|
1668
|
+
)
|
1120
1669
|
arr = self._get_elements_arr()
|
1121
1670
|
attrs = arr.attrs.asdict()
|
1122
1671
|
try:
|
@@ -1137,6 +1686,10 @@ class ZarrPersistentStore(
|
|
1137
1686
|
) -> dict[int, ZarrStoreElementIter]:
|
1138
1687
|
iters, id_lst = self._get_cached_persistent_element_iters(id_lst)
|
1139
1688
|
if id_lst:
|
1689
|
+
self.logger.debug(
|
1690
|
+
f"loading {len(id_lst)} persistent element iteration(s) from disk: "
|
1691
|
+
f"{shorten_list_str(id_lst)}."
|
1692
|
+
)
|
1140
1693
|
arr = self._get_iters_arr()
|
1141
1694
|
attrs = arr.attrs.asdict()
|
1142
1695
|
try:
|
@@ -1155,11 +1708,21 @@ class ZarrPersistentStore(
|
|
1155
1708
|
def _get_persistent_EARs(self, id_lst: Iterable[int]) -> dict[int, ZarrStoreEAR]:
|
1156
1709
|
runs, id_lst = self._get_cached_persistent_EARs(id_lst)
|
1157
1710
|
if id_lst:
|
1711
|
+
self.logger.debug(
|
1712
|
+
f"loading {len(id_lst)} persistent EAR(s) from disk: "
|
1713
|
+
f"{shorten_list_str(id_lst)}."
|
1714
|
+
)
|
1158
1715
|
arr = self._get_EARs_arr()
|
1159
1716
|
attrs = arr.attrs.asdict()
|
1717
|
+
sel: tuple[NDArray, NDArray] | list[int]
|
1160
1718
|
try:
|
1161
|
-
|
1162
|
-
|
1719
|
+
# convert to 2D array indices:
|
1720
|
+
sel = get_2D_idx(np.array(id_lst), num_cols=arr.shape[1])
|
1721
|
+
except IndexError:
|
1722
|
+
# 1D runs array from before update to 2D in Feb 2025 refactor/jobscript:
|
1723
|
+
sel = id_lst
|
1724
|
+
try:
|
1725
|
+
EAR_arr_dat = _zarr_get_coord_selection(arr, sel, self.logger)
|
1163
1726
|
except BoundsCheckError:
|
1164
1727
|
raise MissingStoreEARError(id_lst) from None
|
1165
1728
|
EAR_dat = dict(zip(id_lst, EAR_arr_dat))
|
@@ -1178,6 +1741,14 @@ class ZarrPersistentStore(
|
|
1178
1741
|
) -> dict[int, ZarrStoreParameter]:
|
1179
1742
|
params, id_lst = self._get_cached_persistent_parameters(id_lst)
|
1180
1743
|
if id_lst:
|
1744
|
+
|
1745
|
+
self.logger.debug(
|
1746
|
+
f"loading {len(id_lst)} persistent parameter(s) from disk: "
|
1747
|
+
f"{shorten_list_str(id_lst)}."
|
1748
|
+
)
|
1749
|
+
|
1750
|
+
# TODO: implement the "parameter_metadata_cache" for zarr stores, which would
|
1751
|
+
# keep the base_arr and src_arr open
|
1181
1752
|
base_arr = self._get_parameter_base_array(mode="r")
|
1182
1753
|
src_arr = self._get_parameter_sources_array(mode="r")
|
1183
1754
|
|
@@ -1237,6 +1808,253 @@ class ZarrPersistentStore(
|
|
1237
1808
|
base_arr = self._get_parameter_base_array(mode="r")
|
1238
1809
|
return list(range(len(base_arr)))
|
1239
1810
|
|
1811
|
+
def get_submission_at_submit_metadata(
|
1812
|
+
self, sub_idx: int, metadata_attr: dict | None
|
1813
|
+
) -> dict[str, Any]:
|
1814
|
+
"""Retrieve the values of submission attributes that are stored at submit-time."""
|
1815
|
+
grp = self._get_submission_metadata_group(sub_idx)
|
1816
|
+
attrs = grp.attrs.asdict()
|
1817
|
+
return {k: attrs[k] for k in SUBMISSION_SUBMIT_TIME_KEYS}
|
1818
|
+
|
1819
|
+
def clear_jobscript_at_submit_metadata_cache(self):
|
1820
|
+
"""Clear the cache of at-submit-time jobscript metadata."""
|
1821
|
+
self._jobscript_at_submit_metadata = {}
|
1822
|
+
|
1823
|
+
def get_jobscript_at_submit_metadata(
|
1824
|
+
self,
|
1825
|
+
sub_idx: int,
|
1826
|
+
js_idx: int,
|
1827
|
+
metadata_attr: dict | None,
|
1828
|
+
) -> dict[str, Any]:
|
1829
|
+
"""For the specified jobscript, retrieve the values of jobscript-submit-time
|
1830
|
+
attributes.
|
1831
|
+
|
1832
|
+
Notes
|
1833
|
+
-----
|
1834
|
+
If the cache does not exist, this method will retrieve and cache metadata for
|
1835
|
+
all jobscripts for which metadata has been set. If the cache does exist, but not
|
1836
|
+
for the requested jobscript, then this method will retrieve and cache metadata for
|
1837
|
+
all non-cached jobscripts for which metadata has been set. If metadata has not
|
1838
|
+
yet been set for the specified jobscript, a dict with all `None` values will be
|
1839
|
+
returned.
|
1840
|
+
|
1841
|
+
The cache can be cleared using the method
|
1842
|
+
`clear_jobscript_at_submit_metadata_cache`.
|
1843
|
+
|
1844
|
+
"""
|
1845
|
+
if self._jobscript_at_submit_metadata:
|
1846
|
+
# cache exists, but might not include data for the requested jobscript:
|
1847
|
+
if js_idx in self._jobscript_at_submit_metadata:
|
1848
|
+
return self._jobscript_at_submit_metadata[js_idx]
|
1849
|
+
|
1850
|
+
arr = self._get_jobscripts_at_submit_metadata_arr(sub_idx)
|
1851
|
+
non_cached = set(range(len(arr))) - set(self._jobscript_at_submit_metadata.keys())
|
1852
|
+
|
1853
|
+
# populate cache:
|
1854
|
+
arr_non_cached = arr.get_coordinate_selection((list(non_cached),))
|
1855
|
+
for js_idx_i, arr_item in zip(non_cached, arr_non_cached):
|
1856
|
+
try:
|
1857
|
+
self._jobscript_at_submit_metadata[js_idx_i] = {
|
1858
|
+
i: arr_item[i_idx]
|
1859
|
+
for i_idx, i in enumerate(JOBSCRIPT_SUBMIT_TIME_KEYS)
|
1860
|
+
}
|
1861
|
+
except TypeError:
|
1862
|
+
# data for this jobscript is not set
|
1863
|
+
pass
|
1864
|
+
|
1865
|
+
if js_idx not in self._jobscript_at_submit_metadata:
|
1866
|
+
return {i: None for i in JOBSCRIPT_SUBMIT_TIME_KEYS}
|
1867
|
+
|
1868
|
+
return self._jobscript_at_submit_metadata[js_idx]
|
1869
|
+
|
1870
|
+
@TimeIt.decorator
|
1871
|
+
def get_jobscript_block_run_ID_array(
|
1872
|
+
self,
|
1873
|
+
sub_idx: int,
|
1874
|
+
js_idx: int,
|
1875
|
+
blk_idx: int,
|
1876
|
+
run_ID_arr: NDArray | None,
|
1877
|
+
) -> NDArray:
|
1878
|
+
"""For the specified jobscript-block, retrieve the run ID array."""
|
1879
|
+
|
1880
|
+
if run_ID_arr is not None:
|
1881
|
+
self.logger.debug("jobscript-block run IDs are still in memory.")
|
1882
|
+
# in the special case when the Submission object has just been created, the
|
1883
|
+
# run ID arrays will not yet be persistent.
|
1884
|
+
return np.asarray(run_ID_arr)
|
1885
|
+
|
1886
|
+
# otherwise, `append_submissions` has been called, the run IDs have been
|
1887
|
+
# removed from the JSON-representation of the submission object, and have been
|
1888
|
+
# saved in separate zarr arrays:
|
1889
|
+
if sub_idx not in self._jobscript_run_ID_arrays:
|
1890
|
+
|
1891
|
+
self.logger.debug(
|
1892
|
+
f"retrieving jobscript-block run IDs for submission {sub_idx} from disk,"
|
1893
|
+
f" and caching."
|
1894
|
+
)
|
1895
|
+
|
1896
|
+
# for a given submission, run IDs are stored for all jobscript-blocks in the
|
1897
|
+
# same array (and chunk), so retrieve all of them and cache:
|
1898
|
+
|
1899
|
+
arr = self._get_jobscripts_run_ID_arr(sub_idx)
|
1900
|
+
arr_dat = arr[:]
|
1901
|
+
block_shapes = arr.attrs["block_shapes"]
|
1902
|
+
|
1903
|
+
self._jobscript_run_ID_arrays[sub_idx] = {} # keyed by (js_idx, blk_idx)
|
1904
|
+
arr_idx = 0
|
1905
|
+
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
|
1906
|
+
for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
|
1907
|
+
self._jobscript_run_ID_arrays[sub_idx][
|
1908
|
+
(js_idx_i, blk_idx_j)
|
1909
|
+
] = arr_dat[arr_idx, : blk_shape_j[0], : blk_shape_j[1]]
|
1910
|
+
arr_idx += 1
|
1911
|
+
|
1912
|
+
else:
|
1913
|
+
self.logger.debug(
|
1914
|
+
f"retrieving jobscript-block run IDs for submission {sub_idx} from cache."
|
1915
|
+
)
|
1916
|
+
|
1917
|
+
return self._jobscript_run_ID_arrays[sub_idx][(js_idx, blk_idx)]
|
1918
|
+
|
1919
|
+
def get_jobscript_block_task_elements_map(
|
1920
|
+
self,
|
1921
|
+
sub_idx: int,
|
1922
|
+
js_idx: int,
|
1923
|
+
blk_idx: int,
|
1924
|
+
task_elems_map: dict[int, list[int]] | None,
|
1925
|
+
) -> dict[int, list[int]]:
|
1926
|
+
"""For the specified jobscript-block, retrieve the task-elements mapping."""
|
1927
|
+
|
1928
|
+
if task_elems_map is not None:
|
1929
|
+
self.logger.debug("jobscript-block task elements are still in memory.")
|
1930
|
+
# in the special case when the Submission object has just been created, the
|
1931
|
+
# task elements arrays will not yet be persistent.
|
1932
|
+
return task_elems_map
|
1933
|
+
|
1934
|
+
# otherwise, `append_submissions` has been called, the task elements have been
|
1935
|
+
# removed from the JSON-representation of the submission object, and have been
|
1936
|
+
# saved in separate zarr arrays:
|
1937
|
+
if sub_idx not in self._jobscript_task_element_maps:
|
1938
|
+
|
1939
|
+
self.logger.debug(
|
1940
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
1941
|
+
f"disk, and caching."
|
1942
|
+
)
|
1943
|
+
|
1944
|
+
# for a given submission, task elements are stored for all jobscript-blocks in
|
1945
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
1946
|
+
|
1947
|
+
arr = self._get_jobscripts_task_elements_arr(sub_idx)
|
1948
|
+
arr_dat = arr[:]
|
1949
|
+
block_shapes = arr.attrs["block_shapes"]
|
1950
|
+
|
1951
|
+
self._jobscript_task_element_maps[sub_idx] = {} # keys: (js_idx, blk_idx)
|
1952
|
+
arr_idx = 0
|
1953
|
+
for js_idx_i, js_blk_shapes in enumerate(block_shapes):
|
1954
|
+
for blk_idx_j, blk_shape_j in enumerate(js_blk_shapes):
|
1955
|
+
arr_i = arr_dat[arr_idx, : blk_shape_j[1], : blk_shape_j[0] + 1]
|
1956
|
+
self._jobscript_task_element_maps[sub_idx][(js_idx_i, blk_idx_j)] = {
|
1957
|
+
k[0]: list(k[1:]) for k in arr_i
|
1958
|
+
}
|
1959
|
+
arr_idx += 1
|
1960
|
+
|
1961
|
+
else:
|
1962
|
+
self.logger.debug(
|
1963
|
+
f"retrieving jobscript-block task elements for submission {sub_idx} from "
|
1964
|
+
"cache."
|
1965
|
+
)
|
1966
|
+
|
1967
|
+
return self._jobscript_task_element_maps[sub_idx][(js_idx, blk_idx)]
|
1968
|
+
|
1969
|
+
@TimeIt.decorator
|
1970
|
+
def get_jobscript_block_task_actions_array(
|
1971
|
+
self,
|
1972
|
+
sub_idx: int,
|
1973
|
+
js_idx: int,
|
1974
|
+
blk_idx: int,
|
1975
|
+
task_actions_arr: NDArray | list[tuple[int, int, int]] | None,
|
1976
|
+
) -> NDArray:
|
1977
|
+
"""For the specified jobscript-block, retrieve the task-actions array."""
|
1978
|
+
|
1979
|
+
if task_actions_arr is not None:
|
1980
|
+
self.logger.debug("jobscript-block task actions are still in memory.")
|
1981
|
+
# in the special case when the Submission object has just been created, the
|
1982
|
+
# task actions arrays will not yet be persistent.
|
1983
|
+
return np.asarray(task_actions_arr)
|
1984
|
+
|
1985
|
+
# otherwise, `append_submissions` has been called, the task actions have been
|
1986
|
+
# removed from the JSON-representation of the submission object, and have been
|
1987
|
+
# saved in separate zarr arrays:
|
1988
|
+
if sub_idx not in self._jobscript_task_actions_arrays:
|
1989
|
+
|
1990
|
+
self.logger.debug(
|
1991
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
1992
|
+
f"disk, and caching."
|
1993
|
+
)
|
1994
|
+
|
1995
|
+
# for a given submission, task actions are stored for all jobscript-blocks in
|
1996
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
1997
|
+
|
1998
|
+
arr = self._get_jobscripts_task_actions_arr(sub_idx)
|
1999
|
+
arr_dat = arr[:]
|
2000
|
+
block_num_acts = arr.attrs["block_num_acts"]
|
2001
|
+
|
2002
|
+
num_acts_count = 0
|
2003
|
+
self._jobscript_task_actions_arrays[sub_idx] = {} # keys: (js_idx, blk_idx)
|
2004
|
+
for js_idx_i, js_blk_num_acts in enumerate(block_num_acts):
|
2005
|
+
for blk_idx_j, blk_num_acts_j in enumerate(js_blk_num_acts):
|
2006
|
+
arr_i = arr_dat[num_acts_count : num_acts_count + blk_num_acts_j]
|
2007
|
+
num_acts_count += blk_num_acts_j
|
2008
|
+
self._jobscript_task_actions_arrays[sub_idx][
|
2009
|
+
(js_idx_i, blk_idx_j)
|
2010
|
+
] = arr_i
|
2011
|
+
|
2012
|
+
else:
|
2013
|
+
self.logger.debug(
|
2014
|
+
f"retrieving jobscript-block task actions for submission {sub_idx} from "
|
2015
|
+
"cache."
|
2016
|
+
)
|
2017
|
+
|
2018
|
+
return self._jobscript_task_actions_arrays[sub_idx][(js_idx, blk_idx)]
|
2019
|
+
|
2020
|
+
@TimeIt.decorator
|
2021
|
+
def get_jobscript_block_dependencies(
|
2022
|
+
self,
|
2023
|
+
sub_idx: int,
|
2024
|
+
js_idx: int,
|
2025
|
+
blk_idx: int,
|
2026
|
+
js_dependencies: dict[tuple[int, int], ResolvedJobscriptBlockDependencies] | None,
|
2027
|
+
) -> dict[tuple[int, int], ResolvedJobscriptBlockDependencies]:
|
2028
|
+
"""For the specified jobscript-block, retrieve the dependencies."""
|
2029
|
+
|
2030
|
+
if js_dependencies is not None:
|
2031
|
+
self.logger.debug("jobscript-block dependencies are still in memory.")
|
2032
|
+
# in the special case when the Submission object has just been created, the
|
2033
|
+
# dependencies will not yet be persistent.
|
2034
|
+
return js_dependencies
|
2035
|
+
|
2036
|
+
# otherwise, `append_submissions` has been called, the dependencies have been
|
2037
|
+
# removed from the JSON-representation of the submission object, and have been
|
2038
|
+
# saved in separate zarr arrays:
|
2039
|
+
if sub_idx not in self._jobscript_dependencies:
|
2040
|
+
self.logger.debug(
|
2041
|
+
f"retrieving jobscript-block dependencies for submission {sub_idx} from "
|
2042
|
+
f"disk, and caching."
|
2043
|
+
)
|
2044
|
+
# for a given submission, dependencies are stored for all jobscript-blocks in
|
2045
|
+
# the same array (and chunk), so retrieve all of them and cache:
|
2046
|
+
arr = self._get_jobscripts_dependencies_arr(sub_idx)
|
2047
|
+
self._jobscript_dependencies[
|
2048
|
+
sub_idx
|
2049
|
+
] = self._decode_jobscript_block_dependencies(arr)
|
2050
|
+
else:
|
2051
|
+
self.logger.debug(
|
2052
|
+
f"retrieving jobscript-block dependencies for submission {sub_idx} from "
|
2053
|
+
"cache."
|
2054
|
+
)
|
2055
|
+
|
2056
|
+
return self._jobscript_dependencies[sub_idx][(js_idx, blk_idx)]
|
2057
|
+
|
1240
2058
|
def get_ts_fmt(self):
|
1241
2059
|
"""
|
1242
2060
|
Get the format for timestamps.
|
@@ -1332,7 +2150,7 @@ class ZarrPersistentStore(
|
|
1332
2150
|
backup: bool = True,
|
1333
2151
|
status: bool = True,
|
1334
2152
|
) -> Array:
|
1335
|
-
arr_path = Path(
|
2153
|
+
arr_path = Path(arr.store.path) / arr.path
|
1336
2154
|
arr_name = arr.path.split("/")[-1]
|
1337
2155
|
|
1338
2156
|
if status:
|
@@ -1354,16 +2172,24 @@ class ZarrPersistentStore(
|
|
1354
2172
|
|
1355
2173
|
tic = time.perf_counter()
|
1356
2174
|
arr_rc_path = arr_path.with_suffix(".rechunked")
|
1357
|
-
arr = zarr.open(arr_path)
|
1358
2175
|
if status:
|
1359
2176
|
s.update("Creating new array...")
|
2177
|
+
|
2178
|
+
# use the same store:
|
2179
|
+
try:
|
2180
|
+
arr_rc_store = arr.store.__class__(path=arr_rc_path)
|
2181
|
+
except TypeError:
|
2182
|
+
# FSStore
|
2183
|
+
arr_rc_store = arr.store.__class__(url=str(arr_rc_path))
|
2184
|
+
|
1360
2185
|
arr_rc = zarr.create(
|
1361
|
-
store=
|
2186
|
+
store=arr_rc_store,
|
1362
2187
|
shape=arr.shape,
|
1363
2188
|
chunks=arr.shape if chunk_size is None else chunk_size,
|
1364
2189
|
dtype=object,
|
1365
2190
|
object_codec=self._CODEC,
|
1366
2191
|
)
|
2192
|
+
|
1367
2193
|
if status:
|
1368
2194
|
s.update("Copying data...")
|
1369
2195
|
data = np.empty(shape=arr.shape, dtype=object)
|
@@ -1426,6 +2252,12 @@ class ZarrPersistentStore(
|
|
1426
2252
|
arr = self._get_EARs_arr()
|
1427
2253
|
return self._rechunk_arr(arr, chunk_size, backup, status)
|
1428
2254
|
|
2255
|
+
def get_dirs_array(self) -> NDArray:
|
2256
|
+
"""
|
2257
|
+
Retrieve the run directories array.
|
2258
|
+
"""
|
2259
|
+
return self._get_dirs_arr()[:]
|
2260
|
+
|
1429
2261
|
|
1430
2262
|
class ZarrZipPersistentStore(ZarrPersistentStore):
|
1431
2263
|
"""A store designed mainly as an archive format that can be uploaded to data
|
@@ -1503,3 +2335,18 @@ class ZarrZipPersistentStore(ZarrPersistentStore):
|
|
1503
2335
|
status: bool = True,
|
1504
2336
|
) -> Array:
|
1505
2337
|
raise NotImplementedError
|
2338
|
+
|
2339
|
+
def get_text_file(self, path: str | Path) -> str:
|
2340
|
+
"""Retrieve the contents of a text file stored within the workflow."""
|
2341
|
+
path = Path(path)
|
2342
|
+
if path.is_absolute():
|
2343
|
+
path = path.relative_to(self.workflow.url)
|
2344
|
+
path = str(path.as_posix())
|
2345
|
+
assert self.fs
|
2346
|
+
try:
|
2347
|
+
with self.fs.open(path, mode="rt") as fp:
|
2348
|
+
return fp.read()
|
2349
|
+
except KeyError:
|
2350
|
+
raise FileNotFoundError(
|
2351
|
+
f"File within zip at location {path!r} does not exist."
|
2352
|
+
) from None
|