anemoi-datasets 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (52)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/cleanup.py +44 -0
  3. anemoi/datasets/commands/create.py +50 -20
  4. anemoi/datasets/commands/finalise-additions.py +45 -0
  5. anemoi/datasets/commands/finalise.py +39 -0
  6. anemoi/datasets/commands/init-additions.py +45 -0
  7. anemoi/datasets/commands/init.py +67 -0
  8. anemoi/datasets/commands/inspect.py +1 -1
  9. anemoi/datasets/commands/load-additions.py +47 -0
  10. anemoi/datasets/commands/load.py +47 -0
  11. anemoi/datasets/commands/patch.py +39 -0
  12. anemoi/datasets/compute/recentre.py +1 -1
  13. anemoi/datasets/create/__init__.py +961 -146
  14. anemoi/datasets/create/check.py +5 -3
  15. anemoi/datasets/create/config.py +53 -2
  16. anemoi/datasets/create/functions/sources/accumulations.py +6 -22
  17. anemoi/datasets/create/functions/sources/hindcasts.py +27 -12
  18. anemoi/datasets/create/functions/sources/tendencies.py +1 -1
  19. anemoi/datasets/create/functions/sources/xarray/__init__.py +12 -2
  20. anemoi/datasets/create/functions/sources/xarray/coordinates.py +7 -0
  21. anemoi/datasets/create/functions/sources/xarray/field.py +1 -1
  22. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +0 -2
  23. anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -1
  24. anemoi/datasets/create/functions/sources/xarray/metadata.py +27 -29
  25. anemoi/datasets/create/functions/sources/xarray/time.py +63 -30
  26. anemoi/datasets/create/functions/sources/xarray/variable.py +15 -38
  27. anemoi/datasets/create/input.py +62 -25
  28. anemoi/datasets/create/statistics/__init__.py +39 -23
  29. anemoi/datasets/create/utils.py +3 -2
  30. anemoi/datasets/data/__init__.py +1 -0
  31. anemoi/datasets/data/concat.py +46 -2
  32. anemoi/datasets/data/dataset.py +109 -34
  33. anemoi/datasets/data/forwards.py +17 -8
  34. anemoi/datasets/data/grids.py +17 -3
  35. anemoi/datasets/data/interpolate.py +133 -0
  36. anemoi/datasets/data/misc.py +56 -66
  37. anemoi/datasets/data/missing.py +240 -0
  38. anemoi/datasets/data/select.py +7 -1
  39. anemoi/datasets/data/stores.py +3 -3
  40. anemoi/datasets/data/subset.py +47 -5
  41. anemoi/datasets/data/unchecked.py +20 -22
  42. anemoi/datasets/data/xy.py +125 -0
  43. anemoi/datasets/dates/__init__.py +33 -20
  44. anemoi/datasets/dates/groups.py +2 -2
  45. anemoi/datasets/grids.py +66 -48
  46. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/METADATA +5 -5
  47. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/RECORD +51 -41
  48. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/WHEEL +1 -1
  49. anemoi/datasets/create/loaders.py +0 -924
  50. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/LICENSE +0 -0
  51. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/entry_points.txt +0 -0
  52. {anemoi_datasets-0.4.3.dist-info → anemoi_datasets-0.4.5.dist-info}/top_level.txt +0 -0
anemoi/datasets/create/__init__.py
@@ -7,196 +7,1011 @@
  # nor does it submit to any jurisdiction.
  #

+ import datetime
+ import json
  import logging
  import os
+ import time
+ import uuid
+ import warnings
+ from functools import cached_property
+
+ import numpy as np
+ import tqdm
+ from anemoi.utils.config import DotDict as DotDict
+ from anemoi.utils.dates import as_datetime
+ from anemoi.utils.dates import frequency_to_string
+ from anemoi.utils.dates import frequency_to_timedelta
+ from anemoi.utils.humanize import compress_dates
+ from anemoi.utils.humanize import seconds_to_human
+
+ from anemoi.datasets import MissingDateError
+ from anemoi.datasets import open_dataset
+ from anemoi.datasets.create.persistent import build_storage
+ from anemoi.datasets.data.misc import as_first_date
+ from anemoi.datasets.data.misc import as_last_date
+ from anemoi.datasets.dates.groups import Groups
+
+ from .check import DatasetName
+ from .check import check_data_values
+ from .chunks import ChunkFilter
+ from .config import build_output
+ from .config import loader_config
+ from .input import build_input
+ from .statistics import Summary
+ from .statistics import TmpStatistics
+ from .statistics import check_variance
+ from .statistics import compute_statistics
+ from .statistics import default_statistics_dates
+ from .statistics import fix_variance
+ from .utils import normalize_and_check_dates
+ from .writer import ViewCacheArray

  LOG = logging.getLogger(__name__)

+ VERSION = "0.20"
+
+
+ def json_tidy(o):
+
+     if isinstance(o, datetime.datetime):
+         return o.isoformat()
+
+     if isinstance(o, datetime.datetime):
+         return o.isoformat()
+
+     if isinstance(o, datetime.timedelta):
+         return frequency_to_string(o)
+
+     raise TypeError(repr(o) + " is not JSON serializable")
+
+
+ def build_statistics_dates(dates, start, end):
+     """Compute the start and end dates for the statistics, based on :
+     - The start and end dates in the config
+     - The default statistics dates convention
+
+     Then adapt according to the actual dates in the dataset.
+     """
+     # if not specified, use the default statistics dates
+     default_start, default_end = default_statistics_dates(dates)
+     if start is None:
+         start = default_start
+     if end is None:
+         end = default_end
+
+     # in any case, adapt to the actual dates in the dataset
+     start = as_first_date(start, dates)
+     end = as_last_date(end, dates)
+
+     # and convert to datetime to isoformat
+     start = start.astype(datetime.datetime)
+     end = end.astype(datetime.datetime)
+     return (start.isoformat(), end.isoformat())
+

  def _ignore(*args, **kwargs):
      pass


- class Creator:
-     def __init__(
-         self,
-         path,
-         config=None,
-         cache=None,
-         use_threads=False,
-         statistics_tmp=None,
-         overwrite=False,
-         test=None,
-         progress=None,
-         **kwargs,
-     ):
-         self.path = path  # Output path
-         self.config = config
+ def _path_readable(path):
+     import zarr
+
+     try:
+         zarr.open(path, "r")
+         return True
+     except zarr.errors.PathNotFoundError:
+         return False
+
+
+ class Dataset:
+     def __init__(self, path):
+         self.path = path
+
+         _, ext = os.path.splitext(self.path)
+         if ext != ".zarr":
+             raise ValueError(f"Unsupported extension={ext} for path={self.path}")
+
+     def add_dataset(self, mode="r+", **kwargs):
+         import zarr
+
+         z = zarr.open(self.path, mode=mode)
+         from .zarr import add_zarr_dataset
+
+         return add_zarr_dataset(zarr_root=z, **kwargs)
+
+     def update_metadata(self, **kwargs):
+         import zarr
+
+         LOG.debug(f"Updating metadata {kwargs}")
+         z = zarr.open(self.path, mode="w+")
+         for k, v in kwargs.items():
+             if isinstance(v, np.datetime64):
+                 v = v.astype(datetime.datetime)
+             if isinstance(v, datetime.date):
+                 v = v.isoformat()
+             z.attrs[k] = json.loads(json.dumps(v, default=json_tidy))
+
+     @property
+     def anemoi_dataset(self):
+         return open_dataset(self.path)
+
+     @cached_property
+     def zarr_metadata(self):
+         import zarr
+
+         return dict(zarr.open(self.path, mode="r").attrs)
+
+     def print_info(self):
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         try:
+             LOG.info(z["data"].info)
+         except Exception as e:
+             LOG.info(e)
+
+     def get_zarr_chunks(self):
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         return z["data"].chunks
+
+     def check_name(self, resolution, dates, frequency, raise_exception=True, is_test=False):
+         basename, _ = os.path.splitext(os.path.basename(self.path))
+         try:
+             DatasetName(basename, resolution, dates[0], dates[-1], frequency).raise_if_not_valid()
+         except Exception as e:
+             if raise_exception and not is_test:
+                 raise e
+             else:
+                 LOG.warning(f"Dataset name error: {e}")
+
+     def get_main_config(self):
+         """Returns None if the config is not found."""
+         import zarr
+
+         z = zarr.open(self.path, mode="r")
+         return loader_config(z.attrs.get("_create_yaml_config"))
+
+
+ class WritableDataset(Dataset):
+     def __init__(self, path):
+         super().__init__(path)
+         self.path = path
+
+         import zarr
+
+         self.z = zarr.open(self.path, mode="r+")
+
+     @cached_property
+     def data_array(self):
+         import zarr
+
+         return zarr.open(self.path, mode="r+")["data"]
+
+
+ class NewDataset(Dataset):
+     def __init__(self, path, overwrite=False):
+         super().__init__(path)
+         self.path = path
+
+         import zarr
+
+         self.z = zarr.open(self.path, mode="w")
+         self.z.create_group("_build")
+
+
+ class Actor:  # TODO: rename to Creator
+     dataset_class = WritableDataset
+
+     def __init__(self, path, cache=None):
+         # Catch all floating point errors, including overflow, sqrt(<0), etc
+         np.seterr(all="raise", under="warn")
+
+         self.path = path
          self.cache = cache
+         self.dataset = self.dataset_class(self.path)
+
+     def run(self):
+         # to be implemented in the sub-classes
+         raise NotImplementedError()
+
+     def update_metadata(self, **kwargs):
+         self.dataset.update_metadata(**kwargs)
+
+     def _cache_context(self):
+         from .utils import cache_context
+
+         return cache_context(self.cache)
+
+     def check_unkown_kwargs(self, kwargs):
+         # remove this latter
+         LOG.warning(f"💬 Unknown kwargs for {self.__class__.__name__}: {kwargs}")
+
+     def read_dataset_metadata(self, path):
+         ds = open_dataset(path)
+         self.dataset_shape = ds.shape
+         self.variables_names = ds.variables
+         assert len(self.variables_names) == ds.shape[1], self.dataset_shape
+         self.dates = ds.dates
+
+         self.missing_dates = sorted(list([self.dates[i] for i in ds.missing]))
+
+         def check_missing_dates(expected):
+             import zarr
+
+             z = zarr.open(path, "r")
+             missing_dates = z.attrs.get("missing_dates", [])
+             missing_dates = sorted([np.datetime64(d) for d in missing_dates])
+             if missing_dates != expected:
+                 LOG.warn("Missing dates given in recipe do not match the actual missing dates in the dataset.")
+                 LOG.warn(f"Missing dates in recipe: {sorted(str(x) for x in missing_dates)}")
+                 LOG.warn(f"Missing dates in dataset: {sorted(str(x) for x in expected)}")
+                 raise ValueError("Missing dates given in recipe do not match the actual missing dates in the dataset.")
+
+         check_missing_dates(self.missing_dates)
+
+
+ class Patch(Actor):
+     def __init__(self, path, options=None, **kwargs):
+         self.path = path
+         self.options = options or {}
+
+     def run(self):
+         from .patch import apply_patch
+
+         apply_patch(self.path, **self.options)
+
+
+ class Size(Actor):
+     def __init__(self, path, **kwargs):
+         super().__init__(path)
+
+     def run(self):
+         from .size import compute_directory_sizes
+
+         metadata = compute_directory_sizes(self.path)
+         self.update_metadata(**metadata)
+
+
+ class HasRegistryMixin:
+     @cached_property
+     def registry(self):
+         from .zarr import ZarrBuiltRegistry
+
+         return ZarrBuiltRegistry(self.path, use_threads=self.use_threads)
+
+
+ class HasStatisticTempMixin:
+     @cached_property
+     def tmp_statistics(self):
+         directory = self.statistics_temp_dir or os.path.join(self.path + ".storage_for_statistics.tmp")
+         return TmpStatistics(directory)
+
+
+ class HasElementForDataMixin:
+     def create_elements(self, config):
+
+         assert self.registry
+         assert self.tmp_statistics
+
+         LOG.info(dict(config.dates))
+
+         self.groups = Groups(**config.dates)
+         LOG.info(self.groups)
+
+         self.output = build_output(config.output, parent=self)
+
+         self.input = build_input_(main_config=config, output_config=self.output)
+         LOG.info(self.input)
+
+
+ def build_input_(main_config, output_config):
+     from earthkit.data.core.order import build_remapping
+
+     builder = build_input(
+         main_config.input,
+         data_sources=main_config.get("data_sources", {}),
+         order_by=output_config.order_by,
+         flatten_grid=output_config.flatten_grid,
+         remapping=build_remapping(output_config.remapping),
+         use_grib_paramid=main_config.build.use_grib_paramid,
+     )
+     LOG.debug("✅ INPUT_BUILDER")
+     LOG.debug(builder)
+     return builder
+
+
+ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixin):
+     dataset_class = NewDataset
+     def __init__(self, path, config, check_name=False, overwrite=False, use_threads=False, statistics_temp_dir=None, progress=None, test=False, cache=None, **kwargs):  # fmt: skip
+         if _path_readable(path) and not overwrite:
+             raise Exception(f"{self.path} already exists. Use overwrite=True to overwrite.")
+
+         super().__init__(path, cache=cache)
+         self.config = config
+         self.check_name = check_name
          self.use_threads = use_threads
-         self.statistics_tmp = statistics_tmp
-         self.overwrite = overwrite
+         self.statistics_temp_dir = statistics_temp_dir
+         self.progress = progress
          self.test = test
-         self.progress = progress if progress is not None else _ignore

-     def init(self, check_name=False):
-         # check path
-         _, ext = os.path.splitext(self.path)
-         assert ext != "zarr", f"Unsupported extension={ext}"
-         from .loaders import InitialiserLoader
+         self.main_config = loader_config(config, is_test=test)

-         if self._path_readable() and not self.overwrite:
-             raise Exception(f"{self.path} already exists. Use overwrite=True to overwrite.")
+         # self.registry.delete() ??
+         self.tmp_statistics.delete()

+         assert isinstance(self.main_config.output.order_by, dict), self.main_config.output.order_by
+         self.create_elements(self.main_config)
+
+         first_date = self.groups.dates[0]
+         self.minimal_input = self.input.select([first_date])
+         LOG.info("Minimal input for 'init' step (using only the first date) :")
+         LOG.info(self.minimal_input)
+
+     def run(self):
          with self._cache_context():
-             obj = InitialiserLoader.from_config(
-                 path=self.path,
-                 config=self.config,
-                 statistics_tmp=self.statistics_tmp,
-                 use_threads=self.use_threads,
-                 progress=self.progress,
-                 test=self.test,
+             return self._run()
+
+     def _run(self):
+         """Create an empty dataset of the right final shape
+
+         Read a small part of the data to get the shape of the data and the resolution and more metadata.
+         """
+
+         LOG.info("Config loaded ok:")
+         # LOG.info(self.main_config)
+
+         dates = self.groups.dates
+         frequency = dates.frequency
+         assert isinstance(frequency, datetime.timedelta), frequency
+
+         LOG.info(f"Found {len(dates)} datetimes.")
+         LOG.info(f"Dates: Found {len(dates)} datetimes, in {len(self.groups)} groups: ")
+         LOG.info(f"Missing dates: {len(dates.missing)}")
+         lengths = tuple(len(g) for g in self.groups)
+
+         variables = self.minimal_input.variables
+         LOG.info(f"Found {len(variables)} variables : {','.join(variables)}.")
+
+         variables_with_nans = self.main_config.statistics.get("allow_nans", [])
+
+         ensembles = self.minimal_input.ensembles
+         LOG.info(f"Found {len(ensembles)} ensembles : {','.join([str(_) for _ in ensembles])}.")
+
+         grid_points = self.minimal_input.grid_points
+         LOG.info(f"gridpoints size: {[len(i) for i in grid_points]}")
+
+         resolution = self.minimal_input.resolution
+         LOG.info(f"{resolution=}")
+
+         coords = self.minimal_input.coords
+         coords["dates"] = dates
+         total_shape = self.minimal_input.shape
+         total_shape[0] = len(dates)
+         LOG.info(f"total_shape = {total_shape}")
+
+         chunks = self.output.get_chunking(coords)
+         LOG.info(f"{chunks=}")
+         dtype = self.output.dtype
+
+         LOG.info(f"Creating Dataset '{self.path}', with {total_shape=}, {chunks=} and {dtype=}")
+
+         metadata = {}
+         metadata["uuid"] = str(uuid.uuid4())
+
+         metadata.update(self.main_config.get("add_metadata", {}))
+
+         metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict()
+
+         metadata["description"] = self.main_config.description
+         metadata["licence"] = self.main_config["licence"]
+         metadata["attribution"] = self.main_config["attribution"]
+
+         metadata["remapping"] = self.output.remapping
+         metadata["order_by"] = self.output.order_by_as_list
+         metadata["flatten_grid"] = self.output.flatten_grid
+
+         metadata["ensemble_dimension"] = len(ensembles)
+         metadata["variables"] = variables
+         metadata["variables_with_nans"] = variables_with_nans
+         metadata["allow_nans"] = self.main_config.build.get("allow_nans", False)
+         metadata["resolution"] = resolution
+
+         metadata["data_request"] = self.minimal_input.data_request
+         metadata["field_shape"] = self.minimal_input.field_shape
+         metadata["proj_string"] = self.minimal_input.proj_string
+
+         metadata["start_date"] = dates[0].isoformat()
+         metadata["end_date"] = dates[-1].isoformat()
+         metadata["frequency"] = frequency
+         metadata["missing_dates"] = [_.isoformat() for _ in dates.missing]
+
+         metadata["version"] = VERSION
+
+         self.dataset.check_name(
+             raise_exception=self.check_name,
+             is_test=self.test,
+             resolution=resolution,
+             dates=dates,
+             frequency=frequency,
+         )
+
+         if len(dates) != total_shape[0]:
+             raise ValueError(
+                 f"Final date size {len(dates)} (from {dates[0]} to {dates[-1]}, {frequency=}) "
+                 f"does not match data shape {total_shape[0]}. {total_shape=}"
              )
-             return obj.initialise(check_name=check_name)

-     def load(self, parts=None):
-         from .loaders import ContentLoader
+         dates = normalize_and_check_dates(dates, metadata["start_date"], metadata["end_date"], metadata["frequency"])
+
+         metadata.update(self.main_config.get("force_metadata", {}))
+
+         ###############################################################
+         # write metadata
+         ###############################################################
+
+         self.update_metadata(**metadata)
+
+         self.dataset.add_dataset(
+             name="data",
+             chunks=chunks,
+             dtype=dtype,
+             shape=total_shape,
+             dimensions=("time", "variable", "ensemble", "cell"),
+         )
+         self.dataset.add_dataset(name="dates", array=dates, dimensions=("time",))
+         self.dataset.add_dataset(name="latitudes", array=grid_points[0], dimensions=("cell",))
+         self.dataset.add_dataset(name="longitudes", array=grid_points[1], dimensions=("cell",))
+
+         self.registry.create(lengths=lengths)
+         self.tmp_statistics.create(exist_ok=False)
+         self.registry.add_to_history("tmp_statistics_initialised", version=self.tmp_statistics.version)
+
+         statistics_start, statistics_end = build_statistics_dates(
+             dates,
+             self.main_config.statistics.get("start"),
+             self.main_config.statistics.get("end"),
+         )
+         self.update_metadata(statistics_start_date=statistics_start, statistics_end_date=statistics_end)
+         LOG.info(f"Will compute statistics from {statistics_start} to {statistics_end}")
+
+         self.registry.add_to_history("init finished")
+
+         assert chunks == self.dataset.get_zarr_chunks(), (chunks, self.dataset.get_zarr_chunks())
+
+         def sanity_check_config(a, b):
+             a = json.dumps(a, sort_keys=True, default=str)
+             b = json.dumps(b, sort_keys=True, default=str)
+             b = b.replace("T", " ")  # dates are expected to be different because
+             if a != b:
+                 print("❌❌❌ FIXME: Config serialisation to be checked")
+                 print(a)
+                 print(b)
+
+         sanity_check_config(self.main_config, self.dataset.get_main_config())

+         # Return the number of groups to process, so we can show a nice progress bar
+         return len(lengths)
+
+
+ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixin):
+     def __init__(self, path, parts=None, use_threads=False, statistics_temp_dir=None, progress=None, cache=None, **kwargs):  # fmt: skip
+         super().__init__(path, cache=cache)
+         self.use_threads = use_threads
+         self.statistics_temp_dir = statistics_temp_dir
+         self.progress = progress
+         self.parts = parts
+         self.dataset = WritableDataset(self.path)
+
+         self.main_config = self.dataset.get_main_config()
+         self.create_elements(self.main_config)
+         self.read_dataset_metadata(self.dataset.path)
+
+         total = len(self.registry.get_flags())
+         self.chunk_filter = ChunkFilter(parts=self.parts, total=total)
+
+         self.data_array = self.dataset.data_array
+         self.n_groups = len(self.groups)
+
+     def run(self):
          with self._cache_context():
-             loader = ContentLoader.from_dataset_config(
-                 path=self.path,
-                 statistics_tmp=self.statistics_tmp,
-                 use_threads=self.use_threads,
-                 progress=self.progress,
-                 parts=parts,
+             self._run()
+
+     def _run(self):
+         for igroup, group in enumerate(self.groups):
+             if not self.chunk_filter(igroup):
+                 continue
+             if self.registry.get_flag(igroup):
+                 LOG.info(f" -> Skipping {igroup} total={len(self.groups)} (already done)")
+                 continue
+
+             assert isinstance(group[0], datetime.datetime), group
+             LOG.debug(f"Building data for group {igroup}/{self.n_groups}")
+
+             result = self.input.select(dates=group)
+             assert result.dates == group, (len(result.dates), len(group))
+
+             # There are several groups.
+             # There is one result to load for each group.
+             self.load_result(result)
+             self.registry.set_flag(igroup)
+
+         self.registry.add_provenance(name="provenance_load")
+         self.tmp_statistics.add_provenance(name="provenance_load", config=self.main_config)
+
+         self.dataset.print_info()
+
+     def load_result(self, result):
+         # There is one cube to load for each result.
+         dates = result.dates
+
+         cube = result.get_cube()
+         shape = cube.extended_user_shape
+         dates_in_data = cube.user_coords["valid_datetime"]
+
+         LOG.debug(f"Loading {shape=} in {self.data_array.shape=}")
+
+         def check_shape(cube, dates, dates_in_data):
+             if cube.extended_user_shape[0] != len(dates):
+                 print(f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}")
+                 print("Requested dates", compress_dates(dates))
+                 print("Cube dates", compress_dates(dates_in_data))
+
+                 a = set(as_datetime(_) for _ in dates)
+                 b = set(as_datetime(_) for _ in dates_in_data)
+
+                 print("Missing dates", compress_dates(a - b))
+                 print("Extra dates", compress_dates(b - a))
+
+                 raise ValueError(
+                     f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}"
+                 )
+
+         check_shape(cube, dates, dates_in_data)
+
+         def check_dates_in_data(lst, lst2):
+             lst2 = [np.datetime64(_) for _ in lst2]
+             lst = [np.datetime64(_) for _ in lst]
+             assert lst == lst2, ("Dates in data are not the requested ones:", lst, lst2)
+
+         check_dates_in_data(dates_in_data, dates)
+
+         def dates_to_indexes(dates, all_dates):
+             x = np.array(dates, dtype=np.datetime64)
+             y = np.array(all_dates, dtype=np.datetime64)
+             bitmap = np.isin(x, y)
+             return np.where(bitmap)[0]
+
+         indexes = dates_to_indexes(self.dates, dates_in_data)
+
+         array = ViewCacheArray(self.data_array, shape=shape, indexes=indexes)
+         self.load_cube(cube, array)
+
+         stats = compute_statistics(array.cache, self.variables_names, allow_nans=self._get_allow_nans())
+         self.tmp_statistics.write(indexes, stats, dates=dates_in_data)
+
+         array.flush()
+
+     def _get_allow_nans(self):
+         config = self.main_config
+         if "allow_nans" in config.build:
+             return config.build.allow_nans
+
+         return config.statistics.get("allow_nans", [])
+
+     def load_cube(self, cube, array):
+         # There are several cubelets for each cube
+         start = time.time()
+         load = 0
+         save = 0
+
+         reading_chunks = None
+         total = cube.count(reading_chunks)
+         LOG.debug(f"Loading datacube: {cube}")
+
+         def position(x):
+             if isinstance(x, str) and "/" in x:
+                 x = x.split("/")
+                 return int(x[0])
+             return None
+
+         bar = tqdm.tqdm(
+             iterable=cube.iterate_cubelets(reading_chunks),
+             total=total,
+             desc=f"Loading datacube {cube}",
+             position=position(self.parts),
+         )
+         for i, cubelet in enumerate(bar):
+             bar.set_description(f"Loading {i}/{total}")
+
+             now = time.time()
+             data = cubelet.to_numpy()
+             local_indexes = cubelet.coords
+             load += time.time() - now
+
+             name = self.variables_names[local_indexes[1]]
+             check_data_values(
+                 data[:],
+                 name=name,
+                 log=[i, data.shape, local_indexes],
+                 allow_nans=self._get_allow_nans(),
              )
-         loader.load()
-
-     def statistics(self, force=False, output=None, start=None, end=None):
-         from .loaders import StatisticsAdder
-
-         loader = StatisticsAdder.from_dataset(
-             path=self.path,
-             use_threads=self.use_threads,
-             progress=self.progress,
-             statistics_tmp=self.statistics_tmp,
-             statistics_output=output,
-             recompute=False,
-             statistics_start=start,
-             statistics_end=end,
+
+             now = time.time()
+             array[local_indexes] = data
+             save += time.time() - now
+
+         now = time.time()
+         save += time.time() - now
+         LOG.debug(
+             f"Elapsed: {seconds_to_human(time.time() - start)}, "
+             f"load time: {seconds_to_human(load)}, "
+             f"write time: {seconds_to_human(save)}."
          )
-         loader.run()
-         assert loader.ready()

-     def size(self):
-         from .loaders import DatasetHandler
-         from .size import compute_directory_sizes

-         metadata = compute_directory_sizes(self.path)
-         handle = DatasetHandler.from_dataset(path=self.path, use_threads=self.use_threads)
-         handle.update_metadata(**metadata)
+ class Cleanup(Actor, HasRegistryMixin, HasStatisticTempMixin):
+     def __init__(self, path, statistics_temp_dir=None, delta=[], use_threads=False, **kwargs):
+         super().__init__(path)
+         self.use_threads = use_threads
+         self.statistics_temp_dir = statistics_temp_dir
+         self.additinon_temp_dir = statistics_temp_dir
+         self.actors = [
+             _InitAdditions(path, delta=d, use_threads=use_threads, statistics_temp_dir=statistics_temp_dir)
+             for d in delta
+         ]
+
+     def run(self):
+         self.tmp_statistics.delete()
+         self.registry.clean()
+         for actor in self.actors:
+             actor.cleanup()
+
+
+ class Verify(Actor):
+     def __init__(self, path, **kwargs):
+         super().__init__(path)
+
+     def run(self):
+         LOG.info(f"Verifying dataset at {self.path}")
+         LOG.info(str(self.dataset.anemoi_dataset))
+
+
+ class AdditionsMixin:
+     def skip(self):
+         frequency = frequency_to_timedelta(self.dataset.anemoi_dataset.frequency)
+         if not self.delta.total_seconds() % frequency.total_seconds() == 0:
+             LOG.debug(f"Delta {self.delta} is not a multiple of frequency {frequency}. Skipping.")
+             return True
+         return False
+
+     @cached_property
+     def tmp_storage_path(self):
+         name = "storage_for_additions"
+         if self.delta:
+             name += frequency_to_string(self.delta)
+         return os.path.join(f"{self.path}.{name}.tmp")
+
+     def read_from_dataset(self):
+         self.variables = self.dataset.anemoi_dataset.variables
+         self.frequency = frequency_to_timedelta(self.dataset.anemoi_dataset.frequency)
+         start = self.dataset.zarr_metadata["statistics_start_date"]
+         end = self.dataset.zarr_metadata["statistics_end_date"]
+         self.start = datetime.datetime.fromisoformat(start)
+         self.end = datetime.datetime.fromisoformat(end)
+
+         ds = open_dataset(self.path, start=self.start, end=self.end)
+         self.dates = ds.dates
+         self.total = len(self.dates)
+
+         idelta = self.delta.total_seconds() // self.frequency.total_seconds()
+         assert int(idelta) == idelta, idelta
+         idelta = int(idelta)
+         self.ds = DeltaDataset(ds, idelta)
+
+
+ class DeltaDataset:
+     def __init__(self, ds, idelta):
+         self.ds = ds
+         self.idelta = idelta
+
+     def __getitem__(self, i):
+         j = i - self.idelta
+         if j < 0:
+             raise MissingDateError(f"Missing date {j}")
+         return self.ds[i : i + 1, ...] - self.ds[j : j + 1, ...]
+
+
+ class _InitAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}")
+             return
+
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=True)
+         self.tmp_storage.delete()
+         self.tmp_storage.create()
+         LOG.info(f"Dataset {self.tmp_storage_path} additions initialized.")

      def cleanup(self):
-         from .loaders import DatasetHandlerWithStatistics
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         self.tmp_storage.delete()
+         LOG.info(f"Cleaned temporary storage {self.tmp_storage_path}")

-         cleaner = DatasetHandlerWithStatistics.from_dataset(
-             path=self.path, use_threads=self.use_threads, progress=self.progress, statistics_tmp=self.statistics_tmp
-         )
-         cleaner.tmp_statistics.delete()
-         cleaner.registry.clean()

-     def patch(self, **kwargs):
-         from .patch import apply_patch
+ class _RunAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, parts=None, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+         self.parts = parts

-         apply_patch(self.path, **kwargs)
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         LOG.info(f"Writing in {self.tmp_storage_path}")

-     def init_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}")
+             return

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.initialise()
+         self.read_from_dataset()

-         for d in delta:
+         chunk_filter = ChunkFilter(parts=self.parts, total=self.total)
+         for i in range(0, self.total):
+             if not chunk_filter(i):
+                 continue
+             date = self.dates[i]
              try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.initialise()
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.info(f"Skipping delta={d} as it is not a multiple of the frequency.")
+                 arr = self.ds[i]
+                 stats = compute_statistics(arr, self.variables, allow_nans=self.allow_nans)
+                 self.tmp_storage.add([date, i, stats], key=date)
+             except MissingDateError:
+                 self.tmp_storage.add([date, i, "missing"], key=date)
+         self.tmp_storage.flush()
+         LOG.debug(f"Dataset {self.path} additions run.")
+
+     def allow_nans(self):
+         if self.dataset.anemoi_dataset.metadata.get("allow_nans", False):
+             return True

-     def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+         variables_with_nans = self.dataset.anemoi_dataset.metadata.get("variables_with_nans", None)
+         if variables_with_nans is not None:
+             return variables_with_nans
+         warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, assuming nans allowed.")
+         return True

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.run(parts)

-         for d in delta:
-             try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.run(parts)
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
+ class _FinaliseAdditions(Actor, HasRegistryMixin, AdditionsMixin):
+     def __init__(self, path, delta, use_threads=False, progress=None, **kwargs):
+         super().__init__(path)
+         self.delta = frequency_to_timedelta(delta)
+         self.use_threads = use_threads
+         self.progress = progress
+
+         self.tmp_storage = build_storage(directory=self.tmp_storage_path, create=False)
+         LOG.info(f"Reading from {self.tmp_storage_path}.")
+
+     def run(self):
+         if self.skip():
+             LOG.info(f"Skipping delta={self.delta}.")
+             return
+
+         self.read_from_dataset()
+
+         shape = (len(self.dates), len(self.variables))
+         agg = dict(
+             minimum=np.full(shape, np.nan, dtype=np.float64),
+             maximum=np.full(shape, np.nan, dtype=np.float64),
+             sums=np.full(shape, np.nan, dtype=np.float64),
+             squares=np.full(shape, np.nan, dtype=np.float64),
+             count=np.full(shape, -1, dtype=np.int64),
+             has_nans=np.full(shape, False, dtype=np.bool_),
+         )
+         LOG.debug(f"Aggregating {self.__class__.__name__} statistics on shape={shape}. Variables : {self.variables}")
+
+         found = set()
+         ifound = set()
+         missing = set()
+         for _date, (date, i, stats) in self.tmp_storage.items():
+             assert _date == date
+             if stats == "missing":
+                 missing.add(date)
+                 continue
+
+             assert date not in found, f"Duplicates found {date}"
+             found.add(date)
+             ifound.add(i)
+
+             for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+                 agg[k][i, ...] = stats[k]
+
+         assert len(found) + len(missing) == len(self.dates), (
+             len(found),
+             len(missing),
+             len(self.dates),
+         )
+         assert found.union(missing) == set(self.dates), (
+             found,
+             missing,
+             set(self.dates),
+         )

-     def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
-         from .loaders import StatisticsAddition
-         from .loaders import TendenciesStatisticsAddition
-         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
+         if len(ifound) < 2:
+             LOG.warn(f"Not enough data found in {self.path} to compute {self.__class__.__name__}. Skipped.")
+             self.tmp_storage.delete()
+             return

-         if statistics:
-             a = StatisticsAddition.from_dataset(path=self.path, use_threads=self.use_threads)
-             a.finalise()
+         mask = sorted(list(ifound))
+         for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             agg[k] = agg[k][mask, ...]

-         for d in delta:
-             try:
-                 a = TendenciesStatisticsAddition.from_dataset(
-                     path=self.path, use_threads=self.use_threads, progress=self.progress, delta=d
-                 )
-                 a.finalise()
-             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
-                 LOG.debug(f"Skipping delta={d} as it is not a multiple of the frequency.")
-
-     def finalise(self, **kwargs):
-         self.statistics(**kwargs)
-         self.size()
-
-     def create(self):
-         self.init()
-         self.load()
-         self.finalise()
-         self.additions()
-         self.cleanup()
-
-     def additions(self, delta=[1, 3, 6, 12, 24]):
-         self.init_additions(delta=delta)
-         self.run_additions(delta=delta)
-         self.finalise_additions(delta=delta)
+         for k in ["minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             assert agg[k].shape == agg["count"].shape, (
+                 agg[k].shape,
+                 agg["count"].shape,
+             )

-     def _cache_context(self):
-         from .utils import cache_context
+         minimum = np.nanmin(agg["minimum"], axis=0)
+         maximum = np.nanmax(agg["maximum"], axis=0)
+         sums = np.nansum(agg["sums"], axis=0)
+         squares = np.nansum(agg["squares"], axis=0)
+         count = np.nansum(agg["count"], axis=0)
+         has_nans = np.any(agg["has_nans"], axis=0)
+
+         assert sums.shape == count.shape
+         assert sums.shape == squares.shape
+         assert sums.shape == minimum.shape
+         assert sums.shape == maximum.shape
+         assert sums.shape == has_nans.shape
+
+         mean = sums / count
+         assert sums.shape == mean.shape
+
+         x = squares / count - mean * mean
+         # x[- 1e-15 < (x / (np.sqrt(squares / count) + np.abs(mean))) < 0] = 0
+         # remove negative variance due to numerical errors
+         for i, name in enumerate(self.variables):
+             x[i] = fix_variance(x[i], name, agg["count"][i : i + 1], agg["sums"][i : i + 1], agg["squares"][i : i + 1])
+         check_variance(x, self.variables, minimum, maximum, mean, count, sums, squares)
+
+         stdev = np.sqrt(x)
+         assert sums.shape == stdev.shape
+
+         self.summary = Summary(
+             minimum=minimum,
+             maximum=maximum,
+             mean=mean,
+             count=count,
+             sums=sums,
+             squares=squares,
+             stdev=stdev,
+             variables_names=self.variables,
+             has_nans=has_nans,
+         )
+         LOG.info(f"Dataset {self.path} additions finalised.")
+         # self.check_statistics()
+         self._write(self.summary)
+         self.tmp_storage.delete()

-         return cache_context(self.cache)
+     def _write(self, summary):
+         for k in ["mean", "stdev", "minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             name = f"statistics_tendencies_{frequency_to_string(self.delta)}_{k}"
+             self.dataset.add_dataset(name=name, array=summary[k], dimensions=("variable",))
+         self.registry.add_to_history(f"compute_statistics_{self.__class__.__name__.lower()}_end")
+         LOG.debug(f"Wrote additions in {self.path}")

-     def _path_readable(self):
-         import zarr

-         try:
-             zarr.open(self.path, "r")
-             return True
-         except zarr.errors.PathNotFoundError:
-             return False
+ def multi_addition(cls):
+     class MultiAdditions:
+         def __init__(self, *args, **kwargs):
+             self.actors = []
+
+             for k in kwargs.pop("delta", []):
+                 self.actors.append(cls(*args, delta=k, **kwargs))
+
+             if not self.actors:
+                 LOG.warning("No delta found in kwargs, no addtions will be computed.")

-     def verify(self):
-         from .loaders import DatasetVerifier
+         def run(self):
+             for actor in self.actors:
+                 actor.run()

-         handle = DatasetVerifier.from_dataset(path=self.path, use_threads=self.use_threads)
+     return MultiAdditions
+
+
+ InitAdditions = multi_addition(_InitAdditions)
+ RunAdditions = multi_addition(_RunAdditions)
+ FinaliseAdditions = multi_addition(_FinaliseAdditions)
+
+
+ class Statistics(Actor, HasStatisticTempMixin, HasRegistryMixin):
+     def __init__(self, path, use_threads=False, statistics_temp_dir=None, progress=None, **kwargs):
+         super().__init__(path)
+         self.use_threads = use_threads
+         self.progress = progress
+         self.statistics_temp_dir = statistics_temp_dir
+
+     def run(self):
+         start, end = (
+             self.dataset.zarr_metadata["statistics_start_date"],
+             self.dataset.zarr_metadata["statistics_end_date"],
+         )
+         start, end = np.datetime64(start), np.datetime64(end)
+         dates = self.dataset.anemoi_dataset.dates
+         assert type(dates[0]) == type(start), (type(dates[0]), type(start))  # noqa
+         dates = [d for d in dates if d >= start and d <= end]
+         dates = [d for i, d in enumerate(dates) if i not in self.dataset.anemoi_dataset.missing]
+         variables = self.dataset.anemoi_dataset.variables
+         stats = self.tmp_statistics.get_aggregated(dates, variables, self.allow_nans)
+
+         LOG.info(stats)
+
+         if not all(self.registry.get_flags(sync=False)):
+             raise Exception(f"❗Zarr {self.path} is not fully built, not writting statistics into dataset.")
+
+         for k in ["mean", "stdev", "minimum", "maximum", "sums", "squares", "count", "has_nans"]:
+             self.dataset.add_dataset(name=k, array=stats[k], dimensions=("variable",))
+
+         self.registry.add_to_history("compute_statistics_end")
+         LOG.info(f"Wrote statistics in {self.path}")
+
+     @cached_property
+     def allow_nans(self):
+         import zarr

-         handle.verify()
+         z = zarr.open(self.path, mode="r")
+         if "allow_nans" in z.attrs:
+             return z.attrs["allow_nans"]
+
+         if "variables_with_nans" in z.attrs:
+             return z.attrs["variables_with_nans"]
+
+         warnings.warn(f"Cannot find 'variables_with_nans' of 'allow_nans' in {self.path}.")
+         return True
+
+
+ def chain(tasks):
+     class Chain(Actor):
+         def __init__(self, **kwargs):
+             self.kwargs = kwargs
+
+         def run(self):
+             for cls in tasks:
+                 t = cls(**self.kwargs)
+                 t.run()
+
+     return Chain
+
+
+ def creator_factory(name, trace=None, **kwargs):
+     if trace:
+         from anemoi.datasets.create.trace import enable_trace
+
+         enable_trace(trace)
+
+     cls = dict(
+         init=Init,
+         load=Load,
+         size=Size,
+         patch=Patch,
+         statistics=Statistics,
+         finalise=chain([Statistics, Size, Cleanup]),
+         cleanup=Cleanup,
+         verify=Verify,
+         init_additions=InitAdditions,
+         load_additions=RunAdditions,
+         run_additions=RunAdditions,
+         finalise_additions=chain([FinaliseAdditions, Size]),
+         additions=chain([InitAdditions, RunAdditions, FinaliseAdditions, Size, Cleanup]),
+     )[name]
+     LOG.debug(f"Creating {cls.__name__} with {kwargs}")
+     return cls(**kwargs)
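
Taken together, this hunk replaces the monolithic Creator class with one Actor subclass per task, dispatched through creator_factory; the new anemoi/datasets/commands/*.py modules in the file list above appear to be thin CLI wrappers around these tasks. The sketch below is a hypothetical driver, not taken from the package documentation: it only illustrates how the tasks might be chained directly from Python using the names and keyword arguments visible in this diff. The output path, recipe file and delta values are made up.

```python
# Hypothetical driver script (illustrative only, assuming anemoi-datasets 0.4.5).
from anemoi.datasets.create import creator_factory

path = "example-dataset.zarr"   # made-up output path; Dataset requires a ".zarr" extension
config = "recipe.yaml"          # made-up creation recipe, parsed by loader_config()

# "init" creates the empty zarr array and metadata; its run() returns the number of groups.
creator_factory("init", path=path, config=config, overwrite=True).run()

# "load" fills the data array; a parts string such as "1/2" would restrict it to a subset of groups.
creator_factory("load", path=path).run()

# "finalise" chains Statistics, Size and Cleanup (see the dict inside creator_factory).
creator_factory("finalise", path=path).run()

# "additions" chains InitAdditions, RunAdditions, FinaliseAdditions, Size and Cleanup
# to compute tendency statistics for each requested delta.
creator_factory("additions", path=path, delta=["6h", "12h"]).run()
```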