anemoi-datasets 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
Files changed (124)
  1. anemoi/datasets/__init__.py +11 -3
  2. anemoi/datasets/__main__.py +2 -3
  3. anemoi/datasets/_version.py +2 -2
  4. anemoi/datasets/commands/__init__.py +2 -3
  5. anemoi/datasets/commands/cleanup.py +9 -0
  6. anemoi/datasets/commands/compare.py +3 -3
  7. anemoi/datasets/commands/copy.py +38 -68
  8. anemoi/datasets/commands/create.py +20 -5
  9. anemoi/datasets/commands/finalise-additions.py +9 -0
  10. anemoi/datasets/commands/finalise.py +9 -0
  11. anemoi/datasets/commands/init-additions.py +9 -0
  12. anemoi/datasets/commands/init.py +9 -0
  13. anemoi/datasets/commands/inspect.py +7 -1
  14. anemoi/datasets/commands/load-additions.py +9 -0
  15. anemoi/datasets/commands/load.py +9 -0
  16. anemoi/datasets/commands/patch.py +9 -0
  17. anemoi/datasets/commands/publish.py +9 -0
  18. anemoi/datasets/commands/scan.py +9 -0
  19. anemoi/datasets/compute/__init__.py +8 -0
  20. anemoi/datasets/compute/recentre.py +3 -2
  21. anemoi/datasets/create/__init__.py +64 -48
  22. anemoi/datasets/create/check.py +4 -3
  23. anemoi/datasets/create/chunks.py +3 -2
  24. anemoi/datasets/create/config.py +5 -5
  25. anemoi/datasets/create/functions/__init__.py +22 -7
  26. anemoi/datasets/create/functions/filters/__init__.py +2 -1
  27. anemoi/datasets/create/functions/filters/empty.py +3 -2
  28. anemoi/datasets/create/functions/filters/noop.py +2 -2
  29. anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py +3 -2
  30. anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py +3 -2
  31. anemoi/datasets/create/functions/filters/rename.py +16 -10
  32. anemoi/datasets/create/functions/filters/rotate_winds.py +3 -2
  33. anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py +3 -2
  34. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py +3 -2
  35. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py +2 -2
  36. anemoi/datasets/create/functions/filters/single_level_specific_humidity_to_relative_humidity.py +2 -2
  37. anemoi/datasets/create/functions/filters/speeddir_to_uv.py +3 -2
  38. anemoi/datasets/create/functions/filters/unrotate_winds.py +3 -2
  39. anemoi/datasets/create/functions/filters/uv_to_speeddir.py +3 -2
  40. anemoi/datasets/create/functions/sources/__init__.py +2 -2
  41. anemoi/datasets/create/functions/sources/accumulations.py +10 -4
  42. anemoi/datasets/create/functions/sources/constants.py +3 -2
  43. anemoi/datasets/create/functions/sources/empty.py +3 -2
  44. anemoi/datasets/create/functions/sources/forcings.py +3 -2
  45. anemoi/datasets/create/functions/sources/grib.py +2 -2
  46. anemoi/datasets/create/functions/sources/hindcasts.py +3 -2
  47. anemoi/datasets/create/functions/sources/mars.py +97 -17
  48. anemoi/datasets/create/functions/sources/netcdf.py +3 -2
  49. anemoi/datasets/create/functions/sources/opendap.py +2 -2
  50. anemoi/datasets/create/functions/sources/recentre.py +3 -2
  51. anemoi/datasets/create/functions/sources/source.py +3 -2
  52. anemoi/datasets/create/functions/sources/tendencies.py +3 -2
  53. anemoi/datasets/create/functions/sources/xarray/__init__.py +8 -2
  54. anemoi/datasets/create/functions/sources/xarray/coordinates.py +5 -2
  55. anemoi/datasets/create/functions/sources/xarray/field.py +3 -2
  56. anemoi/datasets/create/functions/sources/xarray/fieldlist.py +12 -2
  57. anemoi/datasets/create/functions/sources/xarray/flavour.py +21 -16
  58. anemoi/datasets/create/functions/sources/xarray/grid.py +3 -2
  59. anemoi/datasets/create/functions/sources/xarray/metadata.py +3 -2
  60. anemoi/datasets/create/functions/sources/xarray/time.py +39 -4
  61. anemoi/datasets/create/functions/sources/xarray/variable.py +6 -6
  62. anemoi/datasets/create/functions/sources/xarray_kerchunk.py +2 -2
  63. anemoi/datasets/create/functions/sources/xarray_zarr.py +2 -2
  64. anemoi/datasets/create/functions/sources/zenodo.py +2 -2
  65. anemoi/datasets/create/input/__init__.py +3 -17
  66. anemoi/datasets/create/input/action.py +3 -2
  67. anemoi/datasets/create/input/concat.py +3 -2
  68. anemoi/datasets/create/input/context.py +3 -2
  69. anemoi/datasets/create/input/data_sources.py +3 -2
  70. anemoi/datasets/create/input/empty.py +3 -2
  71. anemoi/datasets/create/input/filter.py +3 -2
  72. anemoi/datasets/create/input/function.py +3 -2
  73. anemoi/datasets/create/input/join.py +3 -2
  74. anemoi/datasets/create/input/misc.py +3 -2
  75. anemoi/datasets/create/input/pipe.py +3 -2
  76. anemoi/datasets/create/input/repeated_dates.py +3 -2
  77. anemoi/datasets/create/input/result.py +187 -3
  78. anemoi/datasets/create/input/step.py +4 -2
  79. anemoi/datasets/create/input/template.py +3 -2
  80. anemoi/datasets/create/input/trace.py +3 -2
  81. anemoi/datasets/create/patch.py +9 -1
  82. anemoi/datasets/create/persistent.py +7 -3
  83. anemoi/datasets/create/size.py +3 -2
  84. anemoi/datasets/create/statistics/__init__.py +7 -3
  85. anemoi/datasets/create/statistics/summary.py +3 -2
  86. anemoi/datasets/create/utils.py +15 -2
  87. anemoi/datasets/create/writer.py +3 -2
  88. anemoi/datasets/create/zarr.py +8 -3
  89. anemoi/datasets/data/__init__.py +27 -1
  90. anemoi/datasets/data/concat.py +5 -1
  91. anemoi/datasets/data/dataset.py +216 -37
  92. anemoi/datasets/data/debug.py +4 -1
  93. anemoi/datasets/data/ensemble.py +4 -1
  94. anemoi/datasets/data/fill_missing.py +165 -0
  95. anemoi/datasets/data/forwards.py +27 -2
  96. anemoi/datasets/data/grids.py +236 -58
  97. anemoi/datasets/data/indexing.py +4 -1
  98. anemoi/datasets/data/interpolate.py +4 -1
  99. anemoi/datasets/data/join.py +17 -1
  100. anemoi/datasets/data/masked.py +36 -10
  101. anemoi/datasets/data/merge.py +180 -0
  102. anemoi/datasets/data/misc.py +18 -3
  103. anemoi/datasets/data/missing.py +4 -1
  104. anemoi/datasets/data/rescale.py +4 -1
  105. anemoi/datasets/data/select.py +15 -1
  106. anemoi/datasets/data/statistics.py +4 -1
  107. anemoi/datasets/data/stores.py +70 -3
  108. anemoi/datasets/data/subset.py +6 -1
  109. anemoi/datasets/data/unchecked.py +9 -1
  110. anemoi/datasets/data/xy.py +20 -5
  111. anemoi/datasets/dates/__init__.py +9 -7
  112. anemoi/datasets/dates/groups.py +3 -1
  113. anemoi/datasets/fields.py +3 -1
  114. anemoi/datasets/grids.py +86 -2
  115. anemoi/datasets/testing.py +60 -0
  116. anemoi/datasets/utils/__init__.py +8 -0
  117. anemoi/datasets/utils/fields.py +2 -2
  118. {anemoi_datasets-0.5.6.dist-info → anemoi_datasets-0.5.10.dist-info}/METADATA +11 -29
  119. anemoi_datasets-0.5.10.dist-info/RECORD +124 -0
  120. {anemoi_datasets-0.5.6.dist-info → anemoi_datasets-0.5.10.dist-info}/WHEEL +1 -1
  121. anemoi_datasets-0.5.6.dist-info/RECORD +0 -121
  122. {anemoi_datasets-0.5.6.dist-info → anemoi_datasets-0.5.10.dist-info}/LICENSE +0 -0
  123. {anemoi_datasets-0.5.6.dist-info → anemoi_datasets-0.5.10.dist-info}/entry_points.txt +0 -0
  124. {anemoi_datasets-0.5.6.dist-info → anemoi_datasets-0.5.10.dist-info}/top_level.txt +0 -0
anemoi/datasets/data/grids.py
@@ -1,14 +1,18 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import logging
 from functools import cached_property
 
 import numpy as np
+from scipy.spatial import cKDTree
 
 from .debug import Node
 from .debug import debug_indexing
@@ -105,6 +109,17 @@ class GridsBase(GivenAxis):
         # We don't check the resolution, because we want to be able to combine
         pass
 
+    def metadata_specific(self):
+        return super().metadata_specific(
+            multi_grids=True,
+        )
+
+    def collect_input_sources(self, collected):
+        # We assume that, because they have different grids, they have different input sources
+        for d in self.datasets:
+            collected.append(d)
+            d.collect_input_sources(collected)
+
 
 class Grids(GridsBase):
     # TODO: select the statistics of the most global grid?
@@ -128,85 +143,248 @@ class Grids(GridsBase):
 
 
 class Cutout(GridsBase):
-    def __init__(self, datasets, axis, min_distance_km=None, cropping_distance=2.0, neighbours=5, plot=False):
-        from anemoi.datasets.grids import cutout_mask
-
+    def __init__(self, datasets, axis=3, cropping_distance=2.0, neighbours=5, min_distance_km=None, plot=None):
+        """Initializes a Cutout object for hierarchical management of Limited Area
+        Models (LAMs) and a global dataset, handling overlapping regions.
+
+        Args:
+            datasets (list): List of LAM and global datasets.
+            axis (int): Concatenation axis, must be set to 3.
+            cropping_distance (float): Distance threshold in degrees for
+                cropping cutouts.
+            neighbours (int): Number of neighboring points to consider when
+                constructing masks.
+            min_distance_km (float, optional): Minimum distance threshold in km
+                between grid points.
+            plot (bool, optional): Flag to enable or disable visualization
+                plots.
+        """
         super().__init__(datasets, axis)
-        assert len(datasets) == 2, "CutoutGrids requires two datasets"
+        assert len(datasets) >= 2, "CutoutGrids requires at least two datasets"
         assert axis == 3, "CutoutGrids requires axis=3"
+        assert cropping_distance >= 0, "cropping_distance must be a non-negative number"
+        if min_distance_km is not None:
+            assert min_distance_km >= 0, "min_distance_km must be a non-negative number"
+
+        self.lams = datasets[:-1]  # Assume the last dataset is the global one
+        self.globe = datasets[-1]
+        self.axis = axis
+        self.cropping_distance = cropping_distance
+        self.neighbours = neighbours
+        self.min_distance_km = min_distance_km
+        self.plot = plot
+        self.masks = []  # To store the masks for each LAM dataset
+        self.global_mask = np.ones(self.globe.shape[-1], dtype=bool)
+
+        # Initialize cumulative masks
+        self._initialize_masks()
+
+    def _initialize_masks(self):
+        """Generates hierarchical masks for each LAM dataset by excluding
+        overlapping regions with previous LAMs and creating a global mask for
+        the global dataset.
+
+        Raises:
+            ValueError: If the global mask dimension does not match the global
+                dataset grid points.
+        """
+        from anemoi.datasets.grids import cutout_mask
 
-        # We assume that the LAM is the first dataset, and the global is the second
-        # Note: the second fields does not really need to be global
-
-        self.lam, self.globe = datasets
-        self.mask = cutout_mask(
-            self.lam.latitudes,
-            self.lam.longitudes,
-            self.globe.latitudes,
-            self.globe.longitudes,
-            plot=plot,
-            min_distance_km=min_distance_km,
-            cropping_distance=cropping_distance,
-            neighbours=neighbours,
-        )
-        assert len(self.mask) == self.globe.shape[3], (
-            len(self.mask),
-            self.globe.shape[3],
-        )
+        for i, lam in enumerate(self.lams):
+            assert len(lam.shape) == len(
+                self.globe.shape
+            ), "LAMs and global dataset must have the same number of dimensions"
+            lam_lats = lam.latitudes
+            lam_lons = lam.longitudes
+            # Create a mask for the global dataset excluding all LAM points
+            global_overlap_mask = cutout_mask(
+                lam.latitudes,
+                lam.longitudes,
+                self.globe.latitudes,
+                self.globe.longitudes,
+                plot=False,
+                min_distance_km=self.min_distance_km,
+                cropping_distance=self.cropping_distance,
+                neighbours=self.neighbours,
+            )
+
+            # Ensure the mask dimensions match the global grid points
+            if global_overlap_mask.shape[0] != self.globe.shape[-1]:
+                raise ValueError("Global mask dimension does not match global dataset grid " "points.")
+            self.global_mask[~global_overlap_mask] = False
+
+            # Create a mask for the LAM datasets hierarchically, excluding
+            # points from previous LAMs
+            lam_current_mask = np.ones(lam.shape[-1], dtype=bool)
+            if i > 0:
+                for j in range(i):
+                    prev_lam = self.lams[j]
+                    prev_lam_lats = prev_lam.latitudes
+                    prev_lam_lons = prev_lam.longitudes
+                    # Check for overlap by computing distances
+                    if self.has_overlap(prev_lam_lats, prev_lam_lons, lam_lats, lam_lons):
+                        lam_overlap_mask = cutout_mask(
+                            prev_lam_lats,
+                            prev_lam_lons,
+                            lam_lats,
+                            lam_lons,
+                            plot=False,
+                            min_distance_km=self.min_distance_km,
+                            cropping_distance=self.cropping_distance,
+                            neighbours=self.neighbours,
+                        )
+                        lam_current_mask[~lam_overlap_mask] = False
+            self.masks.append(lam_current_mask)
+
+    def has_overlap(self, lats1, lons1, lats2, lons2, distance_threshold=1.0):
+        """Checks for overlapping points between two sets of latitudes and
+        longitudes within a specified distance threshold.
+
+        Args:
+            lats1, lons1 (np.ndarray): Latitude and longitude arrays for the
+                first dataset.
+            lats2, lons2 (np.ndarray): Latitude and longitude arrays for the
+                second dataset.
+            distance_threshold (float): Distance in degrees to consider as
+                overlapping.
+
+        Returns:
+            bool: True if any points overlap within the distance threshold,
+                otherwise False.
+        """
+        # Create KDTree for the first set of points
+        tree = cKDTree(np.vstack((lats1, lons1)).T)
+
+        # Query the second set of points against the first tree
+        distances, _ = tree.query(np.vstack((lats2, lons2)).T, k=1)
+
+        # Check if any distance is less than the specified threshold
+        return np.any(distances < distance_threshold)
+
+    def __getitem__(self, index):
+        """Retrieves data from the masked LAMs and global dataset based on the
+        given index.
+
+        Args:
+            index (int or slice or tuple): Index specifying the data to
+                retrieve.
+
+        Returns:
+            np.ndarray: Data array from the masked datasets based on the index.
+        """
+        if isinstance(index, (int, slice)):
+            index = (index, slice(None), slice(None), slice(None))
+        return self._get_tuple(index)
+
+    def _get_tuple(self, index):
+        """Helper method that applies masks and retrieves data from each dataset
+        according to the specified index.
+
+        Args:
+            index (tuple): Index specifying slices to retrieve data.
+
+        Returns:
+            np.ndarray: Concatenated data array from all datasets based on the
+                index.
+        """
+        index, changes = index_to_slices(index, self.shape)
+        # Select data from each LAM
+        lam_data = [lam[index] for lam in self.lams]
+
+        # First apply spatial indexing on `self.globe` and then apply the mask
+        globe_data_sliced = self.globe[index[:3]]
+        globe_data = globe_data_sliced[..., self.global_mask]
+
+        # Concatenate LAM data with global data
+        result = np.concatenate(lam_data + [globe_data], axis=self.axis)
+        return apply_index_to_slices_changes(result, changes)
+
+    def collect_supporting_arrays(self, collected, *path):
+        """Collects supporting arrays, including masks for each LAM and the global
+        dataset.
+
+        Args:
+            collected (list): List to which the supporting arrays are appended.
+            *path: Variable length argument list specifying the paths for the masks.
+        """
+        # Append masks for each LAM
+        for i, (lam, mask) in enumerate(zip(self.lams, self.masks)):
+            collected.append((path + (f"lam_{i}",), "cutout_mask", mask))
+
+        # Append the global mask
+        collected.append((path + ("global",), "cutout_mask", self.global_mask))
 
     @cached_property
     def shape(self):
-        shape = self.lam.shape
-        # Number of non-zero masked values in the globe dataset
-        nb_globe = np.count_nonzero(self.mask)
-        return shape[:-1] + (shape[-1] + nb_globe,)
+        """Returns the shape of the Cutout, accounting for retained grid points
+        across all LAMs and the global dataset.
+
+        Returns:
+            tuple: Shape of the concatenated masked datasets.
+        """
+        shapes = [np.sum(mask) for mask in self.masks]
+        global_shape = np.sum(self.global_mask)
+        return tuple(self.lams[0].shape[:-1] + (sum(shapes) + global_shape,))
 
     def check_same_resolution(self, d1, d2):
         # Turned off because we are combining different resolutions
         pass
 
     @property
-    def latitudes(self):
-        return np.concatenate([self.lam.latitudes, self.globe.latitudes[self.mask]])
+    def grids(self):
+        """Returns the number of grid points for each LAM and the global dataset
+        after applying masks.
 
-    @property
-    def longitudes(self):
-        return np.concatenate([self.lam.longitudes, self.globe.longitudes[self.mask]])
+        Returns:
+            tuple: Count of retained grid points for each dataset.
+        """
+        grids = [np.sum(mask) for mask in self.masks]
+        grids.append(np.sum(self.global_mask))
+        return tuple(grids)
 
-    def __getitem__(self, index):
-        if isinstance(index, (int, slice)):
-            index = (index, slice(None), slice(None), slice(None))
-        return self._get_tuple(index)
+    @property
+    def latitudes(self):
+        """Returns the concatenated latitudes of each LAM and the global dataset
+        after applying masks.
 
-    @debug_indexing
-    @expand_list_indexing
-    def _get_tuple(self, index):
-        assert self.axis >= len(index) or index[self.axis] == slice(
-            None
-        ), f"No support for selecting a subset of the 1D values {index} ({self.tree()})"
-        index, changes = index_to_slices(index, self.shape)
+        Returns:
+            np.ndarray: Concatenated latitude array for the masked datasets.
+        """
+        lam_latitudes = np.concatenate([lam.latitudes[mask] for lam, mask in zip(self.lams, self.masks)])
 
-        # In case index_to_slices has changed the last slice
-        index, _ = update_tuple(index, self.axis, slice(None))
+        assert (
+            len(lam_latitudes) + len(self.globe.latitudes[self.global_mask]) == self.shape[-1]
+        ), "Mismatch in number of latitudes"
 
-        lam_data = self.lam[index]
-        globe_data = self.globe[index]
+        latitudes = np.concatenate([lam_latitudes, self.globe.latitudes[self.global_mask]])
+        return latitudes
 
-        globe_data = globe_data[:, :, :, self.mask]
+    @property
+    def longitudes(self):
+        """Returns the concatenated longitudes of each LAM and the global dataset
+        after applying masks.
 
-        result = np.concatenate([lam_data, globe_data], axis=self.axis)
+        Returns:
+            np.ndarray: Concatenated longitude array for the masked datasets.
+        """
+        lam_longitudes = np.concatenate([lam.longitudes[mask] for lam, mask in zip(self.lams, self.masks)])
 
-        return apply_index_to_slices_changes(result, changes)
+        assert (
+            len(lam_longitudes) + len(self.globe.longitudes[self.global_mask]) == self.shape[-1]
+        ), "Mismatch in number of longitudes"
 
-    @property
-    def grids(self):
-        for d in self.datasets:
-            if len(d.grids) > 1:
-                raise NotImplementedError("CutoutGrids does not support multi-grids datasets as inputs")
-        shape = self.lam.shape
-        return (shape[-1], self.shape[-1] - shape[-1])
+        longitudes = np.concatenate([lam_longitudes, self.globe.longitudes[self.global_mask]])
+        return longitudes
 
     def tree(self):
+        """Generates a hierarchical tree structure for the `Cutout` instance and
+        its associated datasets.
+
+        Returns:
+            Node: A `Node` object representing the `Cutout` instance as the root
+                node, with each dataset in `self.datasets` represented as a child
+                node.
+        """
         return Node(self, [d.tree() for d in self.datasets])
 
 
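Note on the overlap test: `has_overlap` builds a KD-tree on raw (latitude, longitude) pairs, so the threshold is a Euclidean distance in degree space, not a great-circle distance. A minimal, self-contained sketch of the same technique, using made-up points:

    import numpy as np
    from scipy.spatial import cKDTree

    # Two synthetic point sets, in degrees
    lats1, lons1 = np.array([50.0, 51.0]), np.array([10.0, 11.0])
    lats2, lons2 = np.array([50.5, 70.0]), np.array([10.5, 40.0])

    # Index the first set, then find each second-set point's nearest neighbour
    tree = cKDTree(np.vstack((lats1, lons1)).T)
    distances, _ = tree.query(np.vstack((lats2, lons2)).T, k=1)

    # Overlap if any point lies within the 1-degree default threshold
    print(np.any(distances < 1.0))  # True: (50.5, 10.5) is ~0.7 degrees from (50.0, 10.0)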
@@ -238,7 +416,7 @@ def cutout_factory(args, kwargs):
     neighbours = kwargs.pop("neighbours", 5)
 
     assert len(args) == 0
-    assert isinstance(cutout, (list, tuple))
+    assert isinstance(cutout, (list, tuple)), "cutout must be a list or tuple"
 
     datasets = [_open(e) for e in cutout]
     datasets, kwargs = _auto_adjust(datasets, kwargs)
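For orientation, a usage sketch of the extended cutout (not part of the diff; the zarr paths are hypothetical, and the keyword set is assumed from `cutout_factory` above). The last dataset in the list is treated as the global one; earlier entries are LAMs whose masks are built hierarchically so that each grid point is owned by exactly one dataset:

    from anemoi.datasets import open_dataset

    ds = open_dataset(
        cutout=["lam-2km.zarr", "lam-10km.zarr", "global-n320.zarr"],
        neighbours=5,  # neighbours used when building the cutout masks
    )

    print(ds.grids)  # retained grid points per dataset, e.g. (n_lam0, n_lam1, n_globe)
    print(ds.shape)  # last axis equals sum(ds.grids)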
anemoi/datasets/data/indexing.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 from functools import wraps
 
 import numpy as np
anemoi/datasets/data/interpolate.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import logging
 from functools import cached_property
 
anemoi/datasets/data/join.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import logging
 from functools import cached_property
 
@@ -111,6 +114,19 @@ class Join(Combined):
 
         return result
 
+    @property
+    def variables_metadata(self):
+        result = {}
+        variables = [v for v in self.variables if not (v.startswith("(") and v.endswith(")"))]
+        for d in self.datasets:
+            md = d.variables_metadata
+            for v in variables:
+                if v in md:
+                    result[v] = md[v]
+
+        assert len(result) == len(variables), (result, variables)
+        return result
+
     @cached_property
     def name_to_index(self):
         return {k: i for i, k in enumerate(self.variables)}
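A usage sketch (hypothetical dataset names): when joined datasets share a variable, the shadowed copy is kept under a parenthesised name, which is why `variables_metadata` skips names of the form `(...)`:

    from anemoi.datasets import open_dataset

    ds = open_dataset(join=["dataset-a.zarr", "dataset-b.zarr"])

    # If both inputs provide "2t", the overridden copy appears as "(2t)"
    print(ds.variables)           # e.g. ["(2t)", "10u", "10v", "2t"]

    # One metadata entry per effective (non-shadowed) variable
    print(ds.variables_metadata)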
anemoi/datasets/data/masked.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import logging
 from functools import cached_property
 
@@ -30,6 +33,8 @@ class Masked(Forwards):
         self.mask = mask
         self.axis = 3
 
+        self.mask_name = f"{self.__class__.__name__.lower()}_mask"
+
     @cached_property
     def shape(self):
         return self.forward.shape[:-1] + (np.count_nonzero(self.mask),)
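The new `mask_name` is derived from the subclass name, so each masked view labels its supporting array distinctly. A one-line sketch of the naming rule:

    class Thinning:  # stand-in for the real subclass
        pass

    print(f"{Thinning.__name__.lower()}_mask")  # -> "thinning_mask"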
@@ -64,26 +69,46 @@ class Masked(Forwards):
         result = apply_index_to_slices_changes(result, changes)
         return result
 
+    def collect_supporting_arrays(self, collected, *path):
+        super().collect_supporting_arrays(collected, *path)
+        collected.append((path, self.mask_name, self.mask))
+
 
 class Thinning(Masked):
+
     def __init__(self, forward, thinning, method):
         self.thinning = thinning
         self.method = method
 
-        shape = forward.field_shape
-        if len(shape) != 2:
-            raise ValueError("Thinning only works latitude/longitude fields")
+        if thinning is not None:
+
+            shape = forward.field_shape
+            if len(shape) != 2:
+                raise ValueError("Thinning only works latitude/longitude fields")
 
-        latitudes = forward.latitudes.reshape(shape)
-        longitudes = forward.longitudes.reshape(shape)
-        latitudes = latitudes[::thinning, ::thinning].flatten()
-        longitudes = longitudes[::thinning, ::thinning].flatten()
+            # Make a copy, so we read the data only once from zarr
+            forward_latitudes = forward.latitudes.copy()
+            forward_longitudes = forward.longitudes.copy()
 
-        mask = [lat in latitudes and lon in longitudes for lat, lon in zip(forward.latitudes, forward.longitudes)]
-        mask = np.array(mask, dtype=bool)
+            latitudes = forward_latitudes.reshape(shape)
+            longitudes = forward_longitudes.reshape(shape)
+            latitudes = latitudes[::thinning, ::thinning].flatten()
+            longitudes = longitudes[::thinning, ::thinning].flatten()
+
+            # TODO: This is not very efficient
+
+            mask = [lat in latitudes and lon in longitudes for lat, lon in zip(forward_latitudes, forward_longitudes)]
+            mask = np.array(mask, dtype=bool)
+        else:
+            mask = None
 
         super().__init__(forward, mask)
 
+    def mutate(self) -> Dataset:
+        if self.thinning is None:
+            return self.forward.mutate()
+        return super().mutate()
+
     def tree(self):
         return Node(self, [self.forward.tree()], thinning=self.thinning, method=self.method)
 
@@ -92,6 +117,7 @@ class Thinning(Masked):
 
 
 class Cropping(Masked):
+
     def __init__(self, forward, area):
         from ..data import open_dataset
 
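A usage sketch for thinning (hypothetical path). With the change above, `thinning=None` skips mask construction entirely, and the wrapper removes itself via `mutate()`:

    from anemoi.datasets import open_dataset

    # Keep every 4th point along each direction of a lat/lon field
    thinned = open_dataset("dataset.zarr", thinning=4)

    # Now effectively a no-op: returns the underlying dataset unchanged
    unthinned = open_dataset("dataset.zarr", thinning=None)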
anemoi/datasets/data/merge.py (new file)
@@ -0,0 +1,180 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+
+import logging
+from functools import cached_property
+
+import numpy as np
+
+from . import MissingDateError
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Combined
+from .indexing import apply_index_to_slices_changes
+from .indexing import expand_list_indexing
+from .indexing import index_to_slices
+from .indexing import update_tuple
+from .misc import _auto_adjust
+from .misc import _open
+
+LOG = logging.getLogger(__name__)
+
+
+class Merge(Combined):
+
+    # d0 d2 d4 d6 ...
+    # d1 d3 d5 d7 ...
+
+    # gives
+    # d0 d1 d2 d3 ...
+
+    def __init__(self, datasets, allow_gaps_in_dates=False):
+        super().__init__(datasets)
+
+        self.allow_gaps_in_dates = allow_gaps_in_dates
+
+        dates = dict()  # date -> (dataset_index, date_index)
+
+        for i, d in enumerate(datasets):
+            for j, date in enumerate(d.dates):
+                date = date.astype(object)
+                if date in dates:
+
+                    d1 = datasets[dates[date][0]]  # Selected
+                    d2 = datasets[i]  # The new one
+
+                    if j in d2.missing:
+                        # LOG.warning(f"Duplicate date {date} found in datasets {d1} and {d2}, but {date} is missing in {d}, ignoring")
+                        continue
+
+                    k = dates[date][1]
+                    if k in d1.missing:
+                        # LOG.warning(f"Duplicate date {date} found in datasets {d1} and {d2}, but {date} is missing in {d}, ignoring")
+                        dates[date] = (i, j)  # Replace the missing date with the new one
+                        continue
+
+                    raise ValueError(f"Duplicate date {date} found in datasets {d1} and {d2}")
+                else:
+                    dates[date] = (i, j)
+
+        all_dates = sorted(dates)
+        start = all_dates[0]
+        end = all_dates[-1]
+
+        frequency = min(d2 - d1 for d1, d2 in zip(all_dates[:-1], all_dates[1:]))
+
+        date = start
+        indices = []
+        _dates = []
+
+        self._missing_index = len(datasets)
+
+        while date <= end:
+            if date not in dates:
+                if self.allow_gaps_in_dates:
+                    dates[date] = (self._missing_index, -1)
+                else:
+                    raise ValueError(
+                        f"merge: date {date} not covered by dataset. Start={start}, end={end}, frequency={frequency}"
+                    )
+
+            indices.append(dates[date])
+            _dates.append(date)
+            date += frequency
+
+        self._dates = np.array(_dates, dtype="datetime64[s]")
+        self._indices = np.array(indices)
+        self._frequency = frequency  # .astype(object)
+
+    def __len__(self):
+        return len(self._dates)
+
+    @property
+    def dates(self):
+        return self._dates
+
+    @property
+    def frequency(self):
+        return self._frequency
+
+    @cached_property
+    def missing(self):
+        # TODO: optimize
+        result = set()
+
+        for i, (dataset, row) in enumerate(self._indices):
+            if dataset == self._missing_index:
+                result.add(i)
+                continue
+
+            if row in self.datasets[dataset].missing:
+                result.add(i)
+
+        return result
+
+    def check_same_lengths(self, d1, d2):
+        # Turned off because we are concatenating along the first axis
+        pass
+
+    def check_same_dates(self, d1, d2):
+        # Turned off because we are concatenating along the dates axis
+        pass
+
+    def check_compatibility(self, d1, d2):
+        super().check_compatibility(d1, d2)
+        self.check_same_sub_shapes(d1, d2, drop_axis=0)
+
+    def tree(self):
+        return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates)
+
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+
+        if isinstance(n, slice):
+            return self._get_slice(n)
+
+        dataset, row = self._indices[n]
+
+        if dataset == self._missing_index:
+            raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})")
+
+        return self.datasets[dataset][int(row)]
+
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        index, changes = index_to_slices(index, self.shape)
+        index, previous = update_tuple(index, 0, slice(None))
+        result = self._get_slice(previous)
+        return apply_index_to_slices_changes(result[index], changes)
+
+    def _get_slice(self, s):
+        return np.stack([self[i] for i in range(*s.indices(self._len))])
+
+
+def merge_factory(args, kwargs):
+
+    datasets = kwargs.pop("merge")
+
+    assert isinstance(datasets, (list, tuple))
+    assert len(args) == 0
+
+    datasets = [_open(e) for e in datasets]
+
+    if len(datasets) == 1:
+        return datasets[0]._subset(**kwargs)
+
+    datasets, kwargs = _auto_adjust(datasets, kwargs)
+
+    allow_gaps_in_dates = kwargs.pop("allow_gaps_in_dates", False)
+
+    return Merge(datasets, allow_gaps_in_dates=allow_gaps_in_dates)._subset(**kwargs)
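A usage sketch for the new `merge` (hypothetical paths): interleave datasets that cover complementary dates, e.g. 00Z/12Z fields in one and 06Z/18Z fields in the other. The merged frequency is the smallest gap between consecutive dates across all inputs, and a date present in two inputs (and missing in neither) raises ValueError:

    from anemoi.datasets import open_dataset

    ds = open_dataset(merge=["dataset-00-12.zarr", "dataset-06-18.zarr"])

    # With gaps allowed, uncovered dates become missing entries that raise
    # MissingDateError only when accessed
    ds = open_dataset(
        merge=["dataset-00-12.zarr", "dataset-06-18.zarr"],
        allow_gaps_in_dates=True,
    )
    print(sorted(ds.missing))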