roms-tools 3.1.2__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. roms_tools/__init__.py +3 -0
  2. roms_tools/analysis/cdr_analysis.py +203 -0
  3. roms_tools/analysis/cdr_ensemble.py +198 -0
  4. roms_tools/analysis/roms_output.py +80 -46
  5. roms_tools/data/grids/GLORYS_global_grid.nc +0 -0
  6. roms_tools/download.py +4 -0
  7. roms_tools/plot.py +113 -51
  8. roms_tools/setup/boundary_forcing.py +45 -20
  9. roms_tools/setup/cdr_forcing.py +122 -8
  10. roms_tools/setup/cdr_release.py +161 -8
  11. roms_tools/setup/grid.py +150 -141
  12. roms_tools/setup/initial_conditions.py +113 -48
  13. roms_tools/setup/{datasets.py → lat_lon_datasets.py} +443 -938
  14. roms_tools/setup/mask.py +63 -7
  15. roms_tools/setup/nesting.py +314 -117
  16. roms_tools/setup/river_datasets.py +527 -0
  17. roms_tools/setup/river_forcing.py +46 -20
  18. roms_tools/setup/surface_forcing.py +7 -9
  19. roms_tools/setup/tides.py +2 -3
  20. roms_tools/setup/topography.py +8 -10
  21. roms_tools/setup/utils.py +396 -23
  22. roms_tools/tests/test_analysis/test_cdr_analysis.py +144 -0
  23. roms_tools/tests/test_analysis/test_cdr_ensemble.py +202 -0
  24. roms_tools/tests/test_analysis/test_roms_output.py +61 -3
  25. roms_tools/tests/test_setup/test_boundary_forcing.py +54 -52
  26. roms_tools/tests/test_setup/test_cdr_forcing.py +54 -0
  27. roms_tools/tests/test_setup/test_cdr_release.py +118 -1
  28. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_east/c/0/0/0 +0 -0
  29. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_north/c/0/0/0 +0 -0
  30. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_west/c/0/0/0 +0 -0
  31. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_east/c/0/0/0 +0 -0
  32. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_north/c/0/0/0 +0 -0
  33. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_west/c/0/0/0 +0 -0
  34. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_east/c/0/0/0 +0 -0
  35. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_north/c/0/0/0 +0 -0
  36. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_west/c/0/0/0 +0 -0
  37. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_east/c/0/0/0 +0 -0
  38. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_north/c/0/0/0 +0 -0
  39. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_west/c/0/0/0 +0 -0
  40. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_east/c/0/0/0 +0 -0
  41. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_north/c/0/0/0 +0 -0
  42. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_west/c/0/0/0 +0 -0
  43. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_east/c/0/0/0 +0 -0
  44. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_north/c/0/0/0 +0 -0
  45. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_west/c/0/0/0 +0 -0
  46. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_east/c/0/0/0 +0 -0
  47. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_north/c/0/0/0 +0 -0
  48. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_west/c/0/0/0 +0 -0
  49. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_east/c/0/0/0 +0 -0
  50. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_north/c/0/0/0 +0 -0
  51. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_west/c/0/0/0 +0 -0
  52. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_east/c/0/0/0 +0 -0
  53. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_north/c/0/0/0 +0 -0
  54. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_west/c/0/0/0 +0 -0
  55. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_east/c/0/0/0 +0 -0
  56. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_north/c/0/0/0 +0 -0
  57. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_west/c/0/0/0 +0 -0
  58. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_east/c/0/0/0 +0 -0
  59. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_north/c/0/0/0 +0 -0
  60. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_west/c/0/0/0 +0 -0
  61. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_east/c/0/0/0 +0 -0
  62. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_north/c/0/0/0 +0 -0
  63. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_west/c/0/0/0 +0 -0
  64. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_east/c/0/0/0 +0 -0
  65. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_north/c/0/0/0 +0 -0
  66. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_west/c/0/0/0 +0 -0
  67. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_east/c/0/0/0 +0 -0
  68. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_north/c/0/0/0 +0 -0
  69. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_west/c/0/0/0 +0 -0
  70. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_east/c/0/0/0 +0 -0
  71. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_north/c/0/0/0 +0 -0
  72. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_west/c/0/0/0 +0 -0
  73. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_east/c/0/0/0 +0 -0
  74. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_north/c/0/0/0 +0 -0
  75. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_west/c/0/0/0 +0 -0
  76. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_east/c/0/0/0 +0 -0
  77. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_north/c/0/0/0 +0 -0
  78. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_west/c/0/0/0 +0 -0
  79. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_east/c/0/0/0 +0 -0
  80. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_north/c/0/0/0 +0 -0
  81. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_west/c/0/0/0 +0 -0
  82. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_east/c/0/0/0 +0 -0
  83. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_north/c/0/0/0 +0 -0
  84. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_west/c/0/0/0 +0 -0
  85. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_east/c/0/0/0 +0 -0
  86. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_north/c/0/0/0 +0 -0
  87. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_west/c/0/0/0 +0 -0
  88. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_east/c/0/0/0 +0 -0
  89. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_north/c/0/0/0 +0 -0
  90. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_west/c/0/0/0 +0 -0
  91. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_east/c/0/0/0 +0 -0
  92. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_north/c/0/0/0 +0 -0
  93. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_west/c/0/0/0 +0 -0
  94. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_east/c/0/0/0 +0 -0
  95. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_north/c/0/0/0 +0 -0
  96. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_west/c/0/0/0 +0 -0
  97. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_east/c/0/0/0 +0 -0
  98. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_north/c/0/0/0 +0 -0
  99. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_west/c/0/0/0 +0 -0
  100. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_east/c/0/0/0 +0 -0
  101. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_north/c/0/0/0 +0 -0
  102. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_west/c/0/0/0 +0 -0
  103. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_east/c/0/0/0 +0 -0
  104. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_north/c/0/0/0 +0 -0
  105. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_west/c/0/0/0 +0 -0
  106. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_east/c/0/0/0 +0 -0
  107. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_north/c/0/0/0 +0 -0
  108. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_west/c/0/0/0 +0 -0
  109. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_east/c/0/0/0 +0 -0
  110. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_north/c/0/0/0 +0 -0
  111. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_west/c/0/0/0 +0 -0
  112. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_east/c/0/0/0 +0 -0
  113. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_north/c/0/0/0 +0 -0
  114. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_west/c/0/0/0 +0 -0
  115. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_east/c/0/0/0 +0 -0
  116. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_north/c/0/0/0 +0 -0
  117. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_west/c/0/0/0 +0 -0
  118. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_east/c/0/0/0 +0 -0
  119. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_north/c/0/0/0 +0 -0
  120. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_west/c/0/0/0 +0 -0
  121. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zarr.json +406 -406
  122. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_east/c/0/0/0 +0 -0
  123. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_north/c/0/0/0 +0 -0
  124. roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_west/c/0/0/0 +0 -0
  125. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/salt_east/c/0/0/0 +0 -0
  126. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/salt_north/c/0/0/0 +0 -0
  127. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/salt_south/c/0/0/0 +0 -0
  128. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/salt_west/c/0/0/0 +0 -0
  129. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/temp_east/c/0/0/0 +0 -0
  130. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/temp_north/c/0/0/0 +0 -0
  131. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/temp_south/c/0/0/0 +0 -0
  132. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/temp_west/c/0/0/0 +0 -0
  133. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/u_east/c/0/0/0 +0 -0
  134. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/u_north/c/0/0/0 +0 -0
  135. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/u_south/c/0/0/0 +0 -0
  136. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/u_west/c/0/0/0 +0 -0
  137. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/ubar_east/c/0/0 +0 -0
  138. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/ubar_north/c/0/0 +0 -0
  139. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/ubar_south/c/0/0 +0 -0
  140. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/ubar_west/c/0/0 +0 -0
  141. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/v_east/c/0/0/0 +0 -0
  142. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/v_north/c/0/0/0 +0 -0
  143. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/v_south/c/0/0/0 +0 -0
  144. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/v_west/c/0/0/0 +0 -0
  145. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/vbar_east/c/0/0 +0 -0
  146. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/vbar_north/c/0/0 +0 -0
  147. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/vbar_south/c/0/0 +0 -0
  148. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/vbar_west/c/0/0 +0 -0
  149. roms_tools/tests/test_setup/test_data/boundary_forcing.zarr/zarr.json +182 -182
  150. roms_tools/tests/test_setup/test_data/grid.zarr/h/c/0/0 +0 -0
  151. roms_tools/tests/test_setup/test_data/grid.zarr/zarr.json +191 -191
  152. roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/h/c/0/0 +0 -0
  153. roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/zarr.json +210 -210
  154. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/ALK/c/0/0/0/0 +0 -0
  155. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/ALK_ALT_CO2/c/0/0/0/0 +0 -0
  156. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DIC/c/0/0/0/0 +0 -0
  157. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DIC_ALT_CO2/c/0/0/0/0 +0 -0
  158. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DOC/c/0/0/0/0 +0 -0
  159. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DOCr/c/0/0/0/0 +0 -0
  160. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DON/c/0/0/0/0 +0 -0
  161. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DONr/c/0/0/0/0 +0 -0
  162. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DOP/c/0/0/0/0 +0 -0
  163. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/DOPr/c/0/0/0/0 +0 -0
  164. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/Fe/c/0/0/0/0 +0 -0
  165. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/Lig/c/0/0/0/0 +0 -0
  166. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/NH4/c/0/0/0/0 +0 -0
  167. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/NO3/c/0/0/0/0 +0 -0
  168. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/O2/c/0/0/0/0 +0 -0
  169. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/PO4/c/0/0/0/0 +0 -0
  170. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/SiO3/c/0/0/0/0 +0 -0
  171. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diatC/c/0/0/0/0 +0 -0
  172. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diatChl/c/0/0/0/0 +0 -0
  173. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diatFe/c/0/0/0/0 +0 -0
  174. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diatP/c/0/0/0/0 +0 -0
  175. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diatSi/c/0/0/0/0 +0 -0
  176. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diazC/c/0/0/0/0 +0 -0
  177. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diazChl/c/0/0/0/0 +0 -0
  178. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diazFe/c/0/0/0/0 +0 -0
  179. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/diazP/c/0/0/0/0 +0 -0
  180. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/salt/c/0/0/0/0 +0 -0
  181. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/spC/c/0/0/0/0 +0 -0
  182. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/spCaCO3/c/0/0/0/0 +0 -0
  183. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/spChl/c/0/0/0/0 +0 -0
  184. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/spFe/c/0/0/0/0 +0 -0
  185. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/spP/c/0/0/0/0 +0 -0
  186. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/temp/c/0/0/0/0 +0 -0
  187. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/u/c/0/0/0/0 +0 -0
  188. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/ubar/c/0/0/0 +0 -0
  189. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/v/c/0/0/0/0 +0 -0
  190. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/vbar/c/0/0/0 +0 -0
  191. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/zarr.json +182 -182
  192. roms_tools/tests/test_setup/test_data/initial_conditions_with_bgc_from_climatology.zarr/zooC/c/0/0/0/0 +0 -0
  193. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/salt/c/0/0/0/0 +0 -0
  194. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/temp/c/0/0/0/0 +0 -0
  195. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/u/c/0/0/0/0 +0 -0
  196. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/ubar/c/0/0/0 +0 -0
  197. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/v/c/0/0/0/0 +0 -0
  198. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/vbar/c/0/0/0 +0 -0
  199. roms_tools/tests/test_setup/test_data/initial_conditions_with_unified_bgc_from_climatology.zarr/zarr.json +187 -187
  200. roms_tools/tests/test_setup/test_data/tidal_forcing.zarr/u_Im/c/0/0/0 +0 -0
  201. roms_tools/tests/test_setup/test_data/tidal_forcing.zarr/u_Re/c/0/0/0 +0 -0
  202. roms_tools/tests/test_setup/test_data/tidal_forcing.zarr/v_Im/c/0/0/0 +0 -0
  203. roms_tools/tests/test_setup/test_data/tidal_forcing.zarr/v_Re/c/0/0/0 +0 -0
  204. roms_tools/tests/test_setup/test_data/tidal_forcing.zarr/zarr.json +66 -66
  205. roms_tools/tests/test_setup/test_grid.py +236 -115
  206. roms_tools/tests/test_setup/test_initial_conditions.py +94 -41
  207. roms_tools/tests/test_setup/{test_datasets.py → test_lat_lon_datasets.py} +409 -100
  208. roms_tools/tests/test_setup/test_nesting.py +119 -31
  209. roms_tools/tests/test_setup/test_river_datasets.py +48 -0
  210. roms_tools/tests/test_setup/test_surface_forcing.py +2 -1
  211. roms_tools/tests/test_setup/test_utils.py +92 -2
  212. roms_tools/tests/test_setup/utils.py +71 -0
  213. roms_tools/tests/test_tiling/test_join.py +241 -0
  214. roms_tools/tests/test_utils.py +139 -17
  215. roms_tools/tiling/join.py +189 -0
  216. roms_tools/utils.py +131 -99
  217. {roms_tools-3.1.2.dist-info → roms_tools-3.3.0.dist-info}/METADATA +12 -2
  218. {roms_tools-3.1.2.dist-info → roms_tools-3.3.0.dist-info}/RECORD +221 -211
  219. {roms_tools-3.1.2.dist-info → roms_tools-3.3.0.dist-info}/WHEEL +0 -0
  220. {roms_tools-3.1.2.dist-info → roms_tools-3.3.0.dist-info}/licenses/LICENSE +0 -0
  221. {roms_tools-3.1.2.dist-info → roms_tools-3.3.0.dist-info}/top_level.txt +0 -0
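The diff below covers the renamed module `roms_tools/setup/{datasets.py → lat_lon_datasets.py}` (entry 13 above), in which the `Dataset` base class becomes `LatLonDataset` and the river classes move out to the new `roms_tools/setup/river_datasets.py` (entry 16). A minimal migration sketch for downstream code, assuming the class names shown in the diff are the publicly importable ones:

```python
# roms-tools 3.1.2 (old):
#   from roms_tools.setup.datasets import Dataset, GLORYSDataset
# roms-tools 3.3.0 (new), per the rename in this diff:
from roms_tools.setup.lat_lon_datasets import GLORYSDataset, LatLonDataset

# River classes were relocated, not removed (assumed export name):
from roms_tools.setup.river_datasets import RiverDataset
```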
@@ -1,13 +1,16 @@
+ from __future__ import annotations
+
  import importlib.util
- import logging
- import time
- from collections import Counter, defaultdict
- from collections.abc import Callable
+ import typing
+ from collections.abc import Callable, Mapping
  from dataclasses import dataclass, field
- from datetime import datetime, timedelta
+ from datetime import datetime
  from pathlib import Path
  from types import ModuleType
- from typing import ClassVar
+ from typing import Any, ClassVar, Literal, cast
+
+ if typing.TYPE_CHECKING:
+     from roms_tools.setup.grid import Grid

  import numpy as np
  import xarray as xr
@@ -15,27 +18,38 @@ import xarray as xr
  from roms_tools.constants import R_EARTH
  from roms_tools.download import (
      download_correction_data,
-     download_river_data,
      download_sal_data,
      download_topo,
  )
  from roms_tools.setup.fill import LateralFill
  from roms_tools.setup.utils import (
+     Timed,
      assign_dates_to_climatology,
-     convert_cftime_to_datetime,
-     gc_dist,
-     get_time_type,
+     check_dataset,
+     get_target_coords,
      interpolate_cyclic_time,
-     interpolate_from_climatology,
      one_dim_fill,
+     select_relevant_times,
  )
- from roms_tools.utils import _get_pkg_error_msg, _has_gcsfs, _load_data
+ from roms_tools.utils import get_dask_chunks, get_pkg_error_msg, has_gcsfs, load_data

- # lat-lon datasets
+ TConcatEndTypes = Literal["lower", "upper", "both"]
+ REPO_ROOT = Path(__file__).resolve().parents[2]
+ GLORYS_GLOBAL_GRID_PATH = (
+     REPO_ROOT / "roms_tools" / "data" / "grids" / "GLORYS_global_grid.nc"
+ )
+ DEFAULT_NR_BUFFER_POINTS = (
+     20  # Default number of buffer points for subdomain selection.
+ )
+ # Balances performance and accuracy:
+ # - Too many points → more expensive computations
+ # - Too few points → potential boundary artifacts when lateral refill is performed
+ # See discussion: https://github.com/CWorthy-ocean/roms-tools/issues/153
+ # This default will be applied consistently across all datasets requiring lateral fill.


  @dataclass(kw_only=True)
- class Dataset:
+ class LatLonDataset:
      """Represents forcing data on original grid.

      Parameters
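To make the `DEFAULT_NR_BUFFER_POINTS` trade-off concrete: the buffer is converted into a degree margin by multiplying it with the source grid's resolution, and that margin pads the bounding box used for subdomain selection (see `choose_subdomain` further down). A self-contained sketch, with a GLORYS-like 1/12-degree spacing and the target extent as assumed example values:

```python
lat_min, lat_max = 30.0, 45.0        # hypothetical target-grid latitude extent
resolution = 1.0 / 12.0              # degrees; GLORYS-like spacing (assumed)
buffer_points = 20                   # DEFAULT_NR_BUFFER_POINTS
margin = resolution * buffer_points  # ~1.67 degrees padded onto each side
lat_slice = slice(lat_min - margin, lat_max + margin)
```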
@@ -47,7 +61,7 @@ class Dataset:
          Start time for selecting relevant data. If not provided, no time-based filtering is applied.
      end_time : Optional[datetime], optional
          End time for selecting relevant data. If not provided, the dataset selects the time entry
-         closest to `start_time` within the range `[start_time, start_time + 24 hours]`.
+         closest to `start_time` within the range `[start_time, start_time + 24 hours)`.
          If `start_time` is also not provided, no time-based filtering is applied.
      dim_names: Dict[str, str], optional
          Dictionary specifying the names of dimensions in the dataset.
@@ -62,8 +76,19 @@ class Dataset:
          Indicates whether land values require lateral filling. If `True`, ocean values will be extended into land areas
          to replace NaNs or non-ocean values (such as atmospheric values in ERA5 data). If `False`, it is assumed that
          land values are already correctly assigned, and lateral filling will be skipped. Defaults to `True`.
-     use_dask: bool
+     use_dask: bool, optional
          Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
+     read_zarr: bool, optional
+         If True, use the zarr engine to read the dataset, and don't use mfdataset.
+         Defaults to False.
+     allow_flex_time: bool, optional
+         Controls how strictly the dataset selects a time entry when `end_time` is not provided (relevant for initial conditions):
+
+         - If False (default): requires an exact match to `start_time`. Raises a ValueError if no match exists.
+         - If True: allows a +24h search window after `start_time` and selects the closest available
+           time entry within that window. Raises a ValueError if none are found.
+
+         Only used when `end_time` is None. Has no effect otherwise.
      apply_post_processing: bool
          Indicates whether to post-process the dataset for futher use. Defaults to True.

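In other words, `allow_flex_time` only matters in the initial-conditions case, where a single time entry is picked. A usage sketch (the file path is hypothetical, and constructor arguments beyond those shown rely on the dataclass defaults):

```python
from datetime import datetime
from roms_tools.setup.lat_lon_datasets import GLORYSDataset

# Strict (default): the dataset must contain exactly datetime(2012, 1, 1),
# otherwise a ValueError is raised.
strict = GLORYSDataset(filename="glorys.nc", start_time=datetime(2012, 1, 1))

# Flexible: the closest entry within [start_time, start_time + 24h) is used.
flexible = GLORYSDataset(
    filename="glorys.nc",
    start_time=datetime(2012, 1, 1),
    allow_flex_time=True,
)
```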
@@ -94,14 +119,15 @@ class Dataset:
          }
      )
      var_names: dict[str, str]
-     opt_var_names: dict[str, str] | None = field(default_factory=dict)
-     climatology: bool | None = False
+     opt_var_names: dict[str, str] = field(default_factory=dict)
+     climatology: bool = False
      needs_lateral_fill: bool | None = True
-     use_dask: bool | None = False
+     use_dask: bool = False
+     read_zarr: bool = False
+     allow_flex_time: bool = False
      apply_post_processing: bool | None = True
-     read_zarr: bool | None = False
-     ds_loader_fn: Callable[[], xr.Dataset] | None = None

+     ds_loader_fn: Callable[[], xr.Dataset] | None = None
      is_global: bool = field(init=False, repr=False)
      ds: xr.Dataset = field(init=False, repr=False)

@@ -172,17 +198,17 @@ class Dataset:
          ValueError
              If a list of files is provided but self.dim_names["time"] is not available or use_dask=False.
          """
-         ds = _load_data(
-             self.filename,
-             self.dim_names,
-             self.use_dask or False,
-             read_zarr=self.read_zarr or False,
+         ds = load_data(
+             filename=self.filename,
+             dim_names=self.dim_names,
+             use_dask=self.use_dask,
+             read_zarr=self.read_zarr,
              ds_loader_fn=self.ds_loader_fn,
          )

          return ds

-     def clean_up(self, ds: xr.Dataset, **kwargs) -> xr.Dataset:
+     def clean_up(self, ds: xr.Dataset) -> xr.Dataset:
          """Dummy method to be overridden by child classes to clean up the dataset.

          This method is intended as a placeholder and should be implemented in subclasses
@@ -213,9 +239,9 @@ class Dataset:
          ValueError
              If the dataset does not contain the specified variables or dimensions.
          """
-         _check_dataset(ds, self.dim_names, self.var_names)
+         check_dataset(ds, self.dim_names, self.var_names)

-     def select_relevant_fields(self, ds) -> xr.Dataset:
+     def select_relevant_fields(self, ds: xr.Dataset) -> xr.Dataset:
          """Selects and returns a subset of the dataset containing only the variables
          specified in `self.var_names`.

@@ -258,7 +284,7 @@ class Dataset:
          """
          return ds

-     def select_relevant_times(self, ds) -> xr.Dataset:
+     def select_relevant_times(self, ds: xr.Dataset) -> xr.Dataset:
          """Select a subset of the dataset based on the specified time range.

          This method filters the dataset to include all records between `start_time` and `end_time`.
@@ -266,7 +292,7 @@ class Dataset:
          after `end_time` are included, even if they fall outside the strict time range.

          If no `end_time` is specified, the method will select the time range of
-         [start_time, start_time + 24 hours] and return the closest time entry to `start_time` within that range.
+         [start_time, start_time + 24 hours) and return the closest time entry to `start_time` within that range.

          Parameters
          ----------
@@ -305,8 +331,17 @@ class Dataset:
          """
          time_dim = self.dim_names["time"]

-         ds = _select_relevant_times(
-             ds, time_dim, self.start_time, self.end_time, self.climatology
+         # Ensure start_time is not None for type safety
+         if self.start_time is None:
+             raise ValueError("select_relevant_times called but start_time is None.")
+
+         ds = select_relevant_times(
+             ds,
+             time_dim,
+             self.start_time,
+             self.end_time,
+             self.climatology,
+             self.allow_flex_time,
          )

          return ds
@@ -353,7 +388,7 @@ class Dataset:

          return ds

-     def infer_horizontal_resolution(self, ds: xr.Dataset):
+     def infer_horizontal_resolution(self, ds: xr.Dataset) -> None:
          """Estimate and set the average horizontal resolution of a dataset based on
          latitude and longitude spacing.

@@ -381,7 +416,7 @@ class Dataset:
          # Set the computed resolution as an attribute
          self.resolution = resolution

-     def compute_minimal_grid_spacing(self, ds: xr.Dataset):
+     def compute_minimal_grid_spacing(self, ds: xr.Dataset) -> float:
          """Compute the minimal grid spacing in a dataset based on latitude and longitude
          spacing, considering Earth's radius.

@@ -443,7 +478,12 @@ class Dataset:

          return is_global

-     def concatenate_longitudes(self, ds, end="upper", verbose=False):
+     def concatenate_longitudes(
+         self,
+         ds: xr.Dataset,
+         end: TConcatEndTypes = "upper",
+         verbose: bool = False,
+     ) -> xr.Dataset:
          """Concatenates fields in dataset twice along the longitude dimension.

          Parameters
@@ -466,58 +506,12 @@ class Dataset:
          ds_concatenated : xr.Dataset
              The concatenated dataset.
          """
-         if verbose:
-             start_time = time.time()
-
-         ds_concatenated = xr.Dataset()
-
-         lon = ds[self.dim_names["longitude"]]
-         if end == "lower":
-             lon_minus360 = lon - 360
-             lon_concatenated = xr.concat(
-                 [lon_minus360, lon], dim=self.dim_names["longitude"]
-             )
-
-         elif end == "upper":
-             lon_plus360 = lon + 360
-             lon_concatenated = xr.concat(
-                 [lon, lon_plus360], dim=self.dim_names["longitude"]
-             )
-
-         elif end == "both":
-             lon_minus360 = lon - 360
-             lon_plus360 = lon + 360
-             lon_concatenated = xr.concat(
-                 [lon_minus360, lon, lon_plus360], dim=self.dim_names["longitude"]
-             )
-
-         for var in ds.data_vars:
-             if self.dim_names["longitude"] in ds[var].dims:
-                 field = ds[var]
-
-                 if end == "both":
-                     field_concatenated = xr.concat(
-                         [field, field, field], dim=self.dim_names["longitude"]
-                     )
-                 else:
-                     field_concatenated = xr.concat(
-                         [field, field], dim=self.dim_names["longitude"]
-                     )
-
-                 if self.use_dask:
-                     field_concatenated = field_concatenated.chunk(
-                         {self.dim_names["longitude"]: -1}
-                     )
-                 field_concatenated[self.dim_names["longitude"]] = lon_concatenated
-                 ds_concatenated[var] = field_concatenated
-             else:
-                 ds_concatenated[var] = ds[var]
-
-         ds_concatenated[self.dim_names["longitude"]] = lon_concatenated
-
-         if verbose:
-             logging.info(
-                 f"Concatenating the data along the longitude dimension: {time.time() - start_time:.3f} seconds"
+         with Timed(
+             "=== Concatenating the data along the longitude dimension ===",
+             verbose=verbose,
+         ):
+             ds_concatenated = _concatenate_longitudes(
+                 ds, self.dim_names, end, self.use_dask
              )

          return ds_concatenated
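The inline implementation above moved into a module-level `_concatenate_longitudes` helper, with timing handled by the new `Timed` context manager. The wrap-around trick itself is unchanged; a standalone sketch of what it does:

```python
import numpy as np
import xarray as xr

lon = xr.DataArray(np.arange(0.0, 360.0, 1.0), dims="lon")
field = xr.DataArray(np.cos(np.deg2rad(lon.values)), dims="lon", coords={"lon": lon})

# end="both": copies of the field at lon - 360 and lon + 360 flank the original,
# so any window (e.g. one straddling the dateline) can be sliced contiguously.
wrapped_lon = xr.concat([lon - 360, lon, lon + 360], dim="lon")
wrapped = xr.concat([field, field, field], dim="lon")
wrapped["lon"] = wrapped_lon

print(wrapped.sel(lon=slice(-30.0, 30.0)).sizes)  # contiguous slice across 0 degrees
```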
@@ -552,14 +546,16 @@ class Dataset:
          ds = self.ds.astype({var: "float64" for var in self.ds.data_vars})
          self.ds = ds

+         return None
+
      def choose_subdomain(
          self,
-         target_coords,
-         buffer_points=20,
-         return_copy=False,
-         return_coords_only=False,
-         verbose=False,
-     ):
+         target_coords: dict[str, Any],
+         buffer_points: int = DEFAULT_NR_BUFFER_POINTS,
+         return_copy: bool = False,
+         return_coords_only: bool = False,
+         verbose: bool = False,
+     ) -> xr.Dataset | LatLonDataset | None:
          """Selects a subdomain from the xarray Dataset based on specified target
          coordinates, extending the selection by a defined buffer. Adjusts longitude
          ranges as necessary to accommodate the dataset's expected range and handles
@@ -596,95 +592,16 @@ class Dataset:
          ValueError
              If the selected latitude or longitude range does not intersect with the dataset.
          """
-         lat_min = target_coords["lat"].min().values
-         lat_max = target_coords["lat"].max().values
-         lon_min = target_coords["lon"].min().values
-         lon_max = target_coords["lon"].max().values
-
-         margin = self.resolution * buffer_points
-
-         # Select the subdomain in latitude direction (so that we have to concatenate fewer latitudes below if concatenation is necessary)
-         subdomain = self.ds.sel(
-             **{
-                 self.dim_names["latitude"]: slice(lat_min - margin, lat_max + margin),
-             }
-         )
-         lon = subdomain[self.dim_names["longitude"]]
-
-         if self.is_global:
-             concats = []
-             # Concatenate only if necessary
-             if lon_max + margin > lon.max():
-                 # See if shifting by +360 degrees helps
-                 if (lon_min - margin > (lon + 360).min()) and (
-                     lon_max + margin < (lon + 360).max()
-                 ):
-                     subdomain[self.dim_names["longitude"]] = lon + 360
-                     lon = subdomain[self.dim_names["longitude"]]
-                 else:
-                     concats.append("upper")
-             if lon_min - margin < lon.min():
-                 # See if shifting by -360 degrees helps
-                 if (lon_min - margin > (lon - 360).min()) and (
-                     lon_max + margin < (lon - 360).max()
-                 ):
-                     subdomain[self.dim_names["longitude"]] = lon - 360
-                     lon = subdomain[self.dim_names["longitude"]]
-                 else:
-                     concats.append("lower")
-
-             if concats:
-                 end = "both" if len(concats) == 2 else concats[0]
-                 subdomain = self.concatenate_longitudes(
-                     subdomain, end=end, verbose=False
-                 )
-                 lon = subdomain[self.dim_names["longitude"]]
-
-         else:
-             # Adjust longitude range if needed to match the expected range
-             if not target_coords["straddle"]:
-                 if lon.min() < -180:
-                     if lon_max + margin > 0:
-                         lon_min -= 360
-                         lon_max -= 360
-                 elif lon.min() < 0:
-                     if lon_max + margin > 180:
-                         lon_min -= 360
-                         lon_max -= 360
-
-             if target_coords["straddle"]:
-                 if lon.max() > 360:
-                     if lon_min - margin < 180:
-                         lon_min += 360
-                         lon_max += 360
-                 elif lon.max() > 180:
-                     if lon_min - margin < 0:
-                         lon_min += 360
-                         lon_max += 360
-         # Select the subdomain in longitude direction
-
-         subdomain = subdomain.sel(
-             **{
-                 self.dim_names["longitude"]: slice(lon_min - margin, lon_max + margin),
-             }
+         subdomain = choose_subdomain(
+             ds=self.ds,
+             dim_names=self.dim_names,
+             resolution=self.resolution,
+             is_global=self.is_global,
+             target_coords=target_coords,
+             buffer_points=buffer_points,
+             use_dask=self.use_dask,
          )

-         # Check if the selected subdomain has zero dimensions in latitude or longitude
-         if subdomain[self.dim_names["latitude"]].size == 0:
-             raise ValueError("Selected latitude range does not intersect with dataset.")
-
-         if subdomain[self.dim_names["longitude"]].size == 0:
-             raise ValueError(
-                 "Selected longitude range does not intersect with dataset."
-             )
-
-         # Adjust longitudes to expected range if needed
-         lon = subdomain[self.dim_names["longitude"]]
-         if target_coords["straddle"]:
-             subdomain[self.dim_names["longitude"]] = xr.where(lon > 180, lon - 360, lon)
-         else:
-             subdomain[self.dim_names["longitude"]] = xr.where(lon < 0, lon + 360, lon)
-
          if return_coords_only:
              # Create and return a dataset with only latitudes and longitudes
              coords_ds = subdomain[
@@ -693,9 +610,10 @@ class Dataset:
              return coords_ds

          if return_copy:
-             return Dataset.from_ds(self, subdomain)
+             return LatLonDataset.from_ds(self, subdomain)
          else:
              self.ds = subdomain
+             return None

      def apply_lateral_fill(self):
          """Apply lateral fill to variables using the dataset's mask and grid dimensions.
@@ -715,10 +633,6 @@ class Dataset:
          point to the same variable in the dataset.
          """
          if self.needs_lateral_fill:
-             logging.info(
-                 "Applying 2D horizontal fill to the source data before regridding."
-             )
-
              lateral_fill = LateralFill(
                  self.ds["mask"],
                  [self.dim_names["latitude"], self.dim_names["longitude"]],
@@ -749,10 +663,6 @@ class Dataset:
              else:
                  # Apply standard lateral fill for other variables
                  self.ds[var_name] = lateral_fill.apply(self.ds[var_name])
-         else:
-             logging.info(
-                 "2D horizontal fill is skipped because source data already contains filled values."
-             )

      def extrapolate_deepest_to_bottom(self):
          """Extrapolate deepest non-NaN values to fill bottom NaNs along the depth
@@ -769,8 +679,8 @@ class Dataset:
          )

      @classmethod
-     def from_ds(cls, original_dataset: "Dataset", ds: xr.Dataset) -> "Dataset":
-         """Substitute the internal dataset of a Dataset object with a new xarray
+     def from_ds(cls, original_dataset: LatLonDataset, ds: xr.Dataset) -> LatLonDataset:
+         """Substitute the internal dataset of a LatLonDataset object with a new xarray
          Dataset.

          This method creates a new Dataset instance, bypassing the usual `__init__`
@@ -780,18 +690,18 @@ class Dataset:

          Parameters
          ----------
-         original_dataset : Dataset
-             The original Dataset instance from which attributes will be copied.
+         original_dataset : LatLonDataset
+             The original LatLonDataset instance from which attributes will be copied.
          ds : xarray.Dataset
              The new xarray Dataset to assign to the `ds` attribute of the new instance.

          Returns
          -------
-         Dataset
+         LatLonDataset
              A new Dataset instance with the `ds` attribute set to the provided dataset
              and other attributes copied from the original instance.
          """
-         # Create a new Dataset instance without calling __init__ or __post_init__
+         # Create a new LatLonDataset instance without calling __init__ or __post_init__
          dataset = cls.__new__(cls)

          # Directly set the provided dataset as the 'ds' attribute
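A short usage sketch for `from_ds` — given an existing instance `original` (assumed to have been constructed normally), it clones the wrapper around a derived `xr.Dataset` without re-running `__init__`:

```python
# Keeps dim_names, var_names, resolution, etc. from the original instance;
# `original` is a hypothetical, already-initialized LatLonDataset.
first_step = original.ds.isel(time=slice(0, 1))   # illustrative subset
clone = LatLonDataset.from_ds(original, first_step)
```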
@@ -806,7 +716,7 @@ class Dataset:


  @dataclass(kw_only=True)
- class TPXODataset(Dataset):
+ class TPXODataset(LatLonDataset):
      """Represents tidal data on the original grid from the TPXO dataset.

      Parameters
@@ -871,7 +781,7 @@ class TPXODataset(Dataset):
          ValueError
              If longitude or latitude values do not match the grid.
          """
-         ds_grid = _load_data(self.grid_filename, self.dim_names, self.use_dask)
+         ds_grid = load_data(self.grid_filename, self.dim_names, self.use_dask)

          # Define mask and coordinate names based on location
          if self.location == "h":
@@ -902,21 +812,13 @@ class TPXODataset(Dataset):

          # Drop all dimensions except 'longitude' and 'latitude'
          dims_to_keep = {"longitude", "latitude"}
-         dims_to_drop = [dim for dim in ds_grid.dims if dim not in dims_to_keep]
+         dims_to_drop: set[str] = set(ds_grid.dims) - dims_to_keep
          if dims_to_drop:
              ds_grid = ds_grid.isel({dim: 0 for dim in dims_to_drop})

          # Ensure correct dimension order
          ds_grid = ds_grid.transpose("latitude", "longitude")

-         dims_to_keep = {"longitude", "latitude"}
-         dims_to_drop = set(ds_grid.dims) - dims_to_keep
-         ds_grid = (
-             ds_grid.isel({dim: 0 for dim in dims_to_drop}) if dims_to_drop else ds_grid
-         )
-         # Bring dimensions in correct order
-         ds_grid = ds_grid.transpose("latitude", "longitude")
-
          ds = ds.rename({"con": "nc"})
          ds = ds.assign_coords(
              {
@@ -1029,7 +931,7 @@ class TPXODataset(Dataset):


  @dataclass(kw_only=True)
- class GLORYSDataset(Dataset):
+ class GLORYSDataset(LatLonDataset):
      """Represents GLORYS data on original grid."""

      var_names: dict[str, str] = field(
@@ -1051,7 +953,7 @@ class GLORYSDataset(Dataset):
          }
      )

-     climatology: bool | None = False
+     climatology: bool = False

      def post_process(self):
          """Apply a mask to the dataset based on the 'zeta' variable, with 0 where 'zeta'
@@ -1067,19 +969,29 @@ class GLORYSDataset(Dataset):
          None
              The dataset is modified in-place by applying the mask to each variable.
          """
-         mask = xr.where(
-             self.ds[self.var_names["zeta"]].isel({self.dim_names["time"]: 0}).isnull(),
-             0,
-             1,
-         )
-         mask_vel = xr.where(
-             self.ds[self.var_names["u"]]
-             .isel({self.dim_names["time"]: 0, self.dim_names["depth"]: 0})
-             .isnull(),
-             0,
-             1,
-         )
+         zeta = self.ds[self.var_names["zeta"]]
+         u = self.ds[self.var_names["u"]]
+
+         # Select time=0 if time dimension exists, otherwise use data as-is
+         if self.dim_names["time"] in zeta.dims:
+             zeta_ref = zeta.isel({self.dim_names["time"]: 0})
+         else:
+             zeta_ref = zeta
+
+         if self.dim_names["time"] in u.dims:
+             u_ref = u.isel({self.dim_names["time"]: 0})
+         else:
+             u_ref = u

+         # Also handle depth for velocity
+         if self.dim_names["depth"] in u_ref.dims:
+             u_ref = u_ref.isel({self.dim_names["depth"]: 0})
+
+         # Create masks
+         mask = xr.where(zeta_ref.isnull(), 0, 1)
+         mask_vel = xr.where(u_ref.isnull(), 0, 1)
+
+         # Save to dataset
          self.ds["mask"] = mask
          self.ds["mask_vel"] = mask_vel

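The rewritten `post_process` builds the same NaN-based land masks but no longer assumes that time (or depth) dimensions are present. The core `xr.where` step, checked on a toy field without a time dimension:

```python
import numpy as np
import xarray as xr

zeta = xr.DataArray([[0.1, np.nan], [0.2, 0.3]], dims=("latitude", "longitude"))
mask = xr.where(zeta.isnull(), 0, 1)  # 0 over land (NaN), 1 over ocean
print(mask.values)                    # [[1 0]
                                      #  [1 1]]
```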
@@ -1130,7 +1042,7 @@ class GLORYSDefaultDataset(GLORYSDataset):

          spec = importlib.util.find_spec(package_name)
          if not spec:
-             msg = _get_pkg_error_msg("cloud-based GLORYS data", package_name, "stream")
+             msg = get_pkg_error_msg("cloud-based GLORYS data", package_name, "stream")
              raise RuntimeError(msg)

          try:
@@ -1151,18 +1063,40 @@ class GLORYSDefaultDataset(GLORYSDataset):
              The streaming dataset
          """
          copernicusmarine = self._load_copernicus()
-         return copernicusmarine.open_dataset(
+
+         # ds = copernicusmarine.download_functions.download_zarr.open_dataset_from_arco_series(
+         #     dataset_url="https://s3.waw3-1.cloudferro.com/mdl-arco-geo-025/arco/GLOBAL_MULTIYEAR_PHY_001_030/cmems_mod_glo_phy_my_0.083deg_P1D-m_202311/geoChunked.zarr",
+         #     variables=["thetao", "so", "uo", "vo", "zos"],
+         #     geographical_parameters=copernicusmarine.download_functions.subset_parameters.GeographicalParameters(),
+         #     temporal_parameters=copernicusmarine.download_functions.subset_parameters.TemporalParameters(
+         #         start_datetime=self.start_time, end_datetime=self.end_time
+         #     ),
+         #     depth_parameters=copernicusmarine.download_functions.subset_parameters.DepthParameters(),
+         #     coordinates_selection_method="outside",
+         #     optimum_dask_chunking={
+         #         "time": 1,
+         #         "depth": -1,
+         #         "latitude": -1,
+         #         "longitude": -1,
+         #     },
+         # )
+
+         ds = copernicusmarine.open_dataset(
              self.dataset_name,
              start_datetime=self.start_time,
              end_datetime=self.end_time,
              service="arco-geo-series",
-             coordinates_selection_method="inside",
-             chunk_size_limit=2,
+             coordinates_selection_method="outside",
+             chunk_size_limit=-1,
          )
+         chunks = get_dask_chunks(self.dim_names)
+         ds = ds.chunk(chunks)
+
+         return ds


  @dataclass(kw_only=True)
- class UnifiedDataset(Dataset):
+ class UnifiedDataset(LatLonDataset):
      """Represents unified BGC data on original grid.

      Notes
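Together with `coordinates_selection_method="outside"` and the disabled `chunk_size_limit`, the streaming hunk above now rechunks explicitly after opening. Judging by the commented-out block it replaces, `get_dask_chunks` is expected to yield one time step per chunk with full spatial slabs (its exact return value is assumed here):

```python
# Equivalent manual rechunk: -1 keeps a dimension in a single chunk.
chunks = {"time": 1, "depth": -1, "latitude": -1, "longitude": -1}
ds = ds.chunk(chunks)
```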
@@ -1285,7 +1219,7 @@ class UnifiedBGCDataset(UnifiedDataset):
          }
      )

-     climatology: bool | None = True
+     climatology: bool = True


  @dataclass(kw_only=True)
@@ -1307,11 +1241,11 @@ class UnifiedBGCSurfaceDataset(UnifiedDataset):
          }
      )

-     climatology: bool | None = True
+     climatology: bool = True


  @dataclass(kw_only=True)
- class CESMDataset(Dataset):
+ class CESMDataset(LatLonDataset):
      """Represents CESM data on original grid."""

      # overwrite clean_up method from parent class
@@ -1422,9 +1356,9 @@ class CESMBGCDataset(CESMDataset):
          }
      )

-     climatology: bool | None = False
+     climatology: bool = False

-     def post_process(self):
+     def post_process(self) -> None:
          """
          Processes and converts CESM data values as follows:
          - Convert depth values from cm to m.
@@ -1493,9 +1427,9 @@ class CESMBGCSurfaceForcingDataset(CESMDataset):
          }
      )

-     climatology: bool | None = False
+     climatology: bool = False

-     def post_process(self):
+     def post_process(self) -> None:
          """Perform post-processing on the dataset to remove specific variables.

          This method checks if the variable "z_t" exists in the dataset. If it does,
@@ -1518,7 +1452,7 @@ class CESMBGCSurfaceForcingDataset(CESMDataset):


  @dataclass(kw_only=True)
- class ERA5Dataset(Dataset):
+ class ERA5Dataset(LatLonDataset):
      """Represents ERA5 data on original grid."""

      var_names: dict[str, str] = field(
@@ -1542,9 +1476,9 @@ class ERA5Dataset(Dataset):
          }
      )

-     climatology: bool | None = False
+     climatology: bool = False

-     def post_process(self):
+     def post_process(self) -> None:
          """
          Processes and converts ERA5 data values as follows:
          - Convert radiation values from J/m^2 to W/m^2.
@@ -1632,17 +1566,17 @@ class ERA5ARCODataset(ERA5Dataset):
          }
      )

-     def __post_init__(self):
+     def __post_init__(self) -> None:
          self.read_zarr = True
-         if not _has_gcsfs():
-             msg = _get_pkg_error_msg("cloud-based ERA5 data", "gcsfs", "stream")
+         if not has_gcsfs():
+             msg = get_pkg_error_msg("cloud-based ERA5 data", "gcsfs", "stream")
              raise RuntimeError(msg)

          super().__post_init__()


  @dataclass(kw_only=True)
- class ERA5Correction(Dataset):
+ class ERA5Correction(LatLonDataset):
      """Global dataset to correct ERA5 radiation.

      The dataset contains multiplicative correction factors for the ERA5 shortwave
@@ -1664,9 +1598,9 @@ class ERA5Correction(Dataset):
              "time": "time",
          }
      )
-     climatology: bool | None = True
+     climatology: bool = True

-     def __post_init__(self):
+     def __post_init__(self) -> None:
          if not self.climatology:
              raise NotImplementedError(
                  "Correction data must be a climatology. Set climatology to True."
@@ -1674,32 +1608,31 @@ class ERA5Correction(Dataset):
              )
          super().__post_init__()

-     def choose_subdomain(self, target_coords, straddle: bool):
-         """Converts longitude values in the dataset if necessary and selects a subdomain
-         based on the specified coordinates.
+     def match_subdomain(self, target_coords: dict[str, Any]) -> None:
+         """
+         Selects a subdomain from the dataset matching the specified coordinates.

-         This method converts longitude values between different ranges if required and then extracts a subset of the
-         dataset according to the given coordinates. It updates the dataset in place to reflect the selected subdomain.
+         This method extracts a subset of the dataset (`self.ds`) based on given latitude
+         and longitude values. If the dataset spans the globe, it concatenates longitudes
+         to ensure seamless wrapping.

          Parameters
          ----------
-         target_coords : dict
-             A dictionary specifying the target coordinates for selecting the subdomain. Keys should correspond to the
-             dimension names of the dataset (e.g., latitude and longitude), and values should be the desired ranges or
-             specific coordinate values.
-         straddle : bool
-             If True, assumes that target longitudes are in the range [-180, 180]. If False, assumes longitudes are in the
-             range [0, 360]. This parameter determines how longitude values are converted if necessary.
+         target_coords : dict[str, Any]
+             A dictionary containing the target latitude and longitude values to select.
+             Expected keys: "lat" and "lon", each mapped to a DataArray of coordinates.

          Raises
          ------
          ValueError
-             If the specified subdomain does not fully contain the specified latitude or longitude values. This can occur
-             if the dataset does not cover the full range of provided coordinates.
+             If the selected subdomain does not contain all specified latitude or
+             longitude values.

          Notes
          -----
-         - The dataset (`self.ds`) is updated in place to reflect the chosen subdomain.
+         - The dataset (`self.ds`) is updated in place.
+         - Assumes latitude values in `target_coords["lat"]` are within dataset bounds.
+         - For global datasets, longitude concatenation is applied unconditionally.
          """
          # Select the subdomain in latitude direction (so that we have to concatenate fewer latitudes below if concatenation is performed)
          subdomain = self.ds.sel({self.dim_names["latitude"]: target_coords["lat"]})
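Unlike `choose_subdomain`, which slices a range plus buffer, `match_subdomain` selects at exactly the target grid's coordinates. A self-contained sketch of that selection pattern (variable and dimension names are illustrative; `eta_rho` is a typical ROMS dimension):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"swr_corr": ("latitude", np.ones(181))},  # hypothetical correction factor
    coords={"latitude": np.linspace(-90.0, 90.0, 181)},
)
target_lat = xr.DataArray(np.array([10.0, 11.0, 12.0]), dims="eta_rho")

# Exact-coordinate selection: every target latitude must exist in the dataset,
# unlike the range-plus-buffer slicing done by choose_subdomain.
subdomain = ds.sel(latitude=target_lat)
print(subdomain.sizes)  # {'eta_rho': 3}
```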
@@ -1726,7 +1659,7 @@ class ERA5Correction(Dataset):


  @dataclass(kw_only=True)
- class ETOPO5Dataset(Dataset):
+ class ETOPO5Dataset(LatLonDataset):
      """Represents topography data on the original grid from the ETOPO5 dataset."""

      filename: str = field(default_factory=lambda: download_topo("etopo5.nc"))
@@ -1762,7 +1695,7 @@ class ETOPO5Dataset(Dataset):


  @dataclass(kw_only=True)
- class SRTM15Dataset(Dataset):
+ class SRTM15Dataset(LatLonDataset):
      """Represents topography data on the original grid from the SRTM15 dataset."""

      var_names: dict[str, str] = field(
@@ -1775,428 +1708,6 @@ class SRTM15Dataset(Dataset):
1775
1708
  )
1776
1709
 
1777
1710
 
1778
- # river datasets
1779
- @dataclass(kw_only=True)
1780
- class RiverDataset:
1781
- """Represents river data.
1782
-
1783
- Parameters
1784
- ----------
1785
- filename : Union[str, Path, List[Union[str, Path]]]
1786
- The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
1787
- or a list of strings or Path objects containing multiple files.
1788
- start_time : datetime
1789
- The start time for selecting relevant data.
1790
- end_time : datetime
1791
- The end time for selecting relevant data.
1792
- dim_names: Dict[str, str]
1793
- Dictionary specifying the names of dimensions in the dataset.
1794
- Requires "station" and "time" as keys.
1795
- var_names: Dict[str, str]
1796
- Dictionary of variable names that are required in the dataset.
1797
- Requires the keys "latitude", "longitude", "flux", "ratio", and "name".
1798
- opt_var_names: Dict[str, str], optional
1799
- Dictionary of variable names that are optional in the dataset.
1800
- Defaults to an empty dictionary.
1801
- climatology : bool
1802
- Indicates whether the dataset is climatological. Defaults to False.
1803
-
1804
- Attributes
1805
- ----------
1806
- ds : xr.Dataset
1807
- The xarray Dataset containing the forcing data on its original grid.
1808
- """
1809
-
1810
- filename: str | Path | list[str | Path]
1811
- start_time: datetime
1812
- end_time: datetime
1813
- dim_names: dict[str, str]
1814
- var_names: dict[str, str]
1815
- opt_var_names: dict[str, str] | None = field(default_factory=dict)
1816
- climatology: bool | None = False
1817
- ds: xr.Dataset = field(init=False, repr=False)
1818
-
1819
- def __post_init__(self):
1820
- # Validate start_time and end_time
1821
- if not isinstance(self.start_time, datetime):
1822
- raise TypeError(
1823
- f"start_time must be a datetime object, but got {type(self.start_time).__name__}."
1824
- )
1825
- if not isinstance(self.end_time, datetime):
1826
- raise TypeError(
1827
- f"end_time must be a datetime object, but got {type(self.end_time).__name__}."
1828
- )
1829
-
1830
- ds = self.load_data()
1831
- ds = self.clean_up(ds)
1832
- self.check_dataset(ds)
1833
- ds = _deduplicate_river_names(
1834
- ds, self.var_names["name"], self.dim_names["station"]
1835
- )
1836
-
1837
- # Select relevant times
1838
- ds = self.add_time_info(ds)
1839
- self.ds = ds
1840
-
1841
- def load_data(self) -> xr.Dataset:
1842
- """Load dataset from the specified file.
1843
-
1844
- Returns
1845
- -------
1846
- ds : xr.Dataset
1847
- The loaded xarray Dataset containing the forcing data.
1848
- """
1849
- ds = _load_data(
1850
- self.filename, self.dim_names, use_dask=False, decode_times=False
1851
- )
1852
-
1853
- return ds
1854
-
1855
- def clean_up(self, ds: xr.Dataset) -> xr.Dataset:
1856
- """Decodes the 'name' variable (if byte-encoded) and updates the dataset.
1857
-
1858
- This method checks if the 'name' variable is of dtype 'object' (i.e., byte-encoded),
1859
- and if so, decodes each byte array to a string and updates the dataset.
1860
- It also ensures that the 'station' dimension is of integer type.
1861
-
1862
-
1863
- Parameters
1864
- ----------
1865
- ds : xr.Dataset
1866
- The dataset containing the 'name' variable to decode.
1867
-
1868
- Returns
1869
- -------
1870
- ds : xr.Dataset
1871
- The dataset with the decoded 'name' variable.
1872
- """
1873
- if ds[self.var_names["name"]].dtype == "object":
1874
- names = []
1875
- for i in range(len(ds[self.dim_names["station"]])):
1876
- byte_array = ds[self.var_names["name"]].isel(
1877
- **{self.dim_names["station"]: i}
1878
- )
1879
- name = decode_string(byte_array)
1880
- names.append(name)
1881
- ds[self.var_names["name"]] = xr.DataArray(
1882
- data=names, dims=self.dim_names["station"]
1883
- )
1884
-
1885
- if ds[self.dim_names["station"]].dtype == "float64":
1886
- ds[self.dim_names["station"]] = ds[self.dim_names["station"]].astype(int)
1887
-
1888
- # Drop all variables that have chars dim
1889
- vars_to_drop = ["ocn_name", "stn_name", "ct_name", "cn_name", "chars"]
1890
- existing_vars = [var for var in vars_to_drop if var in ds]
1891
- ds = ds.drop_vars(existing_vars)
1892
-
1893
- return ds
1894
-
1895
- def check_dataset(self, ds: xr.Dataset) -> None:
1896
- """Validate required variables, dimensions, and uniqueness of river names.
1897
-
1898
- Parameters
1899
- ----------
1900
- ds : xr.Dataset
1901
- The xarray Dataset to check.
1902
-
1903
- Raises
1904
- ------
1905
- ValueError
1906
- If the dataset does not contain the specified variables or dimensions.
1907
- """
1908
- _check_dataset(ds, self.dim_names, self.var_names, self.opt_var_names)
1909
-
1910
- def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
1911
- """Dummy method to be overridden by child classes to add time information to the
1912
- dataset.
1913
-
1914
- This method is intended as a placeholder and should be implemented in subclasses
1915
- to provide specific functionality for adding time-related information to the dataset.
1916
-
1917
- Parameters
1918
- ----------
1919
- ds : xr.Dataset
1920
- The xarray Dataset to which time information will be added.
1921
-
1922
- Returns
1923
- -------
1924
- xr.Dataset
1925
- The xarray Dataset with time information added (as implemented by child classes).
1926
- """
1927
- return ds
1928
-
1929
- def select_relevant_times(self, ds) -> xr.Dataset:
1930
- """Select a subset of the dataset based on the specified time range.
1931
-
1932
- This method filters the dataset to include all records between `start_time` and `end_time`.
1933
- Additionally, it ensures that one record at or before `start_time` and one record at or
1934
- after `end_time` are included, even if they fall outside the strict time range.
1935
-
1936
- If no `end_time` is specified, the method will select the time range of
1937
- [start_time, start_time + 24 hours] and return the closest time entry to `start_time` within that range.
1938
-
1939
- Parameters
1940
- ----------
1941
- ds : xr.Dataset
1942
- The input dataset to be filtered. Must contain a time dimension.
1943
-
1944
- Returns
1945
- -------
1946
- xr.Dataset
1947
- A dataset filtered to the specified time range, including the closest entries
1948
- at or before `start_time` and at or after `end_time` if applicable.
1949
-
1950
- Warns
1951
- -----
1952
- UserWarning
1953
- If no records at or before `start_time` or no records at or after `end_time` are found.
1954
-
1955
- UserWarning
1956
- If the dataset does not contain any time dimension or the time dimension is incorrectly named.
1957
- """
1958
- time_dim = self.dim_names["time"]
1959
-
1960
- ds = _select_relevant_times(ds, time_dim, self.start_time, self.end_time, False)
1961
-
1962
- return ds
1963
-
1964
-    def compute_climatology(self):
-        logging.info("Compute climatology for river forcing.")
-
-        time_dim = self.dim_names["time"]
-
-        flux = self.ds[self.var_names["flux"]].groupby(f"{time_dim}.month").mean()
-        self.ds[self.var_names["flux"]] = flux
-
-        ds = assign_dates_to_climatology(self.ds, "month")
-        ds = ds.swap_dims({"month": "time"})
-        self.ds = ds
-
-        updated_dim_names = {**self.dim_names}
-        updated_dim_names["time"] = "time"
-        self.dim_names = updated_dim_names
-
-        self.climatology = True
-
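For readers skimming the removed `RiverDataset` API: `compute_climatology` collapses the station time series into twelve monthly means before relabeling the `month` dimension as `time`. A minimal sketch of that reduction with plain xarray (toy data and variable names, not the package's fixtures):

```python
import numpy as np
import pandas as pd
import xarray as xr

# Hypothetical daily river flux for two stations over three years.
time = pd.date_range("2000-01-01", "2002-12-31", freq="D")
flux = xr.DataArray(
    np.random.rand(time.size, 2),
    dims=("time", "station"),
    coords={"time": time},
    name="FLOW",
)

# Averaging within each calendar month yields a 12-entry climatology,
# which is what compute_climatology stores before swapping dimensions.
clim = flux.groupby("time.month").mean()
print(clim.sizes)  # month: 12, station: 2
```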
-    def sort_by_river_volume(self, ds: xr.Dataset) -> xr.Dataset:
-        """Sorts the dataset by river volume in descending order (largest rivers first),
-        if the volume variable is available.
-
-        This method uses the river volume to reorder the dataset such that the rivers with
-        the largest volumes come first in the `station` dimension. If the volume variable
-        is not present in the dataset, a warning is logged.
-
-        Parameters
-        ----------
-        ds : xr.Dataset
-            The xarray Dataset containing the river data to be sorted by volume.
-
-        Returns
-        -------
-        xr.Dataset
-            The dataset with rivers sorted by their volume in descending order.
-            If the volume variable is not available, the original dataset is returned.
-        """
-        if "vol" in self.opt_var_names:
-            volume_values = ds[self.opt_var_names["vol"]].values
-            if isinstance(volume_values, np.ndarray):
-                # Check if all volume values are the same
-                if np.all(volume_values == volume_values[0]):
-                    # If all volumes are the same, no need to reverse order
-                    sorted_indices = np.argsort(volume_values)  # Sort in ascending order
-                else:
-                    # If volumes differ, reverse order for descending sort
-                    sorted_indices = np.argsort(volume_values)[::-1]
-
-                ds = ds.isel(**{self.dim_names["station"]: sorted_indices})
-
-            else:
-                logging.warning("The volume data is not in a valid array format.")
-        else:
-            logging.warning(
-                "Cannot sort rivers by volume. 'vol' is missing in the variable names."
-            )
-
-        return ds
-
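`sort_by_river_volume` relies on reversing the `np.argsort` output to get a largest-first station ordering. A self-contained illustration (made-up volumes):

```python
import numpy as np
import xarray as xr

# Toy dataset: three rivers with different long-term volumes (units arbitrary).
ds = xr.Dataset(
    {"vol_stn": ("station", np.array([120.0, 540.0, 80.0]))},
    coords={"station": [0, 1, 2]},
)

# np.argsort sorts ascending; reversing the index array gives the
# descending (largest-first) order that sort_by_river_volume applies.
order = np.argsort(ds["vol_stn"].values)[::-1]
print(ds.isel(station=order)["vol_stn"].values)  # [540. 120.  80.]
```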
-    def extract_relevant_rivers(self, target_coords, dx):
-        """Extracts a subset of the dataset based on the proximity of river mouths to
-        target coordinates.
-
-        This method calculates the distance between each river mouth and the provided target coordinates
-        (latitude and longitude) using the `gc_dist` function. It then filters the dataset to include only those
-        river stations whose minimum distance from the target is less than a specified threshold distance (`dx`).
-
-        Parameters
-        ----------
-        target_coords : dict
-            A dictionary containing the target coordinates for the comparison. It should include:
-            - "lon" (float): The target longitude in degrees.
-            - "lat" (float): The target latitude in degrees.
-            - "straddle" (bool): A flag indicating whether to adjust the longitudes for stations that cross the
-              International Date Line. If `True`, longitudes greater than 180 degrees are adjusted by subtracting 360;
-              otherwise, negative longitudes are adjusted by adding 360.
-        dx : float
-            The maximum distance threshold (in meters) for including a river station. Only river mouths that are
-            within `dx` meters from the target coordinates will be included in the returned dataset.
-
-        Returns
-        -------
-        indices : dict[str, list[tuple]]
-            A dictionary containing the indices of the rivers that are within the threshold distance from
-            the target coordinates. River names are the keys; each value is a list of tuples, each giving
-            the (`eta_rho`, `xi_rho`) grid indices of the river.
-        """
-        # Retrieve longitude and latitude of river mouths
-        river_lon = self.ds[self.var_names["longitude"]]
-        river_lat = self.ds[self.var_names["latitude"]]
-
-        # Adjust longitude based on whether it crosses the International Date Line (straddle case)
-        if target_coords["straddle"]:
-            river_lon = xr.where(river_lon > 180, river_lon - 360, river_lon)
-        else:
-            river_lon = xr.where(river_lon < 0, river_lon + 360, river_lon)
-
-        # Calculate the distance between the target coordinates and each river mouth
-        dist = gc_dist(target_coords["lon"], target_coords["lat"], river_lon, river_lat)
-        dist_min = dist.min(dim=["eta_rho", "xi_rho"])
-        # Filter the dataset to include only stations within the distance threshold
-        if (dist_min < dx).any():
-            ds = self.ds.where(dist_min < dx, drop=True)
-            ds = self.sort_by_river_volume(ds)
-            dist = dist.where(dist_min < dx, drop=True).transpose(
-                self.dim_names["station"], "eta_rho", "xi_rho"
-            )
-
-            river_indices = get_indices_of_nearest_grid_cell_for_rivers(dist, self)
-        else:
-            ds = xr.Dataset()
-            river_indices = {}
-
-        self.ds = ds
-
-        return river_indices
-
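The proximity filter pairs a great-circle distance with an `xr.where(..., drop=True)` mask. The sketch below substitutes a plain haversine for the package's `gc_dist`, whose exact signature is not shown in this hunk:

```python
import numpy as np
import xarray as xr

# Illustrative stand-in for gc_dist: haversine distance in meters.
def haversine(lon1, lat1, lon2, lat2, radius=6371e3):
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    return 2 * radius * np.arcsin(np.sqrt(a))

# Two river mouths vs. a single target point; keep stations within dx.
river_lon = xr.DataArray([-70.0, 10.0], dims="station")
river_lat = xr.DataArray([43.0, 52.0], dims="station")
dist = haversine(-69.5, 43.2, river_lon, river_lat)
dx = 100e3  # 100 km threshold
print(dist.where(dist < dx, drop=True).sizes)  # {'station': 1}
```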
-    def extract_named_rivers(self, indices):
-        """Extracts a subset of the dataset based on the provided river names in the
-        indices dictionary.
-
-        This method filters the dataset to include only the rivers specified in the `indices` dictionary.
-        The resulting subset is stored in the `ds` attribute of the class.
-
-        Parameters
-        ----------
-        indices : dict
-            A dictionary where the keys are river names (strings) and the values are dictionaries
-            containing river-related data (e.g., river indices, coordinates).
-
-        Returns
-        -------
-        None
-            The method modifies the `self.ds` attribute in place, setting it to the filtered dataset
-            containing only the data related to the specified rivers.
-
-        Raises
-        ------
-        ValueError
-            - If `indices` is not a dictionary.
-            - If any of the requested river names are not found in the dataset.
-        """
-        if not isinstance(indices, dict):
-            raise ValueError("`indices` must be a dictionary.")
-
-        river_names = list(indices.keys())
-
-        # Ensure the dataset is filtered based on the provided river names
-        ds_filtered = self.ds.where(
-            self.ds[self.var_names["name"]].isin(river_names), drop=True
-        )
-
-        # Check that all requested rivers exist in the dataset
-        filtered_river_names = set(ds_filtered[self.var_names["name"]].values)
-        missing_rivers = set(river_names) - filtered_river_names
-
-        if missing_rivers:
-            raise ValueError(
-                f"The following rivers were not found in the dataset: {missing_rivers}"
-            )
-
-        # Set the filtered dataset as the new `ds`
-        self.ds = ds_filtered
-
-
-@dataclass(kw_only=True)
-class DaiRiverDataset(RiverDataset):
-    """Represents river data from the Dai river dataset."""
-
-    filename: str | Path | list[str | Path] = field(
-        default_factory=lambda: download_river_data("dai_trenberth_may2019.nc")
-    )
-    dim_names: dict[str, str] = field(
-        default_factory=lambda: {
-            "station": "station",
-            "time": "time",
-        }
-    )
-    var_names: dict[str, str] = field(
-        default_factory=lambda: {
-            "latitude": "lat_mou",
-            "longitude": "lon_mou",
-            "flux": "FLOW",
-            "ratio": "ratio_m2s",
-            "name": "riv_name",
-        }
-    )
-    opt_var_names: dict[str, str] = field(
-        default_factory=lambda: {
-            "vol": "vol_stn",
-        }
-    )
-    climatology: bool | None = False
-
-    def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
-        """Adds time information to the dataset based on the climatology flag and
-        dimension names.
-
-        This method processes the dataset to include time information according to the climatology
-        setting. If the dataset represents climatology data and the time dimension is labeled as
-        "month", it assigns dates to the dataset based on a monthly climatology. Additionally, it
-        handles dimension name updates if necessary.
-
-        Parameters
-        ----------
-        ds : xr.Dataset
-            The input dataset to which time information will be added.
-
-        Returns
-        -------
-        xr.Dataset
-            The dataset with time information added, including adjustments for climatology and
-            dimension names.
-        """
-        time_dim = self.dim_names["time"]
-
-        # Extract the 'time' variable as a numpy array
-        time_vals = ds[time_dim].values
-
-        # Handle rounding of the time values
-        year = np.round(time_vals * 1e-2).astype(int)
-        month = np.round((time_vals * 1e-2 - year) * 1e2).astype(int)
-
-        # Convert to datetime (assuming the day is always the 15th)
-        dates = [datetime(year=i, month=m, day=15) for i, m in zip(year, month)]
-
-        ds[time_dim] = dates
-
-        return ds
-
-
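`DaiRiverDataset.add_time_info` unpacks times stored as YYYYMM floats. A standalone check of that arithmetic (hypothetical raw values):

```python
import numpy as np
from datetime import datetime

# Dai & Trenberth-style packed times: YYYYMM encoded in a single float.
time_vals = np.array([190001.0, 190002.0, 190012.0])

year = np.round(time_vals * 1e-2).astype(int)
month = np.round((time_vals * 1e-2 - year) * 1e2).astype(int)

# Mid-month timestamps, matching the day=15 convention used above.
dates = [datetime(year=y, month=m, day=15) for y, m in zip(year, month)]
print(dates[0])  # 1900-01-15 00:00:00
```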
 @dataclass
 class TPXOManager:
     """Manages multiple TPXODataset instances and selects and processes tidal
@@ -2684,208 +2195,6 @@ class TPXOManager:
         object.__setattr__(self.datasets["sal"], "var_names", var_names)
 
 
-# shared functions
-
-
-def _check_dataset(
-    ds: xr.Dataset,
-    dim_names: dict[str, str],
-    var_names: dict[str, str],
-    opt_var_names: dict[str, str] | None = None,
-) -> None:
-    """Check if the dataset contains the specified variables and dimensions.
-
-    Parameters
-    ----------
-    ds : xr.Dataset
-        The xarray Dataset to check.
-    dim_names : Dict[str, str]
-        Dictionary specifying the names of dimensions in the dataset.
-    var_names : Dict[str, str]
-        Dictionary of variable names that are required in the dataset.
-    opt_var_names : Optional[Dict[str, str]], optional
-        Dictionary of optional variable names.
-        These variables are not strictly required, and the function will not raise an error if they are missing.
-        Default is None, meaning no optional variables are considered.
-
-    Raises
-    ------
-    ValueError
-        If the dataset does not contain the specified variables or dimensions.
-    """
-    missing_dims = [dim for dim in dim_names.values() if dim not in ds.dims]
-    if missing_dims:
-        raise ValueError(
-            f"Dataset does not contain all required dimensions. The following dimensions are missing: {missing_dims}"
-        )
-
-    missing_vars = [var for var in var_names.values() if var not in ds.data_vars]
-    if missing_vars:
-        raise ValueError(
-            f"Dataset does not contain all required variables. The following variables are missing: {missing_vars}"
-        )
-
-    if opt_var_names:
-        missing_optional_vars = [
-            var for var in opt_var_names.values() if var not in ds.data_vars
-        ]
-        if missing_optional_vars:
-            logging.warning(
-                f"Optional variables missing (but not critical): {missing_optional_vars}"
-            )
-
-
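`_check_dataset` reduces to two list comprehensions over `dims` and `data_vars`. A compact demonstration of the failure path (toy dataset, names mirroring the Dai conventions):

```python
import numpy as np
import xarray as xr

# Minimal dataset with a "station" dimension but no "FLOW" variable,
# mirroring how _check_dataset reports missing requirements.
ds = xr.Dataset({"lat_mou": ("station", np.zeros(3))})

required_vars = {"latitude": "lat_mou", "flux": "FLOW"}

missing = [v for v in required_vars.values() if v not in ds.data_vars]
if missing:
    # _check_dataset raises ValueError in this situation.
    print(f"missing required variables: {missing}")  # ['FLOW']
```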
-def _select_relevant_times(
-    ds, time_dim, start_time, end_time=None, climatology=False
-) -> xr.Dataset:
-    """Select a subset of the dataset based on the specified time range.
-
-    This method filters the dataset to include all records between `start_time` and `end_time`.
-    Additionally, it ensures that one record at or before `start_time` and one record at or
-    after `end_time` are included, even if they fall outside the strict time range.
-
-    If no `end_time` is specified, the method will select the time range of
-    [start_time, start_time + 24 hours] and return the closest time entry to `start_time` within that range.
-
-    Parameters
-    ----------
-    ds : xr.Dataset
-        The input dataset to be filtered. Must contain a time dimension.
-    time_dim : str
-        Name of the time dimension.
-    start_time : datetime
-        The start time for selecting relevant data.
-    end_time : Optional[datetime], optional
-        The end time for selecting relevant data. If not provided, only data around `start_time` is selected.
-    climatology : bool
-        Indicates whether the dataset is climatological. Defaults to False.
-
-    Returns
-    -------
-    xr.Dataset
-        A dataset filtered to the specified time range, including the closest entries
-        at or before `start_time` and at or after `end_time` if applicable.
-
-    Raises
-    ------
-    ValueError
-        If no matching times are found between `start_time` and `start_time + 24 hours`.
-
-    Warns
-    -----
-    UserWarning
-        If the dataset contains exactly 12 time steps but the climatology flag is not set.
-        This may indicate that the dataset represents climatology data.
-
-    UserWarning
-        If no records at or before `start_time` or no records at or after `end_time` are found.
-
-    UserWarning
-        If the dataset does not contain any time dimension or the time dimension is incorrectly named.
-
-    Notes
-    -----
-    - If the `climatology` flag is set and `end_time` is not provided, the method will
-      interpolate initial conditions from climatology data.
-    - If the dataset uses `cftime` datetime objects, these will be converted to standard
-      `np.datetime64` objects before filtering.
-    """
-    if time_dim in ds.variables:
-        if climatology:
-            if len(ds[time_dim]) != 12:
-                raise ValueError(
-                    f"The dataset contains {len(ds[time_dim])} time steps, but the climatology flag is set to True, which requires exactly 12 time steps."
-                )
-            if not end_time:
-                # Convert from timedelta64[ns] to fractional days
-                ds["time"] = ds["time"] / np.timedelta64(1, "D")
-                # Interpolate from climatology for initial conditions
-                ds = interpolate_from_climatology(ds, time_dim, start_time)
-        else:
-            time_type = get_time_type(ds[time_dim])
-            if time_type == "int":
-                raise ValueError(
-                    "The dataset contains integer time values, which are only supported when the climatology flag is set to True. However, your climatology flag is set to False."
-                )
-            if time_type == "cftime":
-                ds = ds.assign_coords(
-                    {time_dim: convert_cftime_to_datetime(ds[time_dim])}
-                )
-            if end_time:
-                # Identify records before or at start_time
-                before_start = ds[time_dim] <= np.datetime64(start_time)
-                if before_start.any():
-                    closest_before_start = (
-                        ds[time_dim].where(before_start, drop=True).max()
-                    )
-                else:
-                    logging.warning("No records found at or before the start_time.")
-                    closest_before_start = ds[time_dim].min()
-
-                # Identify records after or at end_time
-                after_end = ds[time_dim] >= np.datetime64(end_time)
-                if after_end.any():
-                    closest_after_end = ds[time_dim].where(after_end, drop=True).min()
-                else:
-                    logging.warning("No records found at or after the end_time.")
-                    closest_after_end = ds[time_dim].max()
-
-                # Select records within the time range and add the closest before/after
-                within_range = (ds[time_dim] > np.datetime64(start_time)) & (
-                    ds[time_dim] < np.datetime64(end_time)
-                )
-                selected_times = ds[time_dim].where(
-                    within_range
-                    | (ds[time_dim] == closest_before_start)
-                    | (ds[time_dim] == closest_after_end),
-                    drop=True,
-                )
-                ds = ds.sel({time_dim: selected_times})
-            else:
-                # Look in time range [start_time, start_time + 24h]
-                end_time = start_time + timedelta(days=1)
-                times = (np.datetime64(start_time) <= ds[time_dim]) & (
-                    ds[time_dim] < np.datetime64(end_time)
-                )
-                if np.all(~times):
-                    raise ValueError(
-                        f"The dataset does not contain any time entries between the specified start_time: {start_time} "
-                        f"and {start_time + timedelta(hours=24)}. "
-                        "Please ensure the dataset includes time entries for that range."
-                    )
-
-                ds = ds.where(times, drop=True)
-                if ds.sizes[time_dim] > 1:
-                    # Pick the first time entry within the 24-hour window
-                    ds = ds.isel({time_dim: 0})
-                logging.info(
-                    f"Selected time entry closest to the specified start_time ({start_time}) within the range [{start_time}, {start_time + timedelta(hours=24)}]: {ds[time_dim].values}"
-                )
-    else:
-        logging.warning(
-            "Dataset does not contain any time information. Please check if the time dimension "
-            "is correctly named or if the dataset includes time data."
-        )
-
-    return ds
-
-
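The bracketing behaviour of `_select_relevant_times` — keep everything inside the window plus the nearest record on each side — can be reproduced in a few lines (synthetic monthly data):

```python
import numpy as np
import pandas as pd
import xarray as xr

# Monthly records; request Feb 10 - Apr 20. The selection keeps everything
# strictly inside the window plus the closest bracketing records
# (Feb 1 and May 1), so downstream interpolation can cover the full range.
time = pd.date_range("2000-01-01", "2000-06-01", freq="MS")
ds = xr.Dataset({"flux": ("time", np.arange(time.size))}, coords={"time": time})

start = np.datetime64("2000-02-10")
end = np.datetime64("2000-04-20")

before = ds.time.where(ds.time <= start, drop=True).max()
after = ds.time.where(ds.time >= end, drop=True).min()
keep = ((ds.time > start) & (ds.time < end)) | (ds.time == before) | (ds.time == after)
print(ds.sel(time=ds.time.where(keep, drop=True)).time.values)
# ['2000-02-01' '2000-03-01' '2000-04-01' '2000-05-01']
```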
-def decode_string(byte_array):
-    # Decode each byte and handle errors with 'ignore'
-    decoded_string = "".join(
-        [
-            x.decode("utf-8", errors="ignore")  # Ignore invalid byte sequences
-            for x in byte_array.values
-            if isinstance(x, bytes) and x != b" " and x is not np.nan
-        ]
-    )
-
-    return decoded_string
-
-
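`decode_string` joins per-character bytes from fixed-width NetCDF char arrays. A quick standalone equivalent (made-up byte array):

```python
import numpy as np
import xarray as xr

# River names are often stored as fixed-width char arrays in NetCDF.
# A name then arrives as one byte per element, padded with blanks:
byte_array = xr.DataArray(np.array([b"A", b"m", b"a", b"z", b"o", b"n", b" ", b" "]))

name = "".join(
    x.decode("utf-8", errors="ignore")
    for x in byte_array.values
    if isinstance(x, bytes) and x != b" "
)
print(name)  # "Amazon"
```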
 def modified_julian_days(year, month, day, hour=0):
     """Calculate the Modified Julian Day (MJD) for a given date and time.
 
@@ -2943,77 +2252,273 @@ def modified_julian_days(year, month, day, hour=0):
     return mjd
 
 
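`modified_julian_days` (body collapsed in this hunk) follows the usual convention MJD = JD − 2400000.5. A sanity check against the MJD epoch, independent of the package's implementation:

```python
from datetime import datetime

# The MJD epoch is 1858-11-17 00:00 UTC, so 2000-01-01 should be MJD 51544.
epoch = datetime(1858, 11, 17)
date = datetime(2000, 1, 1)
delta = date - epoch
mjd = delta.days + delta.seconds / 86400
print(mjd)  # 51544.0
```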
-def get_indices_of_nearest_grid_cell_for_rivers(
-    dist: xr.DataArray, data: RiverDataset
-) -> dict[str, list[tuple[int, int]]]:
-    """Get the indices of the nearest grid cell for each river based on distance.
+def _concatenate_longitudes(
+    ds: xr.Dataset,
+    dim_names: Mapping[str, str],
+    end: TConcatEndTypes,
+    use_dask: bool = False,
+) -> xr.Dataset:
+    """
+    Concatenate the longitude dimension to handle global grids that cross
+    the 0/360-degree or -180/180-degree boundary.
+
+    Extends the longitude dimension on the lower side, the upper side, or both
+    by +/- 360 degrees and duplicates the corresponding variables along
+    that dimension.
 
     Parameters
     ----------
-    dist : xr.DataArray
-        A 2D or 3D array representing distances from each river to coastal grid cells,
-        with dimensions including "eta_rho" and "xi_rho".
-    data : RiverDataset
-        An instance of RiverDataset containing river names and dimension metadata.
+    ds : xr.Dataset
+        Input xarray Dataset to be concatenated.
+    dim_names : Mapping[str, str]
+        Dictionary or mapping containing dimension names. Must include "longitude".
+    end : str
+        Specifies which side(s) to extend:
+        - "lower": extend by subtracting 360 degrees.
+        - "upper": extend by adding 360 degrees.
+        - "both": extend on both sides.
+    use_dask : bool, default False
+        If True, chunk the concatenated longitude dimension using Dask.
 
     Returns
     -------
-    dict[str, list[tuple[int, int]]]
-        Dictionary mapping each river name to a list containing the (eta_rho, xi_rho) index
-        of the closest coastal grid cell.
+    xr.Dataset
+        Dataset with the longitude dimension extended and data variables duplicated.
+
+    Notes
+    -----
+    Only data variables containing the longitude dimension are concatenated;
+    others are left unchanged.
     """
-    # Find indices of the nearest coastal grid cell for each river
-    indices = dist.argmin(dim=["eta_rho", "xi_rho"])
-
-    eta_rho_values = indices["eta_rho"].values
-    xi_rho_values = indices["xi_rho"].values
-
-    # Get the corresponding station indices and river names
-    stations = indices["eta_rho"][data.dim_names["station"]].values
-    names = (
-        data.ds[data.var_names["name"]]
-        .sel({data.dim_names["station"]: stations})
-        .values
-    )
+    ds_concat = xr.Dataset()
+
+    lon_name = dim_names["longitude"]
+    lon = ds[lon_name]
+
+    match end:
+        case "lower":
+            lon_concat = xr.concat([lon - 360, lon], dim=lon_name)
+            n_copies = 2
+        case "upper":
+            lon_concat = xr.concat([lon, lon + 360], dim=lon_name)
+            n_copies = 2
+        case "both":
+            lon_concat = xr.concat([lon - 360, lon, lon + 360], dim=lon_name)
+            n_copies = 3
+        case _:
+            raise ValueError(f"Invalid `end` value: {end}")
+
+    for var in ds.variables:
+        if lon_name in ds[var].dims:
+            field = ds[var]
+            field_concat = xr.concat([field] * n_copies, dim=lon_name)
+
+            if use_dask:
+                field_concat = field_concat.chunk({lon_name: -1})
+
+            ds_concat[var] = field_concat
+        else:
+            ds_concat[var] = ds[var]
 
-    # Build dictionary of river name to grid index
-    river_indices = {
-        str(names[i]): [(int(eta_rho_values[i]), int(xi_rho_values[i]))]
-        for i in range(len(stations))
-    }
+    ds_concat = ds_concat.assign_coords({lon_name: lon_concat.values})
 
-    return river_indices
+    return ds_concat
 
 
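The new `_concatenate_longitudes` exists so a subsequent `slice` over longitude never has to wrap around the seam. The same idea in miniature (toy one-variable dataset, upper extension only):

```python
import numpy as np
import xarray as xr

# A toy "global" grid on [0, 360): extending the upper end lets a target
# domain that straddles the 0/360 seam be sliced contiguously.
lon = np.arange(0.0, 360.0, 1.0)
ds = xr.Dataset(
    {"sst": ("longitude", np.sin(np.radians(lon)))},
    coords={"longitude": lon},
)

extended = xr.concat(
    [ds, ds.assign_coords(longitude=ds.longitude + 360)], dim="longitude"
)
print(extended.longitude.values[[0, -1]])  # [  0. 719.]
# A slice like .sel(longitude=slice(350, 370)) now works without wrapping logic.
```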
-def _deduplicate_river_names(
-    ds: xr.Dataset, name_var: str, station_dim: str
+def choose_subdomain(
+    ds: xr.Dataset,
+    dim_names: Mapping[str, str],
+    resolution: float,
+    is_global: bool,
+    target_coords: Mapping[str, Any],
+    buffer_points: int = 20,
+    use_dask: bool = False,
 ) -> xr.Dataset:
-    """Ensure river names are unique by appending _1, _2 to duplicates, excluding non-
-    duplicates.
     """
-    original = ds[name_var]
+    Select a subdomain from an xarray Dataset based on target coordinates,
+    with optional buffer points and global longitude handling.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The full xarray Dataset to subset.
+    dim_names : Mapping[str, str]
+        Dictionary mapping logical dimension names to dataset dimension names.
+        Example: {"latitude": "latitude", "longitude": "longitude"}.
+    resolution : float
+        Spatial resolution of the dataset, used to compute the buffer margin.
+    is_global : bool
+        Whether the dataset covers global longitude (affects concatenation logic).
+    target_coords : Mapping[str, Any]
+        Dictionary containing target latitude and longitude coordinates.
+        Expected keys: "lat", "lon", and "straddle" (boolean for crossing 180°).
+    buffer_points : int, default 20
+        Number of grid points to extend beyond the target coordinates.
+    use_dask : bool, optional
+        Indicates whether to use dask for chunking. If True, data is loaded with dask;
+        if False, data is processed eagerly. Defaults to False.
 
-    # Force cast to plain Python strings
-    names = [str(name) for name in original.values]
+    Returns
+    -------
+    xr.Dataset
+        Subset of the input Dataset covering the requested coordinates plus buffer.
 
-    # Count all names
-    name_counts = Counter(names)
-    seen = defaultdict(int)
+    Raises
+    ------
+    ValueError
+        If the selected latitude or longitude range does not intersect the dataset.
+    """
+    lat_min = target_coords["lat"].min().values
+    lat_max = target_coords["lat"].max().values
+    lon_min = target_coords["lon"].min().values
+    lon_max = target_coords["lon"].max().values
 
-    unique_names = []
-    for name in names:
-        if name_counts[name] > 1:
-            seen[name] += 1
-            unique_names.append(f"{name}_{seen[name]}")
-        else:
-            unique_names.append(name)
+    margin = resolution * buffer_points
+
+    # Select the subdomain in the latitude direction first, so that fewer
+    # latitudes have to be concatenated below if concatenation is necessary
+    subdomain = ds.sel(
+        **{
+            dim_names["latitude"]: slice(lat_min - margin, lat_max + margin),
+        }
+    )
+    lon = subdomain[dim_names["longitude"]]
+
+    if is_global:
+        concats = []
+        # Concatenate only if necessary
+        if lon_max + margin > lon.max():
+            # See if shifting by +360 degrees helps
+            if (lon_min - margin > (lon + 360).min()) and (
+                lon_max + margin < (lon + 360).max()
+            ):
+                subdomain[dim_names["longitude"]] = lon + 360
+                lon = subdomain[dim_names["longitude"]]
+            else:
+                concats.append("upper")
+        if lon_min - margin < lon.min():
+            # See if shifting by -360 degrees helps
+            if (lon_min - margin > (lon - 360).min()) and (
+                lon_max + margin < (lon - 360).max()
+            ):
+                subdomain[dim_names["longitude"]] = lon - 360
+                lon = subdomain[dim_names["longitude"]]
+            else:
+                concats.append("lower")
+
+        if concats:
+            end = "both" if len(concats) == 2 else concats[0]
+            end = cast(TConcatEndTypes, end)
+            subdomain = _concatenate_longitudes(
+                subdomain, dim_names=dim_names, end=end, use_dask=use_dask
+            )
+            lon = subdomain[dim_names["longitude"]]
+
+    else:
+        # Adjust the longitude range if needed to match the expected range
+        if not target_coords["straddle"]:
+            if lon.min() < -180:
+                if lon_max + margin > 0:
+                    lon_min -= 360
+                    lon_max -= 360
+            elif lon.min() < 0:
+                if lon_max + margin > 180:
+                    lon_min -= 360
+                    lon_max -= 360
+
+        if target_coords["straddle"]:
+            if lon.max() > 360:
+                if lon_min - margin < 180:
+                    lon_min += 360
+                    lon_max += 360
+            elif lon.max() > 180:
+                if lon_min - margin < 0:
+                    lon_min += 360
+                    lon_max += 360
+
+    # Select the subdomain in the longitude direction
+    subdomain = subdomain.sel(
+        **{
+            dim_names["longitude"]: slice(lon_min - margin, lon_max + margin),
+        }
+    )
+    # Check if the selected subdomain has zero size in latitude or longitude
+    if (
+        dim_names["latitude"] not in subdomain
+        or subdomain[dim_names["latitude"]].size == 0
+    ):
+        raise ValueError("Selected latitude range does not intersect with dataset.")
+
+    if (
+        dim_names["longitude"] not in subdomain
+        or subdomain[dim_names["longitude"]].size == 0
+    ):
+        raise ValueError("Selected longitude range does not intersect with dataset.")
+
+    # Adjust longitudes to the expected range if needed
+    lon = subdomain[dim_names["longitude"]]
+    if target_coords["straddle"]:
+        subdomain[dim_names["longitude"]] = xr.where(lon > 180, lon - 360, lon)
+    else:
+        subdomain[dim_names["longitude"]] = xr.where(lon < 0, lon + 360, lon)
 
-    # Replace with updated names while preserving dtype, dims, attrs
-    updated_array = xr.DataArray(
-        data=np.array(unique_names, dtype=f"<U{max(len(n) for n in unique_names)}"),
-        dims=original.dims,
-        attrs=original.attrs,
+    return subdomain
+
+
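`choose_subdomain` ultimately reduces to two buffered `.sel(slice(...))` calls once the longitude convention is settled. A simplified sketch with hypothetical target coordinates (no straddle handling):

```python
import numpy as np
import xarray as xr

# Carve out a buffered box around a small target domain from a
# 1-degree global grid (names mirror choose_subdomain's inputs).
lat = np.arange(-90.0, 90.0, 1.0)
lon = np.arange(0.0, 360.0, 1.0)
ds = xr.Dataset(coords={"latitude": lat, "longitude": lon})

target_lat = xr.DataArray(np.linspace(40, 45, 10))
target_lon = xr.DataArray(np.linspace(200, 210, 10))

margin = 1.0 * 20  # resolution * buffer_points
subset = ds.sel(
    latitude=slice(float(target_lat.min()) - margin, float(target_lat.max()) + margin),
    longitude=slice(float(target_lon.min()) - margin, float(target_lon.max()) + margin),
)
print(subset.sizes)  # {'latitude': 46, 'longitude': 51}
```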
+def get_glorys_bounds(
+    grid: Grid,
+    glorys_grid_path: Path | str | None = None,
+) -> dict[str, float]:
+    """
+    Compute the latitude/longitude bounds of a GLORYS spatial subset
+    that fully covers the given ROMS grid (with margin for regridding).
+
+    Parameters
+    ----------
+    grid : Grid
+        The grid object.
+    glorys_grid_path : str, optional
+        Path to the GLORYS global grid file. If None, defaults to
+        "<repo_root>/data/grids/GLORYS_global_grid.nc".
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary containing the bounding box values:
+
+        - `"minimum_latitude"` : float
+        - `"maximum_latitude"` : float
+        - `"minimum_longitude"` : float
+        - `"maximum_longitude"` : float
+
+    Notes
+    -----
+    - The resolution is estimated as the mean of latitude and longitude spacing.
+    """
+    if glorys_grid_path is None:
+        glorys_grid_path = GLORYS_GLOBAL_GRID_PATH
+
+    ds = xr.open_dataset(glorys_grid_path)
+
+    # Estimate grid resolution (mean spacing in degrees)
+    res_lat = ds.latitude.diff("latitude").mean()
+    res_lon = ds.longitude.diff("longitude").mean()
+    resolution = (res_lat + res_lon) / 2
+
+    # Extract target grid coordinates
+    target_coords = get_target_coords(grid)
+
+    # Select subdomain with margin
+    ds_subset = choose_subdomain(
+        ds=ds,
+        dim_names={"latitude": "latitude", "longitude": "longitude"},
+        resolution=resolution,
+        is_global=True,
+        target_coords=target_coords,
+        buffer_points=DEFAULT_NR_BUFFER_POINTS + 1,
     )
-    ds[name_var] = updated_array
 
-    return ds
+    # Compute bounds
+    return {
+        "minimum_latitude": float(ds_subset.latitude.min()),
+        "maximum_latitude": float(ds_subset.latitude.max()),
+        "minimum_longitude": float(ds_subset.longitude.min()),
+        "maximum_longitude": float(ds_subset.longitude.max()),
+    }
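
A plausible call site for the new `get_glorys_bounds`, e.g. to build a Copernicus Marine subsetting request. The import path and the `Grid` parameters are assumptions for illustration, not taken from this diff:

```python
from roms_tools import Grid
from roms_tools.setup.utils import get_glorys_bounds  # assumed import path

# Hypothetical 500 km x 500 km mid-Atlantic domain.
grid = Grid(
    nx=50, ny=50, size_x=500, size_y=500,
    center_lon=-40, center_lat=35, rot=0,
)
bounds = get_glorys_bounds(grid)
print(bounds["minimum_latitude"], bounds["maximum_latitude"])
```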