roms-tools 1.6.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff compares two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- ci/environment.yml +1 -1
- roms_tools/__init__.py +1 -0
- roms_tools/_version.py +1 -1
- roms_tools/setup/boundary_forcing.py +266 -256
- roms_tools/setup/datasets.py +986 -231
- roms_tools/setup/download.py +41 -15
- roms_tools/setup/grid.py +561 -512
- roms_tools/setup/initial_conditions.py +162 -106
- roms_tools/setup/mask.py +69 -0
- roms_tools/setup/plot.py +81 -23
- roms_tools/setup/regrid.py +4 -2
- roms_tools/setup/river_forcing.py +589 -0
- roms_tools/setup/surface_forcing.py +21 -130
- roms_tools/setup/tides.py +15 -79
- roms_tools/setup/topography.py +92 -128
- roms_tools/setup/utils.py +307 -25
- roms_tools/setup/vertical_coordinate.py +5 -16
- roms_tools/tests/test_setup/test_boundary_forcing.py +10 -7
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/.zmetadata +157 -130
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_ALT_CO2_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/ALK_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_ALT_CO2_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DIC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOCr_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DON_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DONr_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOP_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/DOPr_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Fe_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/Lig_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NH4_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/NO3_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/O2_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/PO4_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/SiO3_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/abs_time/.zattrs +1 -0
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/bry_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatChl_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatFe_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatP_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diatSi_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazChl_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazFe_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/diazP_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/month/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/month/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/month/0 +0 -0
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spCaCO3_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spChl_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spFe_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/spP_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_east/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_north/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_south/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_boundary_forcing_from_climatology.zarr/zooC_west/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/.zmetadata +39 -12
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/abs_time/.zattrs +1 -0
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/dust/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/dust_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/iron/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/iron_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/month/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/month/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/month/0 +0 -0
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/nhy/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/nhy_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/nox/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/nox_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/pco2_air/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/pco2_air_alt/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/bgc_surface_forcing_from_climatology.zarr/pco2_time/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid.zarr/.zattrs +0 -1
- roms_tools/tests/test_setup/test_data/grid.zarr/.zmetadata +56 -201
- roms_tools/tests/test_setup/test_data/grid.zarr/Cs_r/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid.zarr/Cs_w/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid.zarr/{interface_depth_rho → sigma_r}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/grid.zarr/sigma_r/.zattrs +7 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/sigma_r/0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/{interface_depth_u → sigma_w}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/grid.zarr/sigma_w/.zattrs +7 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/sigma_w/0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/.zattrs +1 -2
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/.zmetadata +58 -203
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/Cs_r/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/Cs_w/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/h/.zattrs +1 -1
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/h/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/mask_coarse/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/mask_rho/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/mask_u/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/mask_v/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid.zarr/interface_depth_v → grid_that_straddles_dateline.zarr/sigma_r}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/sigma_r/.zattrs +7 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/sigma_r/0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid.zarr/layer_depth_rho → grid_that_straddles_dateline.zarr/sigma_w}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/sigma_w/.zattrs +7 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/sigma_w/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/.zattrs +3 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/.zgroup +3 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/.zmetadata +214 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/abs_time/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/abs_time/.zattrs +8 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/abs_time/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/month/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/month/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/month/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_name/.zarray +24 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_name/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_name/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_time/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_time/.zattrs +8 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_time/0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid.zarr/layer_depth_v → river_forcing.zarr/river_tracer}/.zarray +4 -4
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_tracer/.zattrs +10 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_tracer/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_volume/.zarray +22 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_volume/.zattrs +9 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/river_volume/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid.zarr/layer_depth_u → river_forcing.zarr/tracer_name}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/tracer_name/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/river_forcing.zarr/tracer_name/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/.zattrs +1 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/.zgroup +3 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/.zmetadata +185 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/abs_time/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/abs_time/.zattrs +8 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/abs_time/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_name/.zarray +24 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_name/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_name/0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_time/.zarray +20 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_time/.zattrs +7 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_time/0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid_that_straddles_dateline.zarr/interface_depth_v → river_forcing_no_climatology.zarr/river_tracer}/.zarray +4 -4
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_tracer/.zattrs +10 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_tracer/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_volume/.zarray +22 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_volume/.zattrs +9 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/river_volume/0.0 +0 -0
- roms_tools/tests/test_setup/test_data/{grid_that_straddles_dateline.zarr/interface_depth_u → river_forcing_no_climatology.zarr/tracer_name}/.zarray +2 -6
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/tracer_name/.zattrs +6 -0
- roms_tools/tests/test_setup/test_data/river_forcing_no_climatology.zarr/tracer_name/0 +0 -0
- roms_tools/tests/test_setup/test_grid.py +110 -12
- roms_tools/tests/test_setup/test_initial_conditions.py +2 -3
- roms_tools/tests/test_setup/test_river_forcing.py +367 -0
- roms_tools/tests/test_setup/test_surface_forcing.py +2 -24
- roms_tools/tests/test_setup/test_tides.py +2 -3
- roms_tools/tests/test_setup/test_topography.py +106 -1
- roms_tools/tests/test_setup/test_validation.py +4 -0
- roms_tools/utils.py +12 -10
- {roms_tools-1.6.2.dist-info → roms_tools-2.0.0.dist-info}/LICENSE +1 -1
- {roms_tools-1.6.2.dist-info → roms_tools-2.0.0.dist-info}/METADATA +6 -5
- {roms_tools-1.6.2.dist-info → roms_tools-2.0.0.dist-info}/RECORD +254 -225
- {roms_tools-1.6.2.dist-info → roms_tools-2.0.0.dist-info}/WHEEL +1 -1
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_rho/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_rho/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_u/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_u/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_v/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/interface_depth_v/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_rho/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_rho/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_u/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_u/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_v/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid.zarr/layer_depth_v/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_rho/.zarray +0 -24
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_rho/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_rho/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_u/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_u/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_v/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/interface_depth_v/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_rho/.zarray +0 -24
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_rho/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_rho/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_u/.zarray +0 -24
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_u/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_u/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_v/.zarray +0 -24
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_v/.zattrs +0 -9
- roms_tools/tests/test_setup/test_data/grid_that_straddles_dateline.zarr/layer_depth_v/0.0.0 +0 -0
- roms_tools/tests/test_setup/test_vertical_coordinate.py +0 -91
- {roms_tools-1.6.2.dist-info → roms_tools-2.0.0.dist-info}/top_level.txt +0 -0
roms_tools/setup/datasets.py
CHANGED
@@ -1,3 +1,4 @@
+import time
 import re
 import xarray as xr
 from dataclasses import dataclass, field
@@ -13,10 +14,17 @@ from roms_tools.setup.utils import (
     get_time_type,
     convert_cftime_to_datetime,
     one_dim_fill,
+    gc_dist,
+)
+from roms_tools.setup.download import (
+    download_correction_data,
+    download_topo,
+    download_river_data,
 )
-from roms_tools.setup.download import download_correction_data
 from roms_tools.setup.fill import LateralFill
 
+# lat-lon datasets
+
 
 @dataclass(frozen=True, kw_only=True)
 class Dataset:
@@ -32,10 +40,10 @@ class Dataset:
     end_time : Optional[datetime], optional
         The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
         or no filtering is applied if start_time is not provided.
-    var_names: Dict[str, str]
-        Dictionary of variable names that are required in the dataset.
     dim_names: Dict[str, str], optional
         Dictionary specifying the names of dimensions in the dataset.
+    var_names: Dict[str, str]
+        Dictionary of variable names that are required in the dataset.
     climatology : bool
         Indicates whether the dataset is climatological. Defaults to False.
     use_dask: bool
@@ -62,7 +70,6 @@ class Dataset:
     filename: Union[str, Path, List[Union[str, Path]]]
     start_time: Optional[datetime] = None
     end_time: Optional[datetime] = None
-    var_names: Dict[str, str]
     dim_names: Dict[str, str] = field(
         default_factory=lambda: {
             "longitude": "longitude",
@@ -70,8 +77,9 @@ class Dataset:
             "time": "time",
         }
     )
+    var_names: Dict[str, str]
     climatology: Optional[bool] = False
-    use_dask: Optional[bool] =
+    use_dask: Optional[bool] = False
     apply_post_processing: Optional[bool] = True
 
     is_global: bool = field(init=False, repr=False)
@@ -114,6 +122,8 @@ class Dataset:
 
         # Make sure that latitude is ascending
         ds = self.ensure_dimension_is_ascending(ds, dim="latitude")
+        # Make sure there are no 360 degree jumps in longitude
+        ds = self.ensure_dimension_is_ascending(ds, dim="longitude")
 
         if "depth" in self.dim_names:
             # Make sure that depth is ascending
@@ -123,11 +133,6 @@ class Dataset:
 
         # Check whether the data covers the entire globe
         object.__setattr__(self, "is_global", self.check_if_global(ds))
-
-        # If dataset is global concatenate three copies of field along longitude dimension
-        if self.is_global:
-            ds = self.concatenate_longitudes(ds)
-
         object.__setattr__(self, "ds", ds)
 
         if self.apply_post_processing:
@@ -149,101 +154,7 @@ class Dataset:
             If a list of files is provided but self.dim_names["time"] is not available or use_dask=False.
         """
 
-
-        wildcard_regex = re.compile(r"[\*\?\[\]]")
-
-        # Convert Path objects to strings
-        if isinstance(self.filename, (str, Path)):
-            filename_str = str(self.filename)
-        elif isinstance(self.filename, list):
-            filename_str = [str(f) for f in self.filename]
-        else:
-            raise ValueError(
-                "filename must be a string, Path, or a list of strings/Paths."
-            )
-
-        # Handle the case when filename is a string
-        contains_wildcard = False
-        if isinstance(filename_str, str):
-            contains_wildcard = bool(wildcard_regex.search(filename_str))
-            if contains_wildcard:
-                matching_files = glob.glob(filename_str)
-                if not matching_files:
-                    raise FileNotFoundError(
-                        f"No files found matching the pattern '{filename_str}'."
-                    )
-            else:
-                matching_files = [filename_str]
-
-        # Handle the case when filename is a list
-        elif isinstance(filename_str, list):
-            contains_wildcard = any(wildcard_regex.search(f) for f in filename_str)
-            if contains_wildcard:
-                matching_files = []
-                for f in filename_str:
-                    files = glob.glob(f)
-                    if not files:
-                        raise FileNotFoundError(
-                            f"No files found matching the pattern '{f}'."
-                        )
-                    matching_files.extend(files)
-            else:
-                matching_files = filename_str
-
-        # Check if time dimension is available when multiple files are provided
-        if isinstance(filename_str, list) and "time" not in self.dim_names:
-            raise ValueError(
-                "A list of files is provided, but time dimension is not available. "
-                "A time dimension must be available to concatenate the files."
-            )
-
-        # Determine the kwargs for combining datasets
-        if contains_wildcard or len(matching_files) == 1:
-            # If there is a wildcard or just one file, use by_coords
-            kwargs = {"combine": "by_coords"}
-        else:
-            # Otherwise, use nested combine based on time
-            kwargs = {"combine": "nested", "concat_dim": self.dim_names["time"]}
-
-        # Base kwargs used for dataset combination
-        combine_kwargs = {
-            "coords": "minimal",
-            "compat": "override",
-            "combine_attrs": "override",
-        }
-
-        if self.use_dask:
-
-            chunks = {
-                self.dim_names["latitude"]: -1,
-                self.dim_names["longitude"]: -1,
-            }
-            if "depth" in self.dim_names:
-                chunks[self.dim_names["depth"]] = -1
-            if "time" in self.dim_names:
-                chunks[self.dim_names["time"]] = 1
-
-            ds = xr.open_mfdataset(
-                matching_files,
-                chunks=chunks,
-                **combine_kwargs,
-                **kwargs,
-            )
-        else:
-            ds_list = []
-            for file in matching_files:
-                ds = xr.open_dataset(file, chunks=None)
-                ds_list.append(ds)
-
-            if kwargs["combine"] == "by_coords":
-                ds = xr.combine_by_coords(ds_list, **combine_kwargs)
-            elif kwargs["combine"] == "nested":
-                ds = xr.combine_nested(
-                    ds_list, concat_dim=kwargs["concat_dim"], **combine_kwargs
-                )
-
-        if "time" in self.dim_names and self.dim_names["time"] not in ds.dims:
-            ds = ds.expand_dims(self.dim_names["time"])
+        ds = _load_data(self.filename, self.dim_names, self.use_dask)
 
         return ds
 
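The ~95 removed lines move into a module-level _load_data helper that RiverDataset reuses later in this diff. The helper's body does not appear in any hunk here; the sketch below is a condensed, hypothetical reconstruction, inferred from the removed code and from the two call sites (which pass use_dask and decode_times keywords). It is not the package's actual implementation.

import glob
from pathlib import Path

import xarray as xr


def _load_data(filename, dim_names, use_dask=False, decode_times=True):
    # Hypothetical reconstruction -- not the actual roms-tools helper.
    # Normalize to a list of strings and expand any wildcards.
    if isinstance(filename, (str, Path)):
        filename = [filename]
    files = []
    for f in filename:
        matches = glob.glob(str(f))
        if not matches:
            raise FileNotFoundError(f"No files found matching the pattern '{f}'.")
        files.extend(sorted(matches))

    if use_dask:
        # Lazy load with dask: one chunk per time step, unchunked in space.
        chunks = {dim_names["time"]: 1} if "time" in dim_names else None
        return xr.open_mfdataset(
            files,
            chunks=chunks,
            decode_times=decode_times,
            coords="minimal",
            compat="override",
            combine_attrs="override",
        )

    # Eager load without dask.
    ds_list = [xr.open_dataset(f, decode_times=decode_times) for f in files]
    if len(ds_list) == 1:
        return ds_list[0]
    return xr.combine_by_coords(ds_list, compat="override", combine_attrs="override")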
@@ -278,19 +189,8 @@ class Dataset:
         ValueError
             If the dataset does not contain the specified variables or dimensions.
         """
-        missing_vars = [
-            var for var in self.var_names.values() if var not in ds.data_vars
-        ]
-        if missing_vars:
-            raise ValueError(
-                f"Dataset does not contain all required variables. The following variables are missing: {missing_vars}"
-            )
 
-
-        if missing_dims:
-            raise ValueError(
-                f"Dataset does not contain all required dimensions. The following dimensions are missing: {missing_vars}"
-            )
+        _check_dataset(ds, self.dim_names, self.var_names)
 
     def select_relevant_fields(self, ds) -> xr.Dataset:
         """Selects and returns a subset of the dataset containing only the variables
@@ -379,86 +279,10 @@ class Dataset:
         """
 
         time_dim = self.dim_names["time"]
-
-
-
-
-                        f"The dataset contains {len(ds[time_dim])} time steps, but the climatology flag is set to True, which requires exactly 12 time steps."
-                    )
-                if not self.end_time:
-                    # Interpolate from climatology for initial conditions
-                    ds = interpolate_from_climatology(
-                        ds, self.dim_names["time"], self.start_time
-                    )
-            else:
-                time_type = get_time_type(ds[time_dim])
-                if time_type == "int":
-                    raise ValueError(
-                        "The dataset contains integer time values, which are only supported when the climatology flag is set to True. However, your climatology flag is set to False."
-                    )
-                if time_type == "cftime":
-                    ds = ds.assign_coords(
-                        {time_dim: convert_cftime_to_datetime(ds[time_dim])}
-                    )
-                if self.end_time:
-                    end_time = self.end_time
-
-                    # Identify records before or at start_time
-                    before_start = ds[time_dim] <= np.datetime64(self.start_time)
-                    if before_start.any():
-                        closest_before_start = (
-                            ds[time_dim].where(before_start, drop=True).max()
-                        )
-                    else:
-                        logging.warning("No records found at or before the start_time.")
-                        closest_before_start = ds[time_dim].min()
-
-                    # Identify records after or at end_time
-                    after_end = ds[time_dim] >= np.datetime64(end_time)
-                    if after_end.any():
-                        closest_after_end = (
-                            ds[time_dim].where(after_end, drop=True).min()
-                        )
-                    else:
-                        logging.warning("No records found at or after the end_time.")
-                        closest_after_end = ds[time_dim].max()
-
-                    # Select records within the time range and add the closest before/after
-                    within_range = (ds[time_dim] > np.datetime64(self.start_time)) & (
-                        ds[time_dim] < np.datetime64(end_time)
-                    )
-                    selected_times = ds[time_dim].where(
-                        within_range
-                        | (ds[time_dim] == closest_before_start)
-                        | (ds[time_dim] == closest_after_end),
-                        drop=True,
-                    )
-                    ds = ds.sel({time_dim: selected_times})
-                else:
-                    # Look in time range [self.start_time, self.start_time + 24h]
-                    end_time = self.start_time + timedelta(days=1)
-                    times = (np.datetime64(self.start_time) <= ds[time_dim]) & (
-                        ds[time_dim] < np.datetime64(end_time)
-                    )
-                    if np.all(~times):
-                        raise ValueError(
-                            f"The dataset does not contain any time entries between the specified start_time: {self.start_time} "
-                            f"and {self.start_time + timedelta(hours=24)}. "
-                            "Please ensure the dataset includes time entries for that range."
-                        )
-
-                    ds = ds.where(times, drop=True)
-                    if ds.sizes[time_dim] > 1:
-                        # Pick the time closest to self.start_time
-                        ds = ds.isel({time_dim: 0})
-                    logging.info(
-                        f"Selected time entry closest to the specified start_time ({self.start_time}) within the range [{self.start_time}, {self.start_time + timedelta(hours=24)}]: {ds[time_dim].values}"
-                    )
-        else:
-            logging.warning(
-                "Dataset does not contain any time information. Please check if the time dimension "
-                "is correctly named or if the dataset includes time data."
-            )
+
+        ds = _select_relevant_times(
+            ds, time_dim, self.start_time, self.end_time, self.climatology
+        )
 
         return ds
 
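The time-selection logic likewise moves into a shared _select_relevant_times helper (RiverDataset.select_relevant_times calls it with climatology disabled later in this diff). A self-contained toy sketch of the boundary semantics the removed code implemented: records strictly inside the window are kept, plus the closest record at or before start_time and the closest at or after end_time.

import numpy as np
import xarray as xr

# Toy illustration of the window semantics mirrored by the refactor.
times = np.array(
    ["2012-01-01", "2012-01-15", "2012-02-01", "2012-02-15"],
    dtype="datetime64[ns]",
)
ds = xr.Dataset(coords={"time": times})
start, end = np.datetime64("2012-01-10"), np.datetime64("2012-02-05")

inside = (ds.time > start) & (ds.time < end)
before = ds.time.where(ds.time <= start, drop=True).max()  # closest at/before start
after = ds.time.where(ds.time >= end, drop=True).min()     # closest at/after end
keep = ds.time.where(inside | (ds.time == before) | (ds.time == after), drop=True)
print(keep.values)  # all four survive: the two inside plus the boundary neighbors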
@@ -467,7 +291,11 @@ class Dataset:
     ) -> xr.Dataset:
         """Ensure that the specified dimension in the dataset is in ascending order.
 
-
+        This function checks the order of values along the specified dimension. If they
+        are in descending order, it reverses the dimension to make it ascending. For
+        the "longitude" dimension, if it has a discontinuity (e.g., [0, 180][-180, 0]),
+        the function adjusts values to eliminate the 360-degree jump, transforming
+        the range into a continuous [0, 360) span.
 
         Parameters
         ----------
@@ -481,14 +309,23 @@ class Dataset:
         -------
         xr.Dataset
             A new `xarray.Dataset` with the specified dimension in ascending order.
-            If the dimension was already in ascending order, the original dataset is returned unchanged.
-            If the dimension was in descending order, the dataset is returned with the dimension reversed.
+            - If the dimension was already in ascending order, the original dataset is returned unchanged.
+            - If the dimension was in descending order, the dataset is returned with the dimension reversed.
+            - If the dimension is "longitude" with a discontinuity (e.g., [0, 180][-180, 0]), the values are adjusted to eliminate the 360-degree jump.
         """
-        #
+        # Check if the dimension is in descending order and reverse if needed
         diff = np.diff(ds[self.dim_names[dim]])
         if np.all(diff < 0):
             ds = ds.isel(**{self.dim_names[dim]: slice(None, None, -1)})
 
+        # Check for a discontinuity in longitude and adjust values if present
+        elif np.any(diff < 0) and dim == "longitude":
+            ds[self.dim_names[dim]] = xr.where(
+                ds[self.dim_names[dim]] < 0,
+                ds[self.dim_names[dim]] + 360,
+                ds[self.dim_names[dim]],
+            )
+
         return ds
 
     def infer_horizontal_resolution(self, ds: xr.Dataset):
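The new elif branch targets source grids whose longitude axis straddles the dateline, e.g. running [0, 180] and then jumping to [-180, 0]. A minimal toy sketch of the same xr.where adjustment:

import numpy as np
import xarray as xr

# A longitude axis that jumps from +180 back to -180 becomes a
# continuous [0, 360) axis after the adjustment.
lon = xr.DataArray(np.array([170.0, 175.0, -180.0, -175.0]), dims="lon")
lon_fixed = xr.where(lon < 0, lon + 360, lon)
print(lon_fixed.values)  # [170. 175. 180. 185.]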
@@ -542,43 +379,68 @@ class Dataset:
 
         return is_global
 
-    def concatenate_longitudes(self, ds):
-        """
-        Concatenates the field three times: with longitudes shifted by -360, original longitudes, and shifted by +360.
+    def concatenate_longitudes(self, ds, end="upper", verbose=False):
+        """Concatenates fields in dataset twice along the longitude dimension.
 
         Parameters
         ----------
-
-        The
+        ds: xr.Dataset
+            The dataset to be concatenated. The longitude dimension must be present in this dataset.
+        end : str, optional
+            Specifies which end to shift the longitudes.
+            Options are:
+                - "lower": shifts longitudes by -360 degrees and concatenates to the lower end.
+                - "upper": shifts longitudes by +360 degrees and concatenates to the upper end.
+                - "both": shifts longitudes by -360 degrees and 360 degrees and concatenates to both ends.
+            Default is "upper".
+        verbose : bool, optional
+            If True, print message if dataset is concatenated along longitude dimension.
+            Defaults to False.
 
         Returns
         -------
-        xr.
-        The concatenated
+        ds_concatenated : xr.Dataset
+            The concatenated dataset.
+        """
 
-
-
-        Concatenating three times may be overkill in most situations, but it is safe. Alternatively, we could refactor
-        to figure out whether concatenating on the lower end, upper end, or at all is needed.
+        if verbose:
+            start_time = time.time()
 
-        """
         ds_concatenated = xr.Dataset()
 
         lon = ds[self.dim_names["longitude"]]
-
-
-
-
-
+        if end == "lower":
+            lon_minus360 = lon - 360
+            lon_concatenated = xr.concat(
+                [lon_minus360, lon], dim=self.dim_names["longitude"]
+            )
 
-
+        elif end == "upper":
+            lon_plus360 = lon + 360
+            lon_concatenated = xr.concat(
+                [lon, lon_plus360], dim=self.dim_names["longitude"]
+            )
+
+        elif end == "both":
+            lon_minus360 = lon - 360
+            lon_plus360 = lon + 360
+            lon_concatenated = xr.concat(
+                [lon_minus360, lon, lon_plus360], dim=self.dim_names["longitude"]
+            )
 
-        for var in
+        for var in ds.data_vars:
             if self.dim_names["longitude"] in ds[var].dims:
                 field = ds[var]
-
-
-
+
+                if end == "both":
+                    field_concatenated = xr.concat(
+                        [field, field, field], dim=self.dim_names["longitude"]
+                    )
+                else:
+                    field_concatenated = xr.concat(
+                        [field, field], dim=self.dim_names["longitude"]
+                    )
+
                 if self.use_dask:
                     field_concatenated = field_concatenated.chunk(
                         {self.dim_names["longitude"]: -1}
@@ -588,6 +450,13 @@ class Dataset:
             else:
                 ds_concatenated[var] = ds[var]
 
+        ds_concatenated[self.dim_names["longitude"]] = lon_concatenated
+
+        if verbose:
+            logging.info(
+                f"Concatenating the data along the longitude dimension: {time.time() - start_time:.3f} seconds"
+            )
+
         return ds_concatenated
 
     def post_process(self):
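Together, these two hunks make concatenate_longitudes append one shifted copy of each field (or two, for end="both") instead of always concatenating three copies. A self-contained toy sketch of the end="upper" case:

import numpy as np
import xarray as xr

# The field is repeated once and the longitude coordinate of the copy
# is shifted by +360 degrees, exactly as in the rewritten method.
lon = xr.DataArray(np.arange(0.0, 360.0, 90.0), dims="lon")
field = xr.DataArray(np.random.rand(4), dims="lon", coords={"lon": lon})
lon_concat = xr.concat([lon, lon + 360], dim="lon")
field_concat = xr.concat([field, field], dim="lon").assign_coords(lon=lon_concat)
print(field_concat.lon.values)  # [  0.  90. 180. 270. 360. 450. 540. 630.]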
@@ -601,7 +470,9 @@ class Dataset:
         """
         pass
 
-    def choose_subdomain(
+    def choose_subdomain(
+        self, target_coords, buffer_points=20, return_copy=False, verbose=False
+    ):
         """Selects a subdomain from the xarray Dataset based on specified target
         coordinates, extending the selection by a defined buffer. Adjusts longitude
         ranges as necessary to accommodate the dataset's expected range and handles
@@ -618,6 +489,9 @@ class Dataset:
         return_subdomain : bool, optional
             If True, returns the subset of the original dataset representing the chosen
             subdomain. If False, assigns the subset to `self.ds`. Defaults to False.
+        verbose : bool, optional
+            If True, print message if dataset is concatenated along longitude dimension.
+            Defaults to False.
 
         Returns
         -------
@@ -640,9 +514,43 @@ class Dataset:
 
         margin = self.resolution * buffer_points
 
-        if
+        # Select the subdomain in latitude direction (so that we have to concatenate fewer latitudes below if concatenation is necessary)
+        subdomain = self.ds.sel(
+            **{
+                self.dim_names["latitude"]: slice(lat_min - margin, lat_max + margin),
+            }
+        )
+        lon = subdomain[self.dim_names["longitude"]]
+
+        if self.is_global:
+            # Concatenate only if necessary
+            if lon_max + margin > lon.max():
+                # See if shifting by +360 degrees helps
+                if (lon_min - margin > (lon + 360).min()) and (
+                    lon_max + margin < (lon + 360).max()
+                ):
+                    subdomain[self.dim_names["longitude"]] = lon + 360
+                    lon = subdomain[self.dim_names["longitude"]]
+                else:
+                    subdomain = self.concatenate_longitudes(
+                        subdomain, end="upper", verbose=verbose
+                    )
+                    lon = subdomain[self.dim_names["longitude"]]
+            if lon_min - margin < lon.min():
+                # See if shifting by -360 degrees helps
+                if (lon_min - margin > (lon - 360).min()) and (
+                    lon_max + margin < (lon - 360).max()
+                ):
+                    subdomain[self.dim_names["longitude"]] = lon - 360
+                    lon = subdomain[self.dim_names["longitude"]]
+                else:
+                    subdomain = self.concatenate_longitudes(
+                        subdomain, end="lower", verbose=verbose
+                    )
+                    lon = subdomain[self.dim_names["longitude"]]
+
+        else:
             # Adjust longitude range if needed to match the expected range
-            lon = self.ds[self.dim_names["longitude"]]
             if not target_coords["straddle"]:
                 if lon.min() < -180:
                     if lon_max + margin > 0:
@@ -662,12 +570,9 @@ class Dataset:
                 if lon_min - margin < 0:
                     lon_min += 360
                     lon_max += 360
-
-
-
-        subdomain = self.ds.sel(
+        # Select the subdomain in longitude direction
+        subdomain = subdomain.sel(
             **{
-                self.dim_names["latitude"]: slice(lat_min - margin, lat_max + margin),
                 self.dim_names["longitude"]: slice(lon_min - margin, lon_max + margin),
             }
         )
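choose_subdomain now slices latitude first and only shifts or concatenates longitudes when the buffered target range actually falls outside the source axis. A toy sketch of the buffered selection itself, with an assumed 1-degree source grid:

import numpy as np
import xarray as xr

# The target latitude range is widened by buffer_points * resolution
# degrees on each side before slicing, mirroring the margin logic above.
ds = xr.Dataset(
    {"topo": ("lat", np.zeros(181))},
    coords={"lat": np.arange(-90.0, 90.5, 1.0)},
)
resolution, buffer_points = 1.0, 20       # assumed 1-degree source grid
margin = resolution * buffer_points
lat_min, lat_max = 30.0, 45.0
subdomain = ds.sel(lat=slice(lat_min - margin, lat_max + margin))
print(subdomain.lat.values[[0, -1]])      # [10. 65.]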
@@ -1522,3 +1427,853 @@ class ERA5Correction(Dataset):
                 "The correction dataset does not contain all specified longitude values."
             )
         object.__setattr__(self, "ds", subdomain)
+
+
+@dataclass(frozen=True, kw_only=True)
+class ETOPO5Dataset(Dataset):
+    """Represents topography data on the original grid from the ETOPO5 dataset.
+
+    Parameters
+    ----------
+    filename : str, optional
+        The path to the ETOPO5 dataset file. If not provided, the dataset will be downloaded
+        automatically via the `pooch` library.
+    var_names : Dict[str, str], optional
+        Dictionary of variable names required in the dataset. Defaults to:
+        {
+            "topo": "topo",
+        }
+    dim_names : Dict[str, str], optional
+        Dictionary specifying the names of dimensions in the dataset. Defaults to:
+        {"longitude": "lon", "latitude": "lat"}.
+
+    Attributes
+    ----------
+    ds : xr.Dataset
+        The xarray Dataset containing the ETOPO5 data, loaded from the specified file.
+    """
+
+    filename: str = field(default_factory=lambda: download_topo("etopo5.nc"))
+    var_names: Dict[str, str] = field(
+        default_factory=lambda: {
+            "topo": "topo",
+        }
+    )
+    dim_names: Dict[str, str] = field(
+        default_factory=lambda: {"longitude": "lon", "latitude": "lat"}
+    )
+    ds: xr.Dataset = field(init=False, repr=False)
+
+    def clean_up(self, ds: xr.Dataset) -> xr.Dataset:
+        """Assign lat and lon as coordinates.
+
+        Parameters
+        ----------
+        ds : xr.Dataset
+            The input dataset.
+
+        Returns
+        -------
+        ds : xr.Dataset
+            A cleaned `xarray.Dataset` with updated coordinates.
+        """
+        ds = ds.assign_coords(
+            {
+                "lon": ds["topo_lon"],
+                "lat": ds["topo_lat"],
+            }
+        )
+        return ds
+
+
+@dataclass(frozen=True, kw_only=True)
+class SRTM15Dataset(Dataset):
+    """Represents topography data on the original grid from the SRTM15 dataset.
+
+    Parameters
+    ----------
+    filename : str
+        The path to the SRTM15 dataset file.
+    var_names : Dict[str, str], optional
+        Dictionary of variable names required in the dataset. Defaults to:
+        {
+            "topo": "z",
+        }
+    dim_names : Dict[str, str], optional
+        Dictionary specifying the names of dimensions in the dataset. Defaults to:
+        {"longitude": "lon", "latitude": "lat"}.
+
+    Attributes
+    ----------
+    ds : xr.Dataset
+        The xarray Dataset containing the SRTM15 data, loaded from the specified file.
+    """
+
+    filename: str
+    var_names: Dict[str, str] = field(
+        default_factory=lambda: {
+            "topo": "z",
+        }
+    )
+    dim_names: Dict[str, str] = field(
+        default_factory=lambda: {"longitude": "lon", "latitude": "lat"}
+    )
+    ds: xr.Dataset = field(init=False, repr=False)
+
+
+# river datasets
+@dataclass(frozen=True, kw_only=True)
+class RiverDataset:
+    """Represents river data.
+
+    Parameters
+    ----------
+    filename : Union[str, Path, List[Union[str, Path]]]
+        The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
+        or a list of strings or Path objects containing multiple files.
+    start_time : datetime
+        The start time for selecting relevant data.
+    end_time : datetime
+        The end time for selecting relevant data.
+    dim_names: Dict[str, str]
+        Dictionary specifying the names of dimensions in the dataset.
+        Requires "station" and "time" as keys.
+    var_names: Dict[str, str]
+        Dictionary of variable names that are required in the dataset.
+        Requires the keys "latitude", "longitude", "flux", "ratio", and "name".
+    opt_var_names: Dict[str, str], optional
+        Dictionary of variable names that are optional in the dataset.
+        Defaults to an empty dictionary.
+    climatology : bool
+        Indicates whether the dataset is climatological. Defaults to False.
+
+    Attributes
+    ----------
+    ds : xr.Dataset
+        The xarray Dataset containing the forcing data on its original grid.
+    """
+
+    filename: Union[str, Path, List[Union[str, Path]]]
+    start_time: datetime
+    end_time: datetime
+    dim_names: Dict[str, str]
+    var_names: Dict[str, str]
+    opt_var_names: Optional[Dict[str, str]] = field(default_factory=dict)
+    climatology: Optional[bool] = False
+    ds: xr.Dataset = field(init=False, repr=False)
+
+    def __post_init__(self):
+
+        # Validate start_time and end_time
+        if not isinstance(self.start_time, datetime):
+            raise TypeError(
+                f"start_time must be a datetime object, but got {type(self.start_time).__name__}."
+            )
+        if not isinstance(self.end_time, datetime):
+            raise TypeError(
+                f"end_time must be a datetime object, but got {type(self.end_time).__name__}."
+            )
+
+        ds = self.load_data()
+        ds = self.clean_up(ds)
+        self.check_dataset(ds)
+
+        # Select relevant times
+        ds = self.add_time_info(ds)
+        object.__setattr__(self, "ds", ds)
+
+    def load_data(self) -> xr.Dataset:
+        """Load dataset from the specified file.
+
+        Returns
+        -------
+        ds : xr.Dataset
+            The loaded xarray Dataset containing the forcing data.
+        """
+        ds = _load_data(
+            self.filename, self.dim_names, use_dask=False, decode_times=False
+        )
+
+        return ds
+
+    def clean_up(self, ds: xr.Dataset) -> xr.Dataset:
+        """Decodes the 'name' variable (if byte-encoded) and updates the dataset.
+
+        This method checks if the 'name' variable is of dtype 'object' (i.e., byte-encoded),
+        and if so, decodes each byte array to a string and updates the dataset.
+        It also ensures that the 'station' dimension is of integer type.
+
+
+        Parameters
+        ----------
+        ds : xr.Dataset
+            The dataset containing the 'name' variable to decode.
+
+        Returns
+        -------
+        ds : xr.Dataset
+            The dataset with the decoded 'name' variable.
+        """
+
+        if ds[self.var_names["name"]].dtype == "object":
+            names = []
+            for i in range(len(ds[self.dim_names["station"]])):
+                byte_array = ds[self.var_names["name"]].isel(
+                    **{self.dim_names["station"]: i}
+                )
+                name = decode_string(byte_array)
+                names.append(name)
+            ds[self.var_names["name"]] = xr.DataArray(
+                data=names, dims=self.dim_names["station"]
+            )
+
+        if ds[self.dim_names["station"]].dtype == "float64":
+            ds[self.dim_names["station"]] = ds[self.dim_names["station"]].astype(int)
+
+        # Drop all variables that have chars dim
+        vars_to_drop = ["ocn_name", "stn_name", "ct_name", "cn_name", "chars"]
+        existing_vars = [var for var in vars_to_drop if var in ds]
+        ds = ds.drop_vars(existing_vars)
+
+        return ds
+
+    def check_dataset(self, ds: xr.Dataset) -> None:
+        """Check if the dataset contains the specified variables and dimensions.
+
+        Parameters
+        ----------
+        ds : xr.Dataset
+            The xarray Dataset to check.
+
+        Raises
+        ------
+        ValueError
+            If the dataset does not contain the specified variables or dimensions.
+        """
+
+        _check_dataset(ds, self.dim_names, self.var_names, self.opt_var_names)
+
+    def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
+        """Dummy method to be overridden by child classes to add time information to the
+        dataset.
+
+        This method is intended as a placeholder and should be implemented in subclasses
+        to provide specific functionality for adding time-related information to the dataset.
+
+        Parameters
+        ----------
+        ds : xr.Dataset
+            The xarray Dataset to which time information will be added.
+
+        Returns
+        -------
+        xr.Dataset
+            The xarray Dataset with time information added (as implemented by child classes).
+        """
+        return ds
+
+    def select_relevant_times(self, ds) -> xr.Dataset:
+        """Select a subset of the dataset based on the specified time range.
+
+        This method filters the dataset to include all records between `start_time` and `end_time`.
+        Additionally, it ensures that one record at or before `start_time` and one record at or
+        after `end_time` are included, even if they fall outside the strict time range.
+
+        If no `end_time` is specified, the method will select the time range of
+        [start_time, start_time + 24 hours] and return the closest time entry to `start_time` within that range.
+
+        Parameters
+        ----------
+        ds : xr.Dataset
+            The input dataset to be filtered. Must contain a time dimension.
+
+        Returns
+        -------
+        xr.Dataset
+            A dataset filtered to the specified time range, including the closest entries
+            at or before `start_time` and at or after `end_time` if applicable.
+
+        Warns
+        -----
+        UserWarning
+            If no records at or before `start_time` or no records at or after `end_time` are found.
+
+        UserWarning
+            If the dataset does not contain any time dimension or the time dimension is incorrectly named.
+        """
+
+        time_dim = self.dim_names["time"]
+
+        ds = _select_relevant_times(ds, time_dim, self.start_time, self.end_time, False)
+
+        return ds
+
+    def compute_climatology(self):
+        logging.info("Compute climatology for river forcing.")
|
|
1713
|
+
|
|
1714
|
+
time_dim = self.dim_names["time"]
|
|
1715
|
+
|
|
1716
|
+
flux = self.ds[self.var_names["flux"]].groupby(f"{time_dim}.month").mean()
|
|
1717
|
+
self.ds[self.var_names["flux"]] = flux
|
|
1718
|
+
|
|
1719
|
+
ds = assign_dates_to_climatology(self.ds, "month")
|
|
1720
|
+
ds = ds.swap_dims({"month": "time"})
|
|
1721
|
+
object.__setattr__(self, "ds", ds)
|
|
1722
|
+
|
|
1723
|
+
updated_dim_names = {**self.dim_names}
|
|
1724
|
+
updated_dim_names["time"] = "time"
|
|
1725
|
+
object.__setattr__(self, "dim_names", updated_dim_names)
|
|
1726
|
+
|
|
1727
|
+
object.__setattr__(self, "climatology", True)
|
|
1728
|
+
|
|
1729
|
+
def sort_by_river_volume(self, ds: xr.Dataset) -> xr.Dataset:
|
|
1730
|
+
"""Sorts the dataset by river volume in descending order (largest rivers first),
|
|
1731
|
+
if the volume variable is available.
|
|
1732
|
+
|
|
1733
|
+
This method uses the river volume to reorder the dataset such that the rivers with
|
|
1734
|
+
the largest volumes come first in the `station` dimension. If the volume variable
|
|
1735
|
+
is not present in the dataset, a warning is logged.
|
|
1736
|
+
|
|
1737
|
+
Parameters
|
|
1738
|
+
----------
|
|
1739
|
+
ds : xr.Dataset
|
|
1740
|
+
The xarray Dataset containing the river data to be sorted by volume.
|
|
1741
|
+
|
|
1742
|
+
Returns
|
|
1743
|
+
-------
|
|
1744
|
+
xr.Dataset
|
|
1745
|
+
The dataset with rivers sorted by their volume in descending order.
|
|
1746
|
+
If the volume variable is not available, the original dataset is returned.
|
|
1747
|
+
"""
|
|
1748
|
+
|
|
1749
|
+
if "vol" in self.opt_var_names:
|
|
1750
|
+
volume_values = ds[self.opt_var_names["vol"]].values
|
|
1751
|
+
if isinstance(volume_values, np.ndarray):
|
|
1752
|
+
# Check if all volume values are the same
|
|
1753
|
+
if np.all(volume_values == volume_values[0]):
|
|
1754
|
+
# If all volumes are the same, no need to reverse order
|
|
1755
|
+
sorted_indices = np.argsort(
|
|
1756
|
+
volume_values
|
|
1757
|
+
) # Sort in ascending order
|
|
1758
|
+
else:
|
|
1759
|
+
# If volumes differ, reverse order for descending sort
|
|
1760
|
+
sorted_indices = np.argsort(volume_values)[
|
|
1761
|
+
::-1
|
|
1762
|
+
] # Reverse for descending order
|
|
1763
|
+
|
|
1764
|
+
ds = ds.isel(**{self.dim_names["station"]: sorted_indices})
|
|
1765
|
+
|
|
1766
|
+
else:
|
|
1767
|
+
logging.warning("The volume data is not in a valid array format.")
|
|
1768
|
+
else:
|
|
1769
|
+
logging.warning(
|
|
1770
|
+
"Cannot sort rivers by volume. 'vol' is missing in the variable names."
|
|
1771
|
+
)
|
|
1772
|
+
|
|
1773
|
+
return ds
|
|
1774
|
+
|
|
1775
|
+
def extract_relevant_rivers(self, target_coords, dx):
|
|
1776
|
+
"""Extracts a subset of the dataset based on the proximity of river mouths to
|
|
1777
|
+
target coordinates.
|
|
1778
|
+
|
|
1779
|
+
This method calculates the distance between each river mouth and the provided target coordinates
|
|
1780
|
+
(latitude and longitude) using the `gc_dist` function. It then filters the dataset to include only those
|
|
1781
|
+
river stations whose minimum distance from the target is less than a specified threshold distance (`dx`).
|
|
1782
|
+
|
|
1783
|
+
Parameters
|
|
1784
|
+
----------
|
|
1785
|
+
target_coords : dict
|
|
1786
|
+
A dictionary containing the target coordinates for the comparison. It should include:
|
|
1787
|
+
- "lon" (float): The target longitude in degrees.
|
|
1788
|
+
- "lat" (float): The target latitude in degrees.
|
|
1789
|
+
- "straddle" (bool): A flag indicating whether to adjust the longitudes for stations that cross the
|
|
1790
|
+
International Date Line. If `True`, longitudes greater than 180 degrees are adjusted by subtracting 360,
|
|
1791
|
+
otherwise, negative longitudes are adjusted by adding 360.
|
|
1792
|
+
|
|
1793
|
+
dx : float
|
|
1794
|
+
The maximum distance threshold (in meters) for including a river station. Only river mouths that are
|
|
1795
|
+
within `dx` meters from the target coordinates will be included in the returned dataset.
|
|
1796
|
+
|
|
1797
|
+
Returns
|
|
1798
|
+
-------
|
|
1799
|
+
indices : dict
|
|
1800
|
+
A dictionary containing the indices of the rivers that are within the threshold distance from
|
|
1801
|
+
the target coordinates. The dictionary keys are:
|
|
1802
|
+
- "station" : numpy.ndarray
|
|
1803
|
+
The indices of the rivers that satisfy the distance threshold.
|
|
1804
|
+
- "eta_rho" : numpy.ndarray
|
|
1805
|
+
The indices of the `eta_rho` dimension corresponding to the selected stations.
|
|
1806
|
+
- "xi_rho" : numpy.ndarray
|
|
1807
|
+
The indices of the `xi_rho` dimension corresponding to the selected stations.
|
|
1808
|
+
"""
|
|
1809
|
+
|
|
1810
|
+
# Retrieve longitude and latitude of river mouths
|
|
1811
|
+
river_lon = self.ds[self.var_names["longitude"]]
|
|
1812
|
+
river_lat = self.ds[self.var_names["latitude"]]
|
|
1813
|
+
|
|
1814
|
+
# Adjust longitude based on whether it crosses the International Date Line (straddle case)
|
|
1815
|
+
if target_coords["straddle"]:
|
|
1816
|
+
river_lon = xr.where(river_lon > 180, river_lon - 360, river_lon)
|
|
1817
|
+
else:
|
|
1818
|
+
river_lon = xr.where(river_lon < 0, river_lon + 360, river_lon)
|
|
1819
|
+
|
|
1820
|
+
# Calculate the distance between the target coordinates and each river mouth
|
|
1821
|
+
dist = gc_dist(target_coords["lon"], target_coords["lat"], river_lon, river_lat)
|
|
1822
|
+
dist_min = dist.min(dim=["eta_rho", "xi_rho"])
|
|
1823
|
+
# Filter the dataset to include only stations within the distance threshold
|
|
1824
|
+
if (dist_min < dx).any():
|
|
1825
|
+
ds = self.ds.where(dist_min < dx, drop=True)
|
|
1826
|
+
ds = self.sort_by_river_volume(ds)
|
|
1827
|
+
dist = dist.where(dist_min < dx, drop=True).transpose(
|
|
1828
|
+
self.dim_names["station"], "eta_rho", "xi_rho"
|
|
1829
|
+
)
|
|
1830
|
+
dist_min = dist_min.where(dist_min < dx, drop=True)
|
|
1831
|
+
|
|
1832
|
+
# Find the indices of the closest grid cell to the river mouth
|
|
1833
|
+
indices = np.where(dist == dist_min)
|
|
1834
|
+
names = (
|
|
1835
|
+
self.ds[self.var_names["name"]]
|
|
1836
|
+
.isel({self.dim_names["station"]: indices[0]})
|
|
1837
|
+
.values
|
|
1838
|
+
)
|
|
1839
|
+
# Return the indices in a dictionary format
|
|
1840
|
+
indices = {
|
|
1841
|
+
"station": indices[0],
|
|
1842
|
+
"eta_rho": indices[1],
|
|
1843
|
+
"xi_rho": indices[2],
|
|
1844
|
+
"name": names,
|
|
1845
|
+
}
|
|
1846
|
+
else:
|
|
1847
|
+
ds = xr.Dataset()
|
|
1848
|
+
indices = {
|
|
1849
|
+
"station": [],
|
|
1850
|
+
"eta_rho": [],
|
|
1851
|
+
"xi_rho": [],
|
|
1852
|
+
"name": [],
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
object.__setattr__(self, "ds", ds)
|
|
1856
|
+
|
|
1857
|
+
return indices
|
|
1858
|
+
|
|
1859
|
+
|
|
1860
|
+
@dataclass(frozen=True, kw_only=True)
|
|
1861
|
+
class DaiRiverDataset(RiverDataset):
|
|
1862
|
+
"""Represents river data from the Dai river dataset.
|
|
1863
|
+
|
|
1864
|
+
Parameters
|
|
1865
|
+
----------
|
|
1866
|
+
filename : Union[str, Path, List[Union[str, Path]]], optional
|
|
1867
|
+
The path to the Dai River dataset file. If not provided, the dataset will be downloaded
|
|
1868
|
+
automatically via the `pooch` library.
|
|
1869
|
+
start_time : datetime
|
|
1870
|
+
The start time for selecting relevant data.
|
|
1871
|
+
end_time : datetime
|
|
1872
|
+
The end time for selecting relevant data.
|
|
1873
|
+
dim_names: Dict[str, str], optional
|
|
1874
|
+
Dictionary specifying the names of dimensions in the dataset.
|
|
1875
|
+
var_names: Dict[str, str], optional
|
|
1876
|
+
Dictionary of variable names that are required in the dataset.
|
|
1877
|
+
opt_var_names: Dict[str, str], optional
|
|
1878
|
+
Dictionary of variable names that are optional in the dataset.
|
|
1879
|
+
climatology : bool
|
|
1880
|
+
Indicates whether the dataset is climatological. Defaults to False.
|
|
1881
|
+
|
|
1882
|
+
Attributes
|
|
1883
|
+
----------
|
|
1884
|
+
ds : xr.Dataset
|
|
1885
|
+
The xarray Dataset containing the forcing data on its original grid.
|
|
1886
|
+
"""
|
|
1887
|
+
|
|
1888
|
+
filename: Union[str, Path, List[Union[str, Path]]] = field(
|
|
1889
|
+
default_factory=lambda: download_river_data("dai_trenberth_may2019.nc")
|
|
1890
|
+
)
|
|
1891
|
+
start_time: datetime
|
|
1892
|
+
end_time: datetime
|
|
1893
|
+
dim_names: Dict[str, str] = field(
|
|
1894
|
+
default_factory=lambda: {
|
|
1895
|
+
"station": "station",
|
|
1896
|
+
"time": "time",
|
|
1897
|
+
}
|
|
1898
|
+
)
|
|
1899
|
+
var_names: Dict[str, str] = field(
|
|
1900
|
+
default_factory=lambda: {
|
|
1901
|
+
"latitude": "lat_mou",
|
|
1902
|
+
"longitude": "lon_mou",
|
|
1903
|
+
"flux": "FLOW",
|
|
1904
|
+
"ratio": "ratio_m2s",
|
|
1905
|
+
"name": "riv_name",
|
|
1906
|
+
}
|
|
1907
|
+
)
|
|
1908
|
+
opt_var_names: Dict[str, str] = field(
|
|
1909
|
+
default_factory=lambda: {
|
|
1910
|
+
"vol": "vol_stn",
|
|
1911
|
+
}
|
|
1912
|
+
)
|
|
1913
|
+
climatology: Optional[bool] = False
|
|
1914
|
+
ds: xr.Dataset = field(init=False, repr=False)
|
|
1915
|
+
|
|
1916
|
+
def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
|
|
1917
|
+
"""Adds time information to the dataset based on the climatology flag and
|
|
1918
|
+
dimension names.
|
|
1919
|
+
|
|
1920
|
+
This method processes the dataset to include time information according to the climatology
|
|
1921
|
+
setting. If the dataset represents climatology data and the time dimension is labeled as
|
|
1922
|
+
"month", it assigns dates to the dataset based on a monthly climatology. Additionally, it
|
|
1923
|
+
handles dimension name updates if necessary.
|
|
1924
|
+
|
|
1925
|
+
Parameters
|
|
1926
|
+
----------
|
|
1927
|
+
ds : xr.Dataset
|
|
1928
|
+
The input dataset to which time information will be added.
|
|
1929
|
+
|
|
1930
|
+
Returns
|
|
1931
|
+
-------
|
|
1932
|
+
xr.Dataset
|
|
1933
|
+
The dataset with time information added, including adjustments for climatology and
|
|
1934
|
+
dimension names.
|
|
1935
|
+
"""
|
|
1936
|
+
time_dim = self.dim_names["time"]
|
|
1937
|
+
|
|
1938
|
+
# Extract the 'time' variable as a numpy array
|
|
1939
|
+
time_vals = ds[time_dim].values
|
|
1940
|
+
|
|
1941
|
+
# Handle rounding of the time values
|
|
1942
|
+
year = np.round(time_vals * 1e-2).astype(int)
|
|
1943
|
+
month = np.round((time_vals * 1e-2 - year) * 1e2).astype(int)
|
|
1944
|
+
|
|
1945
|
+
# Convert to datetime (assuming the day is always 15th for this example)
|
|
1946
|
+
dates = [datetime(year=i, month=m, day=15) for i, m in zip(year, month)]
|
|
1947
|
+
|
|
1948
|
+
ds[time_dim] = dates
|
|
1949
|
+
|
|
1950
|
+
return ds
|
|
1951
|
+
|
|
1952
|
+
|
|
1953
|
+
# shared functions
|
|
1954
|
+
|
|
1955
|
+
|
|
1956
|
+
def _load_data(filename, dim_names, use_dask, decode_times=True):
|
|
1957
|
+
"""Load dataset from the specified file.
|
|
1958
|
+
|
|
1959
|
+
Parameters
|
|
1960
|
+
----------
|
|
1961
|
+
filename : Union[str, Path, List[Union[str, Path]]]
|
|
1962
|
+
The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
|
|
1963
|
+
or a list of strings or Path objects containing multiple files.
|
|
1964
|
+
dim_names: Dict[str, str], optional
|
|
1965
|
+
Dictionary specifying the names of dimensions in the dataset.
|
|
1966
|
+
use_dask: bool
|
|
1967
|
+
Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
|
|
1968
|
+
decode_times: bool, optional
|
|
1969
|
+
If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers.
|
|
1970
|
+
Defaults to True.
|
|
1971
|
+
|
|
1972
|
+
Returns
|
|
1973
|
+
-------
|
|
1974
|
+
ds : xr.Dataset
|
|
1975
|
+
The loaded xarray Dataset containing the forcing data.
|
|
1976
|
+
|
|
1977
|
+
Raises
|
|
1978
|
+
------
|
|
1979
|
+
FileNotFoundError
|
|
1980
|
+
If the specified file does not exist.
|
|
1981
|
+
ValueError
|
|
1982
|
+
If a list of files is provided but dim_names["time"] is not available or use_dask=False.
|
|
1983
|
+
"""
|
|
1984
|
+
|
|
1985
|
+
# Precompile the regex for matching wildcard characters
|
|
1986
|
+
wildcard_regex = re.compile(r"[\*\?\[\]]")
|
|
1987
|
+
|
|
1988
|
+
# Convert Path objects to strings
|
|
1989
|
+
if isinstance(filename, (str, Path)):
|
|
1990
|
+
filename_str = str(filename)
|
|
1991
|
+
elif isinstance(filename, list):
|
|
1992
|
+
filename_str = [str(f) for f in filename]
|
|
1993
|
+
else:
|
|
1994
|
+
raise ValueError("filename must be a string, Path, or a list of strings/Paths.")
|
|
1995
|
+
# Handle the case when filename is a string
|
|
1996
|
+
contains_wildcard = False
|
|
1997
|
+
if isinstance(filename_str, str):
|
|
1998
|
+
contains_wildcard = bool(wildcard_regex.search(filename_str))
|
|
1999
|
+
if contains_wildcard:
|
|
2000
|
+
matching_files = glob.glob(filename_str)
|
|
2001
|
+
if not matching_files:
|
|
2002
|
+
raise FileNotFoundError(
|
|
2003
|
+
f"No files found matching the pattern '{filename_str}'."
|
|
2004
|
+
)
|
|
2005
|
+
else:
|
|
2006
|
+
matching_files = [filename_str]
|
|
2007
|
+
|
|
2008
|
+
# Handle the case when filename is a list
|
|
2009
|
+
elif isinstance(filename_str, list):
|
|
2010
|
+
contains_wildcard = any(wildcard_regex.search(f) for f in filename_str)
|
|
2011
|
+
if contains_wildcard:
|
|
2012
|
+
matching_files = []
|
|
2013
|
+
for f in filename_str:
|
|
2014
|
+
files = glob.glob(f)
|
|
2015
|
+
if not files:
|
|
2016
|
+
raise FileNotFoundError(
|
|
2017
|
+
f"No files found matching the pattern '{f}'."
|
|
2018
|
+
)
|
|
2019
|
+
matching_files.extend(files)
|
|
2020
|
+
else:
|
|
2021
|
+
matching_files = filename_str
|
|
2022
|
+
|
|
2023
|
+
# Check if time dimension is available when multiple files are provided
|
|
2024
|
+
if isinstance(filename_str, list) and "time" not in dim_names:
|
|
2025
|
+
raise ValueError(
|
|
2026
|
+
"A list of files is provided, but time dimension is not available. "
|
|
2027
|
+
"A time dimension must be available to concatenate the files."
|
|
2028
|
+
)
|
|
2029
|
+
|
|
2030
|
+
# Determine the kwargs for combining datasets
|
|
2031
|
+
if contains_wildcard or len(matching_files) == 1:
|
|
2032
|
+
# If there is a wildcard or just one file, use by_coords
|
|
2033
|
+
kwargs = {"combine": "by_coords"}
|
|
2034
|
+
else:
|
|
2035
|
+
# Otherwise, use nested combine based on time
|
|
2036
|
+
kwargs = {"combine": "nested", "concat_dim": dim_names["time"]}
|
|
2037
|
+
|
|
2038
|
+
# Base kwargs used for dataset combination
|
|
2039
|
+
combine_kwargs = {
|
|
2040
|
+
"coords": "minimal",
|
|
2041
|
+
"compat": "override",
|
|
2042
|
+
"combine_attrs": "override",
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
if use_dask:
|
|
2046
|
+
|
|
2047
|
+
chunks = {
|
|
2048
|
+
dim_names["latitude"]: -1,
|
|
2049
|
+
dim_names["longitude"]: -1,
|
|
2050
|
+
}
|
|
2051
|
+
if "depth" in dim_names:
|
|
2052
|
+
chunks[dim_names["depth"]] = -1
|
|
2053
|
+
if "time" in dim_names:
|
|
2054
|
+
chunks[dim_names["time"]] = 1
|
|
2055
|
+
|
|
2056
|
+
ds = xr.open_mfdataset(
|
|
2057
|
+
matching_files,
|
|
2058
|
+
decode_times=decode_times,
|
|
2059
|
+
chunks=chunks,
|
|
2060
|
+
**combine_kwargs,
|
|
2061
|
+
**kwargs,
|
|
2062
|
+
)
|
|
2063
|
+
else:
|
|
2064
|
+
ds_list = []
|
|
2065
|
+
for file in matching_files:
|
|
2066
|
+
ds = xr.open_dataset(file, decode_times=decode_times, chunks=None)
|
|
2067
|
+
ds_list.append(ds)
|
|
2068
|
+
|
|
2069
|
+
if kwargs["combine"] == "by_coords":
|
|
2070
|
+
ds = xr.combine_by_coords(ds_list, **combine_kwargs)
|
|
2071
|
+
elif kwargs["combine"] == "nested":
|
|
2072
|
+
ds = xr.combine_nested(
|
|
2073
|
+
ds_list, concat_dim=kwargs["concat_dim"], **combine_kwargs
|
|
2074
|
+
)
|
|
2075
|
+
|
|
2076
|
+
if "time" in dim_names and dim_names["time"] not in ds.dims:
|
|
2077
|
+
ds = ds.expand_dims(dim_names["time"])
|
|
2078
|
+
|
|
2079
|
+
return ds
|
|
2080
|
+
|
|
2081
|
+
|
|
2082
|
+
def _check_dataset(
|
|
2083
|
+
ds: xr.Dataset,
|
|
2084
|
+
dim_names: Dict[str, str],
|
|
2085
|
+
var_names: Dict[str, str],
|
|
2086
|
+
opt_var_names: Optional[Dict[str, str]] = None,
|
|
2087
|
+
) -> None:
|
|
2088
|
+
"""Check if the dataset contains the specified variables and dimensions.
|
|
2089
|
+
|
|
2090
|
+
Parameters
|
|
2091
|
+
----------
|
|
2092
|
+
ds : xr.Dataset
|
|
2093
|
+
The xarray Dataset to check.
|
|
2094
|
+
dim_names: Dict[str, str], optional
|
|
2095
|
+
Dictionary specifying the names of dimensions in the dataset.
|
|
2096
|
+
var_names: Dict[str, str]
|
|
2097
|
+
Dictionary of variable names that are required in the dataset.
|
|
2098
|
+
opt_var_names : Optional[Dict[str, str]], optional
|
|
2099
|
+
Dictionary of optional variable names.
|
|
2100
|
+
These variables are not strictly required, and the function will not raise an error if they are missing.
|
|
2101
|
+
Default is None, meaning no optional variables are considered.
|
|
2102
|
+
|
|
2103
|
+
|
|
2104
|
+
Raises
|
|
2105
|
+
------
|
|
2106
|
+
ValueError
|
|
2107
|
+
If the dataset does not contain the specified variables or dimensions.
|
|
2108
|
+
"""
|
|
2109
|
+
missing_dims = [dim for dim in dim_names.values() if dim not in ds.dims]
|
|
2110
|
+
if missing_dims:
|
|
2111
|
+
raise ValueError(
|
|
2112
|
+
f"Dataset does not contain all required dimensions. The following dimensions are missing: {missing_dims}"
|
|
2113
|
+
)
|
|
2114
|
+
|
|
2115
|
+
missing_vars = [var for var in var_names.values() if var not in ds.data_vars]
|
|
2116
|
+
if missing_vars:
|
|
2117
|
+
raise ValueError(
|
|
2118
|
+
f"Dataset does not contain all required variables. The following variables are missing: {missing_vars}"
|
|
2119
|
+
)
|
|
2120
|
+
|
|
2121
|
+
if opt_var_names:
|
|
2122
|
+
missing_optional_vars = [
|
|
2123
|
+
var for var in opt_var_names.values() if var not in ds.data_vars
|
|
2124
|
+
]
|
|
2125
|
+
if missing_optional_vars:
|
|
2126
|
+
logging.warning(
|
|
2127
|
+
f"Optional variables missing (but not critical): {missing_optional_vars}"
|
|
2128
|
+
)
|
|
2129
|
+
|
|
2130
|
+
|
|
2131
|
+
def _select_relevant_times(
|
|
2132
|
+
ds, time_dim, start_time=None, end_time=None, climatology=False
|
|
2133
|
+
) -> xr.Dataset:
|
|
2134
|
+
"""Select a subset of the dataset based on the specified time range.
|
|
2135
|
+
|
|
2136
|
+
This method filters the dataset to include all records between `start_time` and `end_time`.
|
|
2137
|
+
Additionally, it ensures that one record at or before `start_time` and one record at or
|
|
2138
|
+
after `end_time` are included, even if they fall outside the strict time range.
|
|
2139
|
+
|
|
2140
|
+
If no `end_time` is specified, the method will select the time range of
|
|
2141
|
+
[start_time, start_time + 24 hours] and return the closest time entry to `start_time` within that range.
|
|
2142
|
+
|
|
2143
|
+
Parameters
|
|
2144
|
+
----------
|
|
2145
|
+
ds : xr.Dataset
|
|
2146
|
+
The input dataset to be filtered. Must contain a time dimension.
|
|
2147
|
+
time_dim: str
|
|
2148
|
+
Name of time dimension.
|
|
2149
|
+
start_time : Optional[datetime], optional
|
|
2150
|
+
The start time for selecting relevant data. If not provided, the data is not filtered by start time.
|
|
2151
|
+
end_time : Optional[datetime], optional
|
|
2152
|
+
The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
|
|
2153
|
+
or no filtering is applied if start_time is not provided.
|
|
2154
|
+
climatology : bool
|
|
2155
|
+
Indicates whether the dataset is climatological. Defaults to False.
|
|
2156
|
+
|
|
2157
|
+
Returns
|
|
2158
|
+
-------
|
|
2159
|
+
xr.Dataset
|
|
2160
|
+
A dataset filtered to the specified time range, including the closest entries
|
|
2161
|
+
at or before `start_time` and at or after `end_time` if applicable.
|
|
2162
|
+
|
|
2163
|
+
Raises
|
|
2164
|
+
------
|
|
2165
|
+
ValueError
|
|
2166
|
+
If no matching times are found between `start_time` and `start_time + 24 hours`.
|
|
2167
|
+
|
|
2168
|
+
Warns
|
|
2169
|
+
-----
|
|
2170
|
+
UserWarning
|
|
2171
|
+
If the dataset contains exactly 12 time steps but the climatology flag is not set.
|
|
2172
|
+
This may indicate that the dataset represents climatology data.
|
|
2173
|
+
|
|
2174
|
+
UserWarning
|
|
2175
|
+
If no records at or before `start_time` or no records at or after `end_time` are found.
|
|
2176
|
+
|
|
2177
|
+
UserWarning
|
|
2178
|
+
If the dataset does not contain any time dimension or the time dimension is incorrectly named.
|
|
2179
|
+
|
|
2180
|
+
Notes
|
|
2181
|
+
-----
|
|
2182
|
+
- If the `climatology` flag is set and `end_time` is not provided, the method will
|
|
2183
|
+
interpolate initial conditions from climatology data.
|
|
2184
|
+
- If the dataset uses `cftime` datetime objects, these will be converted to standard
|
|
2185
|
+
`np.datetime64` objects before filtering.
|
|
2186
|
+
"""
|
|
2187
|
+
|
|
2188
|
+
if time_dim in ds.variables:
|
|
2189
|
+
if climatology:
|
|
2190
|
+
if len(ds[time_dim]) != 12:
|
|
2191
|
+
raise ValueError(
|
|
2192
|
+
f"The dataset contains {len(ds[time_dim])} time steps, but the climatology flag is set to True, which requires exactly 12 time steps."
|
|
2193
|
+
)
|
|
2194
|
+
if not end_time:
|
|
2195
|
+
# Interpolate from climatology for initial conditions
|
|
2196
|
+
ds = interpolate_from_climatology(ds, time_dim, start_time)
|
|
2197
|
+
else:
|
|
2198
|
+
time_type = get_time_type(ds[time_dim])
|
|
2199
|
+
if time_type == "int":
|
|
2200
|
+
raise ValueError(
|
|
2201
|
+
"The dataset contains integer time values, which are only supported when the climatology flag is set to True. However, your climatology flag is set to False."
|
|
2202
|
+
)
|
|
2203
|
+
if time_type == "cftime":
|
|
2204
|
+
ds = ds.assign_coords(
|
|
2205
|
+
{time_dim: convert_cftime_to_datetime(ds[time_dim])}
|
|
2206
|
+
)
|
|
2207
|
+
if end_time:
|
|
2208
|
+
end_time = end_time
|
|
2209
|
+
|
|
2210
|
+
# Identify records before or at start_time
|
|
2211
|
+
before_start = ds[time_dim] <= np.datetime64(start_time)
|
|
2212
|
+
if before_start.any():
|
|
2213
|
+
closest_before_start = (
|
|
2214
|
+
ds[time_dim].where(before_start, drop=True).max()
|
|
2215
|
+
)
|
|
2216
|
+
else:
|
|
2217
|
+
logging.warning("No records found at or before the start_time.")
|
|
2218
|
+
closest_before_start = ds[time_dim].min()
|
|
2219
|
+
|
|
2220
|
+
# Identify records after or at end_time
|
|
2221
|
+
after_end = ds[time_dim] >= np.datetime64(end_time)
|
|
2222
|
+
if after_end.any():
|
|
2223
|
+
closest_after_end = ds[time_dim].where(after_end, drop=True).min()
|
|
2224
|
+
else:
|
|
2225
|
+
logging.warning("No records found at or after the end_time.")
|
|
2226
|
+
closest_after_end = ds[time_dim].max()
|
|
2227
|
+
|
|
2228
|
+
# Select records within the time range and add the closest before/after
|
|
2229
|
+
within_range = (ds[time_dim] > np.datetime64(start_time)) & (
|
|
2230
|
+
ds[time_dim] < np.datetime64(end_time)
|
|
2231
|
+
)
|
|
2232
|
+
selected_times = ds[time_dim].where(
|
|
2233
|
+
within_range
|
|
2234
|
+
| (ds[time_dim] == closest_before_start)
|
|
2235
|
+
| (ds[time_dim] == closest_after_end),
|
|
2236
|
+
drop=True,
|
|
2237
|
+
)
|
|
2238
|
+
ds = ds.sel({time_dim: selected_times})
|
|
2239
|
+
else:
|
|
2240
|
+
# Look in time range [start_time, start_time + 24h]
|
|
2241
|
+
end_time = start_time + timedelta(days=1)
|
|
2242
|
+
times = (np.datetime64(start_time) <= ds[time_dim]) & (
|
|
2243
|
+
ds[time_dim] < np.datetime64(end_time)
|
|
2244
|
+
)
|
|
2245
|
+
if np.all(~times):
|
|
2246
|
+
raise ValueError(
|
|
2247
|
+
f"The dataset does not contain any time entries between the specified start_time: {start_time} "
|
|
2248
|
+
f"and {start_time + timedelta(hours=24)}. "
|
|
2249
|
+
"Please ensure the dataset includes time entries for that range."
|
|
2250
|
+
)
|
|
2251
|
+
|
|
2252
|
+
ds = ds.where(times, drop=True)
|
|
2253
|
+
if ds.sizes[time_dim] > 1:
|
|
2254
|
+
# Pick the time closest to start_time
|
|
2255
|
+
ds = ds.isel({time_dim: 0})
|
|
2256
|
+
logging.info(
|
|
2257
|
+
f"Selected time entry closest to the specified start_time ({start_time}) within the range [{start_time}, {start_time + timedelta(hours=24)}]: {ds[time_dim].values}"
|
|
2258
|
+
)
|
|
2259
|
+
else:
|
|
2260
|
+
logging.warning(
|
|
2261
|
+
"Dataset does not contain any time information. Please check if the time dimension "
|
|
2262
|
+
"is correctly named or if the dataset includes time data."
|
|
2263
|
+
)
|
|
2264
|
+
|
|
2265
|
+
return ds
|
|
2266
|
+
|
|
2267
|
+
|
|
2268
|
+
def decode_string(byte_array):
|
|
2269
|
+
|
|
2270
|
+
# Decode each byte and handle errors with 'ignore'
|
|
2271
|
+
decoded_string = "".join(
|
|
2272
|
+
[
|
|
2273
|
+
x.decode("utf-8", errors="ignore") # Ignore invalid byte sequences
|
|
2274
|
+
for x in byte_array.values
|
|
2275
|
+
if isinstance(x, bytes) and x != b" " and x is not np.nan
|
|
2276
|
+
]
|
|
2277
|
+
)
|
|
2278
|
+
|
|
2279
|
+
return decoded_string
|