roms-tools 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,48 +1,1154 @@
1
- import pooch
2
1
  import xarray as xr
2
+ from dataclasses import dataclass, field
3
+ import glob
4
+ from datetime import datetime, timedelta
5
+ import numpy as np
6
+ from typing import Dict, Optional
7
+ import dask
8
+ import warnings
9
+ from roms_tools.setup.utils import (
10
+ assign_dates_to_climatology,
11
+ interpolate_from_climatology,
12
+ is_cftime_datetime,
13
+ convert_cftime_to_datetime,
14
+ )
15
+ from roms_tools.setup.download import download_correction_data
3
16
 
4
17
 
5
- FRANK = pooch.create(
6
- # Use the default cache folder for the operating system
7
- path=pooch.os_cache("roms-tools"),
8
- base_url="https://github.com/CWorthy-ocean/roms-tools-data/raw/main/",
9
- # If this is a development version, get the data from the "main" branch
10
- # The registry specifies the files that can be fetched
11
- registry={
12
- "etopo5.nc": "sha256:23600e422d59bbf7c3666090166a0d468c8ee16092f4f14e32c4e928fbcd627b",
13
- },
14
- )
18
+ @dataclass(frozen=True, kw_only=True)
19
+ class Dataset:
20
+ """
21
+ Represents forcing data on original grid.
22
+
23
+ Parameters
24
+ ----------
25
+ filename : str
26
+ The path to the data files. Can contain wildcards.
27
+ start_time : Optional[datetime], optional
28
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
29
+ end_time : Optional[datetime], optional
30
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
31
+ or no filtering is applied if start_time is not provided.
32
+ var_names: Dict[str, str]
33
+ Dictionary of variable names that are required in the dataset.
34
+ dim_names: Dict[str, str], optional
35
+ Dictionary specifying the names of dimensions in the dataset.
36
+ climatology : bool
37
+ Indicates whether the dataset is climatological. Defaults to False.
38
+
39
+ Attributes
40
+ ----------
41
+ is_global : bool
42
+ Indicates whether the dataset covers the entire globe.
43
+ ds : xr.Dataset
44
+ The xarray Dataset containing the forcing data on its original grid.
45
+
46
+ Examples
47
+ --------
48
+ >>> dataset = Dataset(
49
+ ... filename="data.nc",
50
+ ... start_time=datetime(2022, 1, 1),
51
+ ... end_time=datetime(2022, 12, 31),
52
+ ... var_names={"temp": "temperature"},
53
+ ... )
54
+ >>> print(dataset.ds)
55
+ <xarray.Dataset>
56
+ Dimensions: ...
57
+ """
58
+
59
+ filename: str
60
+ start_time: Optional[datetime] = None
61
+ end_time: Optional[datetime] = None
62
+ var_names: Dict[str, str]
63
+ dim_names: Dict[str, str] = field(
64
+ default_factory=lambda: {
65
+ "longitude": "longitude",
66
+ "latitude": "latitude",
67
+ "time": "time",
68
+ }
69
+ )
70
+ climatology: Optional[bool] = False
71
+
72
+ is_global: bool = field(init=False, repr=False)
73
+ ds: xr.Dataset = field(init=False, repr=False)
74
+
75
+ def __post_init__(self):
76
+ """
77
+ Post-initialization processing:
78
+ 1. Loads the dataset from the specified filename.
79
+ 2. Applies time filtering based on start_time and end_time if provided.
80
+ 3. Selects relevant fields as specified by var_names.
81
+ 4. Ensures latitude values are in ascending order.
82
+ 5. Checks if the dataset covers the entire globe and adjusts if necessary.
83
+ """
84
+
85
+ ds = self.load_data()
86
+ self.check_dataset(ds)
87
+
88
+ # Select relevant times
89
+ if "time" in self.dim_names and self.start_time is not None:
90
+ ds = self.add_time_info(ds)
91
+ ds = self.select_relevant_times(ds)
92
+
93
+ # Select relevant fields
94
+ ds = self.select_relevant_fields(ds)
95
+
96
+ # Make sure that latitude is ascending
97
+ ds = self.ensure_latitude_ascending(ds)
98
+
99
+ # Check whether the data covers the entire globe
100
+ object.__setattr__(self, "is_global", self.check_if_global(ds))
101
+
102
+ # If dataset is global concatenate three copies of field along longitude dimension
103
+ if self.is_global:
104
+ ds = self.concatenate_longitudes(ds)
105
+
106
+ object.__setattr__(self, "ds", ds)
107
+
108
+ def load_data(self) -> xr.Dataset:
109
+ """
110
+ Load dataset from the specified file.
111
+
112
+ Returns
113
+ -------
114
+ ds : xr.Dataset
115
+ The loaded xarray Dataset containing the forcing data.
116
+
117
+ Raises
118
+ ------
119
+ FileNotFoundError
120
+ If the specified file does not exist.
121
+ """
122
+
123
+ # Check if the file exists
124
+ matching_files = glob.glob(self.filename)
125
+ if not matching_files:
126
+ raise FileNotFoundError(
127
+ f"No files found matching the pattern '{self.filename}'."
128
+ )
129
+
130
+ # Load the dataset
131
+ with dask.config.set(**{"array.slicing.split_large_chunks": False}):
132
+ # Define the chunk sizes
133
+ chunks = {
134
+ self.dim_names["latitude"]: -1,
135
+ self.dim_names["longitude"]: -1,
136
+ }
137
+ if "depth" in self.dim_names.keys():
138
+ chunks[self.dim_names["depth"]] = -1
139
+ if "time" in self.dim_names.keys():
140
+ chunks[self.dim_names["time"]] = 1
141
+
142
+ ds = xr.open_mfdataset(
143
+ self.filename,
144
+ combine="nested",
145
+ concat_dim=self.dim_names["time"],
146
+ coords="minimal",
147
+ compat="override",
148
+ chunks=chunks,
149
+ engine="netcdf4",
150
+ )
151
+ else:
152
+ ds = xr.open_dataset(
153
+ self.filename,
154
+ chunks=chunks,
155
+ )
156
+
157
+ return ds
158
+
159
+ def check_dataset(self, ds: xr.Dataset) -> None:
160
+ """
161
+ Check if the dataset contains the specified variables and dimensions.
162
+
163
+ Parameters
164
+ ----------
165
+ ds : xr.Dataset
166
+ The xarray Dataset to check.
167
+
168
+ Raises
169
+ ------
170
+ ValueError
171
+ If the dataset does not contain the specified variables or dimensions.
172
+ """
173
+ missing_vars = [
174
+ var for var in self.var_names.values() if var not in ds.data_vars
175
+ ]
176
+ if missing_vars:
177
+ raise ValueError(
178
+ f"Dataset does not contain all required variables. The following variables are missing: {missing_vars}"
179
+ )
180
+
181
+ missing_dims = [dim for dim in self.dim_names.values() if dim not in ds.dims]
182
+ if missing_dims:
183
+ raise ValueError(
184
+ f"Dataset does not contain all required dimensions. The following dimensions are missing: {missing_vars}"
185
+ )
186
+
187
+ def select_relevant_fields(self, ds) -> xr.Dataset:
188
+ """
189
+ Selects and returns a subset of the dataset containing only the variables specified in `self.var_names`.
190
+
191
+ Parameters
192
+ ----------
193
+ ds : xr.Dataset
194
+ The input dataset from which variables will be selected.
195
+
196
+ Returns
197
+ -------
198
+ xr.Dataset
199
+ A dataset containing only the variables specified in `self.var_names`.
200
+
201
+ """
202
+
203
+ for var in ds.data_vars:
204
+ if var not in self.var_names.values():
205
+ ds = ds.drop_vars(var)
206
+
207
+ return ds
208
+
210
+
211
+ def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
212
+ """
213
+ Dummy method to be overridden by child classes to add time information to the dataset.
214
+
215
+ This method is intended as a placeholder and should be implemented in subclasses
216
+ to provide specific functionality for adding time-related information to the dataset.
217
+
218
+ Parameters
219
+ ----------
220
+ ds : xr.Dataset
221
+ The xarray Dataset to which time information will be added.
222
+
223
+ Returns
224
+ -------
225
+ xr.Dataset
226
+ The xarray Dataset with time information added (as implemented by child classes).
227
+ """
228
+ return ds
229
+
230
+ def select_relevant_times(self, ds) -> xr.Dataset:
231
+ """
232
+ Selects and returns the subset of the dataset corresponding to the specified time range.
233
+
234
+ This function filters the dataset to include only the data points within the specified
235
+ time range, defined by `self.start_time` and `self.end_time`. If `self.end_time` is not
236
+ provided, it defaults to one day after `self.start_time`.
237
+
238
+ Parameters
239
+ ----------
240
+ ds : xr.Dataset
241
+ The input dataset to be filtered.
242
+
243
+ Returns
244
+ -------
245
+ xr.Dataset
246
+ A dataset containing only the data points within the specified time range.
247
+
248
+ Raises
249
+ ------
250
+ ValueError
251
+ If no matching times are found or if the number of matching times does not meet expectations.
15
252
 
253
+ Warns
254
+ -----
255
+ UserWarning
256
+ If the dataset contains only 12 time steps but the climatology flag is not set.
257
+ This may indicate that the dataset represents climatology data.
258
+ """
16
259
 
17
- def fetch_topo(topography_source) -> xr.Dataset:
260
+ time_dim = self.dim_names["time"]
261
+ if time_dim in ds.coords or time_dim in ds.data_vars:
262
+ if self.climatology:
263
+ if not self.end_time:
264
+ # Interpolate from climatology for initial conditions
265
+ ds = interpolate_from_climatology(
266
+ ds, self.dim_names["time"], self.start_time
267
+ )
268
+ else:
269
+ if len(ds[time_dim]) == 12:
270
+ warnings.warn(
271
+ "The dataset contains exactly 12 time steps. This may indicate that it is "
272
+ "climatological data. Please verify if climatology is appropriate for your "
273
+ "analysis and set the climatology flag to True."
274
+ )
275
+ if is_cftime_datetime(ds[time_dim]):
276
+ ds = ds.assign_coords(
277
+ {time_dim: convert_cftime_to_datetime(ds[time_dim])}
278
+ )
279
+ if not self.end_time:
280
+ end_time = self.start_time + timedelta(days=1)
281
+ else:
282
+ end_time = self.end_time
283
+
284
+ times = (np.datetime64(self.start_time) <= ds[time_dim]) & (
285
+ ds[time_dim] < np.datetime64(end_time)
286
+ )
287
+ ds = ds.where(times, drop=True)
288
+ else:
289
+ warnings.warn(
290
+ "Dataset does not contain any time information. Please check if the time dimension "
291
+ "is correctly named or if the dataset includes time data."
292
+ )
293
+ if not ds.sizes[time_dim]:
294
+ raise ValueError("No matching times found in the dataset.")
295
+
296
+ if not self.end_time:
297
+ if ds.sizes[time_dim] != 1:
298
+ found_times = ds.sizes[time_dim]
299
+ raise ValueError(
300
+ f"There must be exactly one time matching the start_time. Found {found_times} matching times."
301
+ )
302
+
303
+ return ds
304
+
305
+ def ensure_latitude_ascending(self, ds: xr.Dataset) -> xr.Dataset:
306
+ """
307
+ Ensure that the latitude dimension is in ascending order.
308
+
309
+ Parameters
310
+ ----------
311
+ ds : xr.Dataset
312
+ The xarray Dataset to check.
313
+
314
+ Returns
315
+ -------
316
+ ds : xr.Dataset
317
+ The xarray Dataset with latitude in ascending order.
318
+ """
319
+ # Make sure that latitude is ascending
320
+ lat_diff = np.diff(ds[self.dim_names["latitude"]])
321
+ if np.all(lat_diff < 0):
322
+ ds = ds.isel(**{self.dim_names["latitude"]: slice(None, None, -1)})
323
+
324
+ return ds
325
+
326
+ def check_if_global(self, ds) -> bool:
327
+ """
328
+ Checks if the dataset covers the entire globe in the longitude dimension.
329
+
330
+ This function calculates the mean difference between consecutive longitude values.
331
+ It then checks if the difference between the first and last longitude values (plus 360 degrees)
332
+ is close to this mean difference, within a specified tolerance. If it is, the dataset is considered
333
+ to cover the entire globe in the longitude dimension.
334
+
335
+ Returns
336
+ -------
337
+ bool
338
+ True if the dataset covers the entire globe in the longitude dimension, False otherwise.
339
+
340
+ """
341
+ dlon_mean = (
342
+ ds[self.dim_names["longitude"]].diff(dim=self.dim_names["longitude"]).mean()
343
+ )
344
+ dlon = (
345
+ ds[self.dim_names["longitude"]][0] - ds[self.dim_names["longitude"]][-1]
346
+ ) % 360.0
347
+ is_global = np.isclose(dlon, dlon_mean, rtol=0.0, atol=1e-3).item()
348
+
349
+ return is_global
350
+
351
+ def concatenate_longitudes(self, ds):
352
+ """
353
+ Concatenates the dataset's fields three times: with longitudes shifted by -360, original longitudes, and shifted by +360.
354
+
355
+ Parameters
356
+ ----------
357
+ ds : xr.Dataset
358
+ The dataset whose variables are concatenated along the longitude dimension.
359
+
360
+ Returns
361
+ -------
362
+ xr.Dataset
363
+ The concatenated dataset, with the longitude dimension extended.
364
+
365
+ Notes
366
+ -----
367
+ Concatenating three times may be overkill in most situations, but it is safe. Alternatively, we could refactor
368
+ to figure out whether concatenating on the lower end, upper end, or at all is needed.
369
+
370
+ """
371
+ ds_concatenated = xr.Dataset()
372
+
373
+ lon = ds[self.dim_names["longitude"]]
374
+ lon_minus360 = lon - 360
375
+ lon_plus360 = lon + 360
376
+ lon_concatenated = xr.concat(
377
+ [lon_minus360, lon, lon_plus360], dim=self.dim_names["longitude"]
378
+ )
379
+
380
+ ds_concatenated[self.dim_names["longitude"]] = lon_concatenated
381
+
382
+ for var in self.var_names.values():
383
+ if self.dim_names["longitude"] in ds[var].dims:
384
+ field = ds[var]
385
+ field_concatenated = xr.concat(
386
+ [field, field, field], dim=self.dim_names["longitude"]
387
+ ).chunk({self.dim_names["longitude"]: -1})
388
+ field_concatenated[self.dim_names["longitude"]] = lon_concatenated
389
+ ds_concatenated[var] = field_concatenated
390
+ else:
391
+ ds_concatenated[var] = ds[var]
392
+
393
+ return ds_concatenated
394
+
395
+ def choose_subdomain(
396
+ self, latitude_range, longitude_range, margin, straddle, return_subdomain=False
397
+ ):
398
+ """
399
+ Selects a subdomain from the xarray Dataset based on specified latitude and longitude ranges,
400
+ extending the selection by a specified margin. Handles longitude conversions to accommodate different
401
+ longitude ranges.
402
+
403
+ Parameters
404
+ ----------
405
+ latitude_range : tuple of float
406
+ A tuple (lat_min, lat_max) specifying the minimum and maximum latitude values of the subdomain.
407
+ longitude_range : tuple of float
408
+ A tuple (lon_min, lon_max) specifying the minimum and maximum longitude values of the subdomain.
409
+ margin : float
410
+ Margin in degrees to extend beyond the specified latitude and longitude ranges when selecting the subdomain.
411
+ straddle : bool
412
+ If True, target longitudes are expected in the range [-180, 180].
413
+ If False, target longitudes are expected in the range [0, 360].
414
+ return_subdomain : bool, optional
415
+ If True, returns the subset of the original dataset as an xarray Dataset. If False, assigns the subset to `self.ds`.
416
+ Defaults to False.
417
+
418
+ Returns
419
+ -------
420
+ xr.Dataset or None
421
+ If `return_subdomain` is True, returns the subset of the original dataset representing the chosen subdomain,
422
+ including an extended area to cover one extra grid point beyond the specified ranges. If `return_subdomain` is False,
423
+ returns None as the subset is assigned to `self.ds`.
424
+
425
+ Notes
426
+ -----
427
+ This method adjusts the longitude range if necessary to ensure it matches the expected range for the dataset.
428
+ It also handles longitude discontinuities that can occur when converting to different longitude ranges.
429
+ This is important for avoiding artifacts in the interpolation process.
430
+
431
+ Raises
432
+ ------
433
+ ValueError
434
+ If the selected latitude or longitude range does not intersect with the dataset.
435
+ """
436
+
437
+ lat_min, lat_max = latitude_range
438
+ lon_min, lon_max = longitude_range
439
+
440
+ if not self.is_global:
441
+ # Adjust longitude range if needed to match the expected range
442
+ lon = self.ds[self.dim_names["longitude"]]
443
+ if not straddle:
444
+ if lon.min() < -180:
445
+ if lon_max + margin > 0:
446
+ lon_min -= 360
447
+ lon_max -= 360
448
+ elif lon.min() < 0:
449
+ if lon_max + margin > 180:
450
+ lon_min -= 360
451
+ lon_max -= 360
452
+
453
+ if straddle:
454
+ if lon.max() > 360:
455
+ if lon_min - margin < 180:
456
+ lon_min += 360
457
+ lon_max += 360
458
+ elif lon.max() > 180:
459
+ if lon_min - margin < 0:
460
+ lon_min += 360
461
+ lon_max += 360
462
+
463
+ # Select the subdomain
464
+ subdomain = self.ds.sel(
465
+ **{
466
+ self.dim_names["latitude"]: slice(lat_min - margin, lat_max + margin),
467
+ self.dim_names["longitude"]: slice(lon_min - margin, lon_max + margin),
468
+ }
469
+ )
470
+
471
+ # Check if the selected subdomain has zero dimensions in latitude or longitude
472
+ if subdomain[self.dim_names["latitude"]].size == 0:
473
+ raise ValueError("Selected latitude range does not intersect with dataset.")
474
+
475
+ if subdomain[self.dim_names["longitude"]].size == 0:
476
+ raise ValueError(
477
+ "Selected longitude range does not intersect with dataset."
478
+ )
479
+
480
+ # Adjust longitudes to expected range if needed
481
+ lon = subdomain[self.dim_names["longitude"]]
482
+ if straddle:
483
+ subdomain[self.dim_names["longitude"]] = xr.where(lon > 180, lon - 360, lon)
484
+ else:
485
+ subdomain[self.dim_names["longitude"]] = xr.where(lon < 0, lon + 360, lon)
486
+
487
+ if return_subdomain:
488
+ return subdomain
489
+ else:
490
+ object.__setattr__(self, "ds", subdomain)
491
+
492
+ def convert_to_negative_depth(self):
493
+ """
494
+ Converts the depth values in the dataset to negative if they are non-negative.
495
+
496
+ This method checks the values in the depth dimension of the dataset (`self.ds[self.dim_names["depth"]]`).
497
+ If all values are greater than or equal to zero, it negates them and updates the dataset accordingly.
498
+
499
+ """
500
+ depth = self.ds[self.dim_names["depth"]]
501
+
502
+ if (depth >= 0).all():
503
+ self.ds[self.dim_names["depth"]] = -depth
504
+
505
+
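
The base class above is usable on its own. A minimal usage sketch, assuming a hypothetical local file sst_2022.nc that provides an sst variable on the default longitude/latitude/time dimensions:

    from datetime import datetime

    # Hypothetical file and variable mapping; any file matching the default
    # dimension names ("longitude", "latitude", "time") would behave the same.
    data = Dataset(
        filename="sst_2022.nc",
        start_time=datetime(2022, 1, 1),
        end_time=datetime(2022, 2, 1),
        var_names={"sst": "sst"},
    )
    # __post_init__ has already loaded, time-filtered and subset the data:
    print(data.ds)
    # Restrict to a region (30-40N, 150-170E) with a 1-degree margin:
    data.choose_subdomain((30, 40), (150, 170), margin=1, straddle=False)
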
506
+ @dataclass(frozen=True, kw_only=True)
507
+ class TPXODataset(Dataset):
18
508
  """
19
- Load the global topography data as an xarray Dataset.
509
+ Represents tidal data on the original grid from the TPXO dataset.
510
+
511
+ Parameters
512
+ ----------
513
+ filename : str
514
+ The path to the TPXO dataset file.
515
+ var_names : Dict[str, str], optional
516
+ Dictionary of variable names required in the dataset. Defaults to:
517
+ {
518
+ "h_Re": "h_Re",
519
+ "h_Im": "h_Im",
520
+ "sal_Re": "sal_Re",
521
+ "sal_Im": "sal_Im",
522
+ "u_Re": "u_Re",
523
+ "u_Im": "u_Im",
524
+ "v_Re": "v_Re",
525
+ "v_Im": "v_Im",
526
+ "depth": "depth"
527
+ }
528
+ dim_names : Dict[str, str], optional
529
+ Dictionary specifying the names of dimensions in the dataset. Defaults to:
530
+ {"longitude": "ny", "latitude": "nx", "ntides": "nc"}.
531
+
532
+ Attributes
533
+ ----------
534
+ ds : xr.Dataset
535
+ The xarray Dataset containing the TPXO tidal model data, loaded from the specified file.
536
+ reference_date : datetime
537
+ The reference date for the TPXO data. Default is datetime(1992, 1, 1).
20
538
  """
21
- # Mapping from user-specified topography options to corresponding filenames in the registry
22
- topo_dict = {"etopo5": "etopo5.nc"}
23
539
 
24
- # The file will be downloaded automatically the first time this is run
25
- # returns the file path to the downloaded file. Afterwards, Pooch finds
26
- # it in the local cache and doesn't repeat the download.
27
- fname = FRANK.fetch(topo_dict[topography_source])
28
- # The "fetch" method returns the full path to the downloaded data file.
29
- # All we need to do now is load it with our standard Python tools.
30
- ds = xr.open_dataset(fname)
31
- return ds
540
+ filename: str
541
+ var_names: Dict[str, str] = field(
542
+ default_factory=lambda: {
543
+ "ssh_Re": "h_Re",
544
+ "ssh_Im": "h_Im",
545
+ "sal_Re": "sal_Re",
546
+ "sal_Im": "sal_Im",
547
+ "u_Re": "u_Re",
548
+ "u_Im": "u_Im",
549
+ "v_Re": "v_Re",
550
+ "v_Im": "v_Im",
551
+ "depth": "depth",
552
+ }
553
+ )
554
+ dim_names: Dict[str, str] = field(
555
+ default_factory=lambda: {"longitude": "ny", "latitude": "nx", "ntides": "nc"}
556
+ )
557
+ ds: xr.Dataset = field(init=False, repr=False)
558
+ reference_date: datetime = datetime(1992, 1, 1)
559
+
560
+ def __post_init__(self):
561
+ # Perform any necessary dataset initialization or modifications here
562
+ ds = super().load_data()
563
+
564
+ # Clean up dataset
565
+ ds = ds.assign_coords(
566
+ {
567
+ "omega": ds["omega"],
568
+ "nx": ds["lon_r"].isel(
569
+ ny=0
570
+ ), # lon_r is constant along ny, i.e., is only a function of nx
571
+ "ny": ds["lat_r"].isel(
572
+ nx=0
573
+ ), # lat_r is constant along nx, i.e., is only a function of ny
574
+ }
575
+ )
576
+ ds = ds.rename({"nx": "longitude", "ny": "latitude"})
32
577
 
578
+ object.__setattr__(
579
+ self,
580
+ "dim_names",
581
+ {
582
+ "latitude": "latitude",
583
+ "longitude": "longitude",
584
+ "ntides": self.dim_names["ntides"],
585
+ },
586
+ )
587
+ # Select relevant fields
588
+ ds = super().select_relevant_fields(ds)
33
589
 
34
- def fetch_ssr_correction(correction_source) -> xr.Dataset:
590
+ # Check whether the data covers the entire globe
591
+ object.__setattr__(self, "is_global", super().check_if_global(ds))
592
+
593
+ # If dataset is global concatenate three copies of field along longitude dimension
594
+ if self.is_global:
595
+ ds = super().concatenate_longitudes(ds)
596
+
597
+ object.__setattr__(self, "ds", ds)
598
+
599
+ def check_number_constituents(self, ntides: int):
600
+ """
601
+ Checks if the number of constituents in the dataset is at least `ntides`.
602
+
603
+ Parameters
604
+ ----------
605
+ ntides : int
606
+ The required number of tidal constituents.
607
+
608
+ Raises
609
+ ------
610
+ ValueError
611
+ If the number of constituents in the dataset is less than `ntides`.
612
+ """
613
+ if len(self.ds[self.dim_names["ntides"]]) < ntides:
614
+ raise ValueError(
615
+ f"The dataset contains fewer than {ntides} tidal constituents."
616
+ )
617
+
618
+
619
+ @dataclass(frozen=True, kw_only=True)
620
+ class GLORYSDataset(Dataset):
621
+ """
622
+ Represents GLORYS data on original grid.
623
+
624
+ Parameters
625
+ ----------
626
+ filename : str
627
+ The path to the data files. Can contain wildcards.
628
+ start_time : Optional[datetime], optional
629
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
630
+ end_time : Optional[datetime], optional
631
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
632
+ or no filtering is applied if start_time is not provided.
633
+ var_names: Dict[str, str], optional
634
+ Dictionary of variable names that are required in the dataset.
635
+ dim_names: Dict[str, str], optional
636
+ Dictionary specifying the names of dimensions in the dataset.
637
+ climatology : bool
638
+ Indicates whether the dataset is climatological. Defaults to False.
639
+
640
+ Attributes
641
+ ----------
642
+ ds : xr.Dataset
643
+ The xarray Dataset containing the GLORYS data on its original grid.
644
+ """
645
+
646
+ var_names: Dict[str, str] = field(
647
+ default_factory=lambda: {
648
+ "temp": "thetao",
649
+ "salt": "so",
650
+ "u": "uo",
651
+ "v": "vo",
652
+ "zeta": "zos",
653
+ }
654
+ )
655
+
656
+ dim_names: Dict[str, str] = field(
657
+ default_factory=lambda: {
658
+ "longitude": "longitude",
659
+ "latitude": "latitude",
660
+ "depth": "depth",
661
+ "time": "time",
662
+ }
663
+ )
664
+
665
+ climatology: Optional[bool] = False
666
+
667
+
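
The var_names mapping lets downstream code address GLORYS fields by their ROMS-side names. A sketch, assuming hypothetical files matching glorys_*.nc:

    from datetime import datetime

    glorys = GLORYSDataset(
        filename="glorys_*.nc",  # hypothetical wildcard pattern
        start_time=datetime(2012, 1, 1),
        end_time=datetime(2012, 2, 1),
    )
    # Access fields through the mapping rather than hard-coding GLORYS names:
    temp = glorys.ds[glorys.var_names["temp"]]  # "thetao"
    zeta = glorys.ds[glorys.var_names["zeta"]]  # "zos"
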
668
+ @dataclass(frozen=True, kw_only=True)
669
+ class CESMDataset(Dataset):
670
+ """
671
+ Represents CESM data on original grid.
672
+
673
+ Parameters
674
+ ----------
675
+ filename : str
676
+ The path to the data files. Can contain wildcards.
677
+ start_time : Optional[datetime], optional
678
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
679
+ end_time : Optional[datetime], optional
680
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
681
+ or no filtering is applied if start_time is not provided.
682
+ var_names: Dict[str, str], optional
683
+ Dictionary of variable names that are required in the dataset.
684
+ dim_names: Dict[str, str], optional
685
+ Dictionary specifying the names of dimensions in the dataset.
686
+ climatology : bool
687
+ Indicates whether the dataset is climatological. Defaults to True.
688
+
689
+ Attributes
690
+ ----------
691
+ ds : xr.Dataset
692
+ The xarray Dataset containing the CESM data on its original grid.
693
+ """
694
+
695
+ # overwrite load_data method from parent class
696
+ def load_data(self) -> xr.Dataset:
697
+ """
698
+ Load dataset from the specified file.
699
+
700
+ Returns
701
+ -------
702
+ ds : xr.Dataset
703
+ The loaded xarray Dataset containing the forcing data.
704
+
705
+ Raises
706
+ ------
707
+ FileNotFoundError
708
+ If the specified file does not exist.
709
+ """
710
+
711
+ # Check if the file exists
712
+ matching_files = glob.glob(self.filename)
713
+ if not matching_files:
714
+ raise FileNotFoundError(
715
+ f"No files found matching the pattern '{self.filename}'."
716
+ )
717
+
718
+ # Load the dataset
719
+ with dask.config.set(**{"array.slicing.split_large_chunks": False}):
720
+ # Define the chunk sizes
721
+ chunks = {
722
+ self.dim_names["latitude"]: -1,
723
+ self.dim_names["longitude"]: -1,
724
+ }
725
+
726
+ ds = xr.open_mfdataset(
727
+ self.filename,
728
+ combine="nested",
729
+ coords="minimal",
730
+ compat="override",
731
+ chunks=chunks,
732
+ engine="netcdf4",
733
+ )
734
+ if "time" not in self.dim_names:
735
+ if "time" in ds.dims:
736
+ self.dim_names["time"] = "time"
737
+ else:
738
+ if "month" in ds.dims:
739
+ self.dim_names["time"] = "month"
740
+ else:
741
+ ds = ds.expand_dims({"time": 1})
742
+ self.dim_names["time"] = "time"
743
+
744
+ return ds
745
+
746
+ def add_time_info(self, ds: xr.Dataset) -> xr.Dataset:
747
+ """
748
+ Adds time information to the dataset based on the climatology flag and dimension names.
749
+
750
+ This method processes the dataset to include time information according to the climatology
751
+ setting. If the dataset represents climatology data and the time dimension is labeled as
752
+ "month", it assigns dates to the dataset based on a monthly climatology. Additionally, it
753
+ handles dimension name updates if necessary.
754
+
755
+ Parameters
756
+ ----------
757
+ ds : xr.Dataset
758
+ The input dataset to which time information will be added.
759
+
760
+ Returns
761
+ -------
762
+ xr.Dataset
763
+ The dataset with time information added, including adjustments for climatology and
764
+ dimension names.
765
+ """
766
+ time_dim = self.dim_names["time"]
767
+
768
+ if self.climatology and time_dim == "month":
769
+ ds = assign_dates_to_climatology(ds, time_dim)
770
+ # rename dimension
771
+ ds = ds.swap_dims({time_dim: "time"})
772
+ # Update dimension names
773
+ updated_dim_names = self.dim_names.copy()
774
+ updated_dim_names["time"] = "time"
775
+ object.__setattr__(self, "dim_names", updated_dim_names)
776
+
777
+ return ds
778
+
779
+
780
+ @dataclass(frozen=True, kw_only=True)
781
+ class CESMBGCDataset(CESMDataset):
35
782
  """
36
- Load the SSR correction data as an xarray Dataset.
783
+ Represents CESM BGC data on original grid.
784
+
785
+ Parameters
786
+ ----------
787
+ filename : str
788
+ The path to the data files. Can contain wildcards.
789
+ start_time : Optional[datetime], optional
790
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
791
+ end_time : Optional[datetime], optional
792
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
793
+ or no filtering is applied if start_time is not provided.
794
+ var_names: Dict[str, str], optional
795
+ Dictionary of variable names that are required in the dataset.
796
+ dim_names: Dict[str, str], optional
797
+ Dictionary specifying the names of dimensions in the dataset.
798
+ climatology : bool
799
+ Indicates whether the dataset is climatological. Defaults to True.
800
+
801
+ Attributes
802
+ ----------
803
+ ds : xr.Dataset
804
+ The xarray Dataset containing the CESM BGC data on its original grid.
37
805
  """
38
- # Mapping from user-specified topography options to corresponding filenames in the registry
39
- topo_dict = {"corev2": "SSR_correction.nc"}
40
806
 
41
- # The file will be downloaded automatically the first time this is run
42
- # returns the file path to the downloaded file. Afterwards, Pooch finds
43
- # it in the local cache and doesn't repeat the download.
44
- fname = FRANK.fetch(topo_dict[correction_source])
45
- # The "fetch" method returns the full path to the downloaded data file.
46
- # All we need to do now is load it with our standard Python tools.
47
- ds = xr.open_dataset(fname)
48
- return ds
807
+ var_names: Dict[str, str] = field(
808
+ default_factory=lambda: {
809
+ "PO4": "PO4",
810
+ "NO3": "NO3",
811
+ "SiO3": "SiO3",
812
+ "NH4": "NH4",
813
+ "Fe": "Fe",
814
+ "Lig": "Lig",
815
+ "O2": "O2",
816
+ "DIC": "DIC",
817
+ "DIC_ALT_CO2": "DIC_ALT_CO2",
818
+ "ALK": "ALK",
819
+ "ALK_ALT_CO2": "ALK_ALT_CO2",
820
+ "DOC": "DOC",
821
+ "DON": "DON",
822
+ "DOP": "DOP",
823
+ "DOPr": "DOPr",
824
+ "DONr": "DONr",
825
+ "DOCr": "DOCr",
826
+ "spChl": "spChl",
827
+ "spC": "spC",
828
+ "spP": "spP",
829
+ "spFe": "spFe",
830
+ "diatChl": "diatChl",
831
+ "diatC": "diatC",
832
+ "diatP": "diatP",
833
+ "diatFe": "diatFe",
834
+ "diatSi": "diatSi",
835
+ "diazChl": "diazChl",
836
+ "diazC": "diazC",
837
+ "diazP": "diazP",
838
+ "diazFe": "diazFe",
839
+ "spCaCO3": "spCaCO3",
840
+ "zooC": "zooC",
841
+ }
842
+ )
843
+
844
+ dim_names: Dict[str, str] = field(
845
+ default_factory=lambda: {
846
+ "longitude": "lon",
847
+ "latitude": "lat",
848
+ "depth": "z_t",
849
+ }
850
+ )
851
+
852
+ climatology: Optional[bool] = True
853
+
854
+ def post_process(self):
855
+ """
856
+ Processes and converts CESM data values as follows:
857
+ - Map variables defined only in the upper 150 m (z_t_150m) onto the full z_t axis.
+ - Convert depth values from cm to m.
858
+ """
859
+
860
+ if self.dim_names["depth"] == "z_t":
861
+ # Fill variables that only have data in upper 150m with NaNs below
862
+ if (
863
+ "z_t_150m" in self.ds.dims
864
+ and np.equal(self.ds.z_t[:15].values, self.ds.z_t_150m.values).all()
865
+ ):
866
+ for var in self.var_names:
867
+ if "z_t_150m" in self.ds[var].dims:
868
+ self.ds[var] = self.ds[var].rename({"z_t_150m": "z_t"})
869
+ self.ds[var] = self.ds[var].chunk({"z_t": -1})
870
+ # Convert depth from cm to m
871
+ ds = self.ds.assign_coords({"depth": self.ds["z_t"] / 100})
872
+ ds["depth"].attrs["long_name"] = "Depth"
873
+ ds["depth"].attrs["units"] = "m"
874
+ ds = ds.swap_dims({"z_t": "depth"})
875
+ if "z_t" in ds:
876
+ ds = ds.drop_vars("z_t")
877
+ if "z_t_150m" in ds:
878
+ ds = ds.drop_vars("z_t_150m")
879
+ # update dataset
880
+ object.__setattr__(self, "ds", ds)
881
+
882
+ # Update dim_names with "depth": "depth" key-value pair
883
+ updated_dim_names = self.dim_names.copy()
884
+ updated_dim_names["depth"] = "depth"
885
+ object.__setattr__(self, "dim_names", updated_dim_names)
886
+
887
+
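
A usage sketch for the BGC class, assuming a hypothetical monthly CESM climatology file cesm_bgc_clim.nc with the default variable and dimension names:

    from datetime import datetime

    bgc = CESMBGCDataset(
        filename="cesm_bgc_clim.nc",
        start_time=datetime(2012, 1, 15),  # initial-condition time within the climatology
    )
    bgc.post_process()      # maps z_t_150m fields onto z_t and converts depth from cm to m
    print(bgc.ds["depth"])  # depth coordinate, now in metres
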
888
+ @dataclass(frozen=True, kw_only=True)
889
+ class CESMBGCSurfaceForcingDataset(CESMDataset):
890
+ """
891
+ Represents CESM BGC surface forcing data on original grid.
892
+
893
+ Parameters
894
+ ----------
895
+ filename : str
896
+ The path to the data files. Can contain wildcards.
897
+ start_time : Optional[datetime], optional
898
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
899
+ end_time : Optional[datetime], optional
900
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
901
+ or no filtering is applied if start_time is not provided.
902
+ var_names: Dict[str, str], optional
903
+ Dictionary of variable names that are required in the dataset.
904
+ dim_names: Dict[str, str], optional
905
+ Dictionary specifying the names of dimensions in the dataset.
906
+ climatology : bool
907
+ Indicates whether the dataset is climatological. Defaults to False.
908
+
909
+ Attributes
910
+ ----------
911
+ ds : xr.Dataset
912
+ The xarray Dataset containing the CESM BGC surface forcing data on its original grid.
913
+ """
914
+
915
+ var_names: Dict[str, str] = field(
916
+ default_factory=lambda: {
917
+ "pco2_air": "pCO2SURF",
918
+ "pco2_air_alt": "pCO2SURF",
919
+ "iron": "IRON_FLUX",
920
+ "dust": "dust_FLUX_IN",
921
+ "nox": "NOx_FLUX",
922
+ "nhy": "NHy_FLUX",
923
+ }
924
+ )
925
+
926
+ dim_names: Dict[str, str] = field(
927
+ default_factory=lambda: {
928
+ "longitude": "lon",
929
+ "latitude": "lat",
930
+ }
931
+ )
932
+
933
+ climatology: Optional[bool] = False
934
+
935
+
936
+ @dataclass(frozen=True, kw_only=True)
937
+ class ERA5Dataset(Dataset):
938
+ """
939
+ Represents ERA5 data on original grid.
940
+
941
+ Parameters
942
+ ----------
943
+ filename : str
944
+ The path to the data files. Can contain wildcards.
945
+ start_time : Optional[datetime], optional
946
+ The start time for selecting relevant data. If not provided, the data is not filtered by start time.
947
+ end_time : Optional[datetime], optional
948
+ The end time for selecting relevant data. If not provided, only data at the start_time is selected if start_time is provided,
949
+ or no filtering is applied if start_time is not provided.
950
+ var_names: Dict[str, str], optional
951
+ Dictionary of variable names that are required in the dataset.
952
+ dim_names: Dict[str, str], optional
953
+ Dictionary specifying the names of dimensions in the dataset.
954
+ climatology : bool
955
+ Indicates whether the dataset is climatological. Defaults to False.
956
+
957
+ Attributes
958
+ ----------
959
+ ds : xr.Dataset
960
+ The xarray Dataset containing the ERA5 data on its original grid.
961
+ """
962
+
963
+ var_names: Dict[str, str] = field(
964
+ default_factory=lambda: {
965
+ "uwnd": "u10",
966
+ "vwnd": "v10",
967
+ "swrad": "ssr",
968
+ "lwrad": "strd",
969
+ "Tair": "t2m",
970
+ "d2m": "d2m",
971
+ "rain": "tp",
972
+ "mask": "sst",
973
+ }
974
+ )
975
+
976
+ dim_names: Dict[str, str] = field(
977
+ default_factory=lambda: {
978
+ "longitude": "longitude",
979
+ "latitude": "latitude",
980
+ "time": "time",
981
+ }
982
+ )
983
+
984
+ climatology: Optional[bool] = False
985
+
986
+ def post_process(self):
987
+ """
988
+ Processes and converts ERA5 data values as follows:
989
+ - Convert radiation values from J/m^2 to W/m^2.
990
+ - Convert rainfall from meters to cm/day.
991
+ - Convert temperature from Kelvin to Celsius.
992
+ - Compute absolute humidity (qair) from temperature and dewpoint if not already present.
+ - Mask land points using the sea-surface temperature field.
993
+ """
994
+ # Translate radiation to fluxes. ERA5 stores values integrated over 1 hour.
995
+ # Convert radiation from J/m^2 to W/m^2
996
+ self.ds[self.var_names["swrad"]] /= 3600
997
+ self.ds[self.var_names["lwrad"]] /= 3600
998
+ self.ds[self.var_names["swrad"]].attrs["units"] = "W/m^2"
999
+ self.ds[self.var_names["lwrad"]].attrs["units"] = "W/m^2"
1000
+ # Convert rainfall from m to cm/day
1001
+ self.ds[self.var_names["rain"]] *= 100 * 24
1002
+
1003
+ # Convert temperature from Kelvin to Celsius
1004
+ self.ds[self.var_names["Tair"]] -= 273.15
1005
+ self.ds[self.var_names["d2m"]] -= 273.15
1006
+ self.ds[self.var_names["Tair"]].attrs["units"] = "degrees C"
1007
+ self.ds[self.var_names["d2m"]].attrs["units"] = "degrees C"
1008
+
1009
+ # Compute relative humidity if not present
1010
+ if "qair" not in self.ds.data_vars:
1011
+ qair = np.exp(
1012
+ (17.625 * self.ds[self.var_names["d2m"]])
1013
+ / (243.04 + self.ds[self.var_names["d2m"]])
1014
+ ) / np.exp(
1015
+ (17.625 * self.ds[self.var_names["Tair"]])
1016
+ / (243.04 + self.ds[self.var_names["Tair"]])
1017
+ )
1018
+ # Convert relative to absolute humidity
1019
+ patm = 1010.0
1020
+ cff = (
1021
+ (1.0007 + 3.46e-6 * patm)
1022
+ * 6.1121
1023
+ * np.exp(
1024
+ 17.502
1025
+ * self.ds[self.var_names["Tair"]]
1026
+ / (240.97 + self.ds[self.var_names["Tair"]])
1027
+ )
1028
+ )
1029
+ cff = cff * qair
1030
+ self.ds["qair"] = 0.62197 * (cff / (patm - 0.378 * cff))
1031
+ self.ds["qair"].attrs["long_name"] = "Absolute humidity at 2m"
1032
+ self.ds["qair"].attrs["units"] = "kg/kg"
1033
+
1034
+ # Update var_names dictionary
1035
+ var_names = {**self.var_names, "qair": "qair"}
1036
+ object.__setattr__(self, "var_names", var_names)
1037
+
1038
+ if "mask" in self.var_names.keys():
1039
+ mask = xr.where(self.ds[self.var_names["mask"]].isel(time=0).isnull(), 0, 1)
1040
+
1041
+ for var in self.ds.data_vars:
1042
+ self.ds[var] = xr.where(mask == 1, self.ds[var], np.nan)
1043
+
1044
+
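
The humidity conversion in post_process combines a Magnus-type relative-humidity estimate from dewpoint with a conversion to specific humidity; the same arithmetic for a single pair of values, as a sketch:

    import numpy as np

    # 2 m temperature and dewpoint in degrees Celsius (example values):
    t2m, d2m = 20.0, 15.0
    rel_hum = np.exp(17.625 * d2m / (243.04 + d2m)) / np.exp(
        17.625 * t2m / (243.04 + t2m)
    )
    patm = 1010.0  # surface pressure in hPa, same constant as in post_process
    cff = (1.0007 + 3.46e-6 * patm) * 6.1121 * np.exp(17.502 * t2m / (240.97 + t2m))
    cff = cff * rel_hum
    qair = 0.62197 * cff / (patm - 0.378 * cff)
    print(qair)  # ~0.0106 kg/kg for these values
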
1045
+ @dataclass(frozen=True, kw_only=True)
1046
+ class ERA5Correction(Dataset):
1047
+ """
1048
+ Global dataset to correct ERA5 radiation. The dataset contains multiplicative correction factors for the ERA5 shortwave radiation, obtained by comparing the COREv2 climatology to the ERA5 climatology.
1049
+
1050
+ Parameters
1051
+ ----------
1052
+ filename : str, optional
1053
+ The path to the correction files. Defaults to download_correction_data('SSR_correction.nc').
1054
+ var_names: Dict[str, str], optional
1055
+ Dictionary of variable names that are required in the dataset.
1056
+ Defaults to {"swr_corr": "ssr_corr"}.
1057
+ dim_names: Dict[str, str], optional
1058
+ Dictionary specifying the names of dimensions in the dataset.
1059
+ Defaults to {"longitude": "longitude", "latitude": "latitude", "time": "time"}.
1060
+ climatology : bool, optional
1061
+ Indicates if the correction data is a climatology. Defaults to True.
1062
+
1063
+ Attributes
1064
+ ----------
1065
+ ds : xr.Dataset
1066
+ The loaded xarray Dataset containing the correction data.
1067
+ """
1068
+
1069
+ filename: str = field(
1070
+ default_factory=lambda: download_correction_data("SSR_correction.nc")
1071
+ )
1072
+ var_names: Dict[str, str] = field(
1073
+ default_factory=lambda: {
1074
+ "swr_corr": "ssr_corr", # multiplicative correction factor for ERA5 shortwave radiation
1075
+ }
1076
+ )
1077
+ dim_names: Dict[str, str] = field(
1078
+ default_factory=lambda: {
1079
+ "longitude": "longitude",
1080
+ "latitude": "latitude",
1081
+ "time": "time",
1082
+ }
1083
+ )
1084
+ climatology: Optional[bool] = True
1085
+
1086
+ ds: xr.Dataset = field(init=False, repr=False)
1087
+
1088
+ def __post_init__(self):
1089
+
1090
+ if not self.climatology:
1091
+ raise NotImplementedError(
1092
+ "Correction data must be a climatology. Set climatology to True."
1093
+ )
1094
+
1095
+ super().__post_init__()
1096
+
1097
+ def choose_subdomain(self, coords, straddle: bool):
1098
+ """
1099
+ Converts longitude values in the dataset if necessary and selects a subdomain based on the specified coordinates.
1100
+
1101
+ This method converts longitude values between different ranges if required and then extracts a subset of the
1102
+ dataset according to the given coordinates. It updates the dataset in place to reflect the selected subdomain.
1103
+
1104
+ Parameters
1105
+ ----------
1106
+ coords : dict
1107
+ A dictionary specifying the target coordinates for selecting the subdomain. Keys should correspond to the
1108
+ dimension names of the dataset (e.g., latitude and longitude), and values should be the desired ranges or
1109
+ specific coordinate values.
1110
+ straddle : bool
1111
+ If True, assumes that target longitudes are in the range [-180, 180]. If False, assumes longitudes are in the
1112
+ range [0, 360]. This parameter determines how longitude values are converted if necessary.
1113
+
1114
+ Raises
1115
+ ------
1116
+ ValueError
1117
+ If the specified subdomain does not fully contain the specified latitude or longitude values. This can occur
1118
+ if the dataset does not cover the full range of provided coordinates.
1119
+
1120
+ Notes
1121
+ -----
1122
+ - The dataset (`self.ds`) is updated in place to reflect the chosen subdomain.
1123
+ """
1124
+
1125
+ lon = self.ds[self.dim_names["longitude"]]
1126
+
1127
+ if not self.is_global:
1128
+ if lon.min().values < 0 and not straddle:
1129
+ # Convert from [-180, 180] to [0, 360]
1130
+ self.ds[self.dim_names["longitude"]] = xr.where(lon < 0, lon + 360, lon)
1131
+
1132
+ if lon.max().values > 180 and straddle:
1133
+ # Convert from [0, 360] to [-180, 180]
1134
+ self.ds[self.dim_names["longitude"]] = xr.where(
1135
+ lon > 180, lon - 360, lon
1136
+ )
1137
+
1138
+ # Select the subdomain based on the specified latitude and longitude ranges
1139
+ subdomain = self.ds.sel(**coords)
1140
+
1141
+ # Check if the selected subdomain contains the specified latitude and longitude values
1142
+ if not subdomain[self.dim_names["latitude"]].equals(
1143
+ coords[self.dim_names["latitude"]]
1144
+ ):
1145
+ raise ValueError(
1146
+ "The correction dataset does not contain all specified latitude values."
1147
+ )
1148
+ if not subdomain[self.dim_names["longitude"]].equals(
1149
+ coords[self.dim_names["longitude"]]
1150
+ ):
1151
+ raise ValueError(
1152
+ "The correction dataset does not contain all specified longitude values."
1153
+ )
1154
+ object.__setattr__(self, "ds", subdomain)