ocf-data-sampler 0.2.38.tar.gz → 0.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ocf-data-sampler has been flagged as potentially problematic.

Files changed (70)
  1. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/PKG-INFO +2 -1
  2. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/config/model.py +33 -4
  3. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/load_dataset.py +1 -1
  4. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/cloudcasting.py +1 -1
  5. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/ecmwf.py +1 -1
  6. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/gfs.py +6 -1
  7. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/icon.py +1 -1
  8. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/ukv.py +1 -1
  9. ocf_data_sampler-0.3.1/ocf_data_sampler/load/nwp/providers/utils.py +83 -0
  10. ocf_data_sampler-0.3.1/ocf_data_sampler/load/open_tensorstore_zarrs.py +92 -0
  11. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/satellite.py +6 -40
  12. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/utils.py +1 -1
  13. ocf_data_sampler-0.3.1/ocf_data_sampler/select/dropout.py +61 -0
  14. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +2 -0
  15. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler.egg-info/PKG-INFO +2 -1
  16. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler.egg-info/SOURCES.txt +1 -0
  17. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler.egg-info/requires.txt +1 -0
  18. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/pyproject.toml +1 -0
  19. ocf_data_sampler-0.2.38/ocf_data_sampler/load/nwp/providers/utils.py +0 -43
  20. ocf_data_sampler-0.2.38/ocf_data_sampler/select/dropout.py +0 -47
  21. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/LICENSE +0 -0
  22. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/README.md +0 -0
  23. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/__init__.py +0 -0
  24. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/config/__init__.py +0 -0
  25. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/config/load.py +0 -0
  26. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/config/save.py +0 -0
  27. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/data/uk_gsp_locations_20220314.csv +0 -0
  28. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/data/uk_gsp_locations_20250109.csv +0 -0
  29. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/__init__.py +0 -0
  30. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/gsp.py +0 -0
  31. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/__init__.py +0 -0
  32. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/nwp.py +0 -0
  33. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
  34. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/load/site.py +0 -0
  35. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/__init__.py +0 -0
  36. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/collate.py +0 -0
  37. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/common_types.py +0 -0
  38. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/datetime_features.py +0 -0
  39. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/gsp.py +0 -0
  40. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/nwp.py +0 -0
  41. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/satellite.py +0 -0
  42. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/site.py +0 -0
  43. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/numpy_sample/sun_position.py +0 -0
  44. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/__init__.py +0 -0
  45. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/fill_time_periods.py +0 -0
  46. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/find_contiguous_time_periods.py +0 -0
  47. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/geospatial.py +0 -0
  48. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/location.py +0 -0
  49. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
  50. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/select/select_time_slice.py +0 -0
  51. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/datasets/__init__.py +0 -0
  52. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/datasets/site.py +0 -0
  53. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/sample/__init__.py +0 -0
  54. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/sample/base.py +0 -0
  55. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/sample/site.py +0 -0
  56. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/sample/uk_regional.py +0 -0
  57. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/__init__.py +0 -0
  58. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py +0 -0
  59. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +0 -0
  60. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py +0 -0
  61. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +0 -0
  62. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +0 -0
  63. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/torch_datasets/utils/validation_utils.py +0 -0
  64. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler/utils.py +0 -0
  65. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
  66. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/ocf_data_sampler.egg-info/top_level.txt +0 -0
  67. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/scripts/download_gsp_location_data.py +0 -0
  68. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/scripts/refactor_site.py +0 -0
  69. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/setup.cfg +0 -0
  70. {ocf_data_sampler-0.2.38 → ocf_data_sampler-0.3.1}/utils/compute_icon_mean_stddev.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.2.38
+Version: 0.3.1
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
@@ -44,6 +44,7 @@ Requires-Dist: pyproj
 Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
+Requires-Dist: xarray-tensorstore==0.1.5
 
 # ocf-data-sampler
 
ocf_data_sampler/config/model.py

@@ -90,11 +90,10 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
 
-    dropout_fraction: float = Field(
+    dropout_fraction: float|list[float] = Field(
         default=0,
-        description="Chance of dropout being applied to each sample",
-        ge=0,
-        le=1,
+        description="Either a float(Chance of dropout being applied to each sample) or a list of "
+        "floats (probability that dropout of the corresponding timedelta is applied)",
     )
 
     @field_validator("dropout_timedeltas_minutes")
@@ -105,6 +104,36 @@ class DropoutMixin(Base):
                 raise ValueError("Dropout timedeltas must be negative")
         return v
 
+
+    @field_validator("dropout_fraction")
+    def dropout_fractions(cls, dropout_frac: float|list[float]) -> float|list[float]:
+        """Validate 'dropout_frac'."""
+        from math import isclose
+        if isinstance(dropout_frac, float):
+            if not (dropout_frac <= 1):
+                raise ValueError("Input should be less than or equal to 1")
+            elif not (dropout_frac >= 0):
+                raise ValueError("Input should be greater than or equal to 0")
+
+        elif isinstance(dropout_frac, list):
+            if not dropout_frac:
+                raise ValueError("List cannot be empty")
+
+            if not all(isinstance(i, float) for i in dropout_frac):
+                raise ValueError("All elements in the list must be floats")
+
+            if not all(0 <= i <= 1 for i in dropout_frac):
+                raise ValueError("Each float in the list must be between 0 and 1")
+
+            if not isclose(sum(dropout_frac), 1.0, rel_tol=1e-9):
+                raise ValueError("Sum of all floats in the list must be 1.0")
+
+
+        else:
+            raise TypeError("Must be either a float or a list of floats")
+        return dropout_frac
+
+
     @model_validator(mode="after")
     def dropout_instructions_consistent(self) -> "DropoutMixin":
         """Validator for dropout instructions."""
ocf_data_sampler/load/load_dataset.py

@@ -25,7 +25,7 @@ def get_dataset_dict(
         zarr_path=input_config.gsp.zarr_path,
         boundaries_version=input_config.gsp.boundaries_version,
         public=input_config.gsp.public,
-    ).compute()
+    )
 
     if gsp_ids is None:
         # Remove national (gsp_id=0)
ocf_data_sampler/load/nwp/providers/cloudcasting.py

@@ -28,7 +28,7 @@ def open_cloudcasting(zarr_path: str | list[str]) -> xr.DataArray:
     [3] https://github.com/openclimatefix/sat_pred
     """
     # Open the data
-    ds = open_zarr_paths(zarr_path)
+    ds = open_zarr_paths(zarr_path, backend="tensorstore")
 
     # Rename
     ds = ds.rename(
ocf_data_sampler/load/nwp/providers/ecmwf.py

@@ -19,7 +19,7 @@ def open_ifs(zarr_path: str | list[str]) -> xr.DataArray:
     Returns:
         Xarray DataArray of the NWP data
     """
-    ds = open_zarr_paths(zarr_path)
+    ds = open_zarr_paths(zarr_path, backend="tensorstore")
 
     # LEGACY SUPPORT - rename variable to channel if it exists
     ds = ds.rename({"init_time": "init_time_utc", "variable": "channel"})
ocf_data_sampler/load/nwp/providers/gfs.py

@@ -23,7 +23,12 @@ def open_gfs(zarr_path: str | list[str], public: bool = False) -> xr.DataArray:
     _log.info("Loading NWP GFS data")
 
     # Open data
-    gfs: xr.Dataset = open_zarr_paths(zarr_path, time_dim="init_time_utc", public=public)
+    gfs: xr.Dataset = open_zarr_paths(
+        zarr_path,
+        time_dim="init_time_utc",
+        public=public,
+        backend="dask",
+    )
     nwp: xr.DataArray = gfs.to_array(dim="channel")
 
     del gfs
ocf_data_sampler/load/nwp/providers/icon.py

@@ -20,7 +20,7 @@ def open_icon_eu(zarr_path: str | list[str]) -> xr.DataArray:
         Xarray DataArray of the NWP data
     """
     # Open and check initially
-    ds = open_zarr_paths(zarr_path, time_dim="init_time_utc")
+    ds = open_zarr_paths(zarr_path, time_dim="init_time_utc", backend="dask")
 
     if "icon_eu_data" in ds.data_vars:
         nwp = ds["icon_eu_data"]
ocf_data_sampler/load/nwp/providers/ukv.py

@@ -19,7 +19,7 @@ def open_ukv(zarr_path: str | list[str]) -> xr.DataArray:
     Returns:
         Xarray DataArray of the NWP data
     """
-    ds = open_zarr_paths(zarr_path)
+    ds = open_zarr_paths(zarr_path, backend="tensorstore")
 
     ds = ds.rename(
         {
ocf_data_sampler-0.3.1/ocf_data_sampler/load/nwp/providers/utils.py (new file)

@@ -0,0 +1,83 @@
+"""Utility functions for the NWP data processing."""
+
+from glob import glob
+
+import xarray as xr
+from xarray_tensorstore import open_zarr
+
+from ocf_data_sampler.load.open_tensorstore_zarrs import open_zarrs
+
+
+def open_zarr_paths(
+    zarr_path: str | list[str],
+    time_dim: str = "init_time",
+    public: bool = False,
+    backend: str = "dask",
+) -> xr.Dataset:
+    """Opens the NWP data.
+
+    Args:
+        zarr_path: Path to the zarr(s) to open
+        time_dim: Name of the time dimension
+        public: Whether the data is public or private. Only available for the dask backend.
+        backend: The xarray backend to use.
+
+    Returns:
+        The opened Xarray Dataset
+    """
+    if backend not in ["dask", "tensorstore"]:
+        raise ValueError(
+            f"Unsupported backend: {backend}. Supported backends are 'dask' and 'tensorstore'.",
+        )
+
+    if public and backend == "tensorstore":
+        raise ValueError("Public data is only supported with the 'dask' backend.")
+
+    if backend == "tensorstore":
+        ds = _tensostore_open_zarr_paths(zarr_path, time_dim)
+
+    elif backend == "dask":
+        ds = _dask_open_zarr_paths(zarr_path, time_dim, public)
+
+    return ds
+
+
+def _dask_open_zarr_paths(zarr_path: str | list[str], time_dim: str, public: bool) -> xr.Dataset:
+    general_kwargs = {
+        "engine": "zarr",
+        "chunks": "auto",
+        "decode_timedelta": True,
+    }
+
+    if public:
+        # note this only works for s3 zarr paths at the moment
+        general_kwargs["storage_options"] = {"anon": True}
+
+    if isinstance(zarr_path, list | tuple) or "*" in str(zarr_path):  # Multi-file dataset
+        ds = xr.open_mfdataset(
+            zarr_path,
+            concat_dim=time_dim,
+            combine="nested",
+            **general_kwargs,
+        ).sortby(time_dim)
+    else:
+        ds = xr.open_dataset(
+            zarr_path,
+            consolidated=True,
+            mode="r",
+            **general_kwargs,
+        )
+    return ds
+
+
+def _tensostore_open_zarr_paths(zarr_path: str | list[str], time_dim: str) -> xr.Dataset:
+
+    if "*" in str(zarr_path):
+        zarr_path = sorted(glob(zarr_path))
+
+    if isinstance(zarr_path, list | tuple):
+        ds = open_zarrs(zarr_path, concat_dim=time_dim).sortby(time_dim)
+    else:
+        ds = open_zarr(zarr_path)
+    return ds
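To illustrate the new backend switch, a hypothetical usage sketch of open_zarr_paths; the paths are made up, and only the keyword arguments come from the code above:

from ocf_data_sampler.load.nwp.providers.utils import open_zarr_paths

# TensorStore-backed open of a single zarr store.
ds = open_zarr_paths("data/ukv.zarr", time_dim="init_time", backend="tensorstore")

# Dask-backed open of a public multi-file dataset; public=True adds
# anonymous-access storage options and is only allowed with this backend.
ds = open_zarr_paths(
    ["s3://bucket/gfs_2023.zarr", "s3://bucket/gfs_2024.zarr"],
    time_dim="init_time_utc",
    public=True,
    backend="dask",
)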
ocf_data_sampler-0.3.1/ocf_data_sampler/load/open_tensorstore_zarrs.py (new file)

@@ -0,0 +1,92 @@
+"""Open multiple zarrs with TensorStore.
+
+This extendds the functionality of xarray_tensorstore to open multiple zarr stores
+"""
+
+import os
+
+import tensorstore as ts
+import xarray as xr
+from xarray_tensorstore import (
+    _raise_if_mask_and_scale_used_for_data_vars,
+    _TensorStoreAdapter,
+    _zarr_spec_from_path,
+)
+
+
+def tensorstore_open_multi_zarrs(
+    paths: list[str],
+    data_vars: list[str],
+    concat_axes: list[int],
+    context: ts.Context,
+    write: bool,
+) -> dict[str, ts.TensorStore]:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        data_vars: List of data variable names to open.
+        concat_axes: List of axes along which to concatenate the data variables.
+        context: TensorStore context.
+        write: Whether to open the stores for writing.
+    """
+    arrays_list = []
+    for path in paths:
+        specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in data_vars}
+        array_futures = {
+            k: ts.open(spec, read=True, write=write, context=context)
+            for k, spec in specs.items()
+        }
+        arrays_list.append({k: v.result() for k, v in array_futures.items()})
+
+    arrays = {}
+    for k, axis in zip(data_vars, concat_axes, strict=False):
+        datasets = [d[k] for d in arrays_list]
+        arrays[k] = ts.concat(datasets, axis=axis)
+
+    return arrays
+
+
+def open_zarrs(
+    paths: list[str],
+    concat_dim: str,
+    *,
+    context: ts.Context | None = None,
+    mask_and_scale: bool = True,
+    write: bool = False,
+) -> xr.Dataset:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        concat_dim: Dimension along which to concatenate the data variables.
+        context: TensorStore context.
+        mask_and_scale: Whether to mask and scale the data.
+        write: Whether to open the stores for writing.
+    """
+    if context is None:
+        context = ts.Context()
+
+    ds = xr.open_mfdataset(
+        paths,
+        concat_dim=concat_dim,
+        combine="nested",
+        mask_and_scale=mask_and_scale,
+        decode_timedelta=True,
+    )
+
+    if mask_and_scale:
+        # Data variables get replaced below with _TensorStoreAdapter arrays, which
+        # don't get masked or scaled. Raising an error avoids surprising users with
+        # incorrect data values.
+        _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+    data_vars = list(ds.data_vars)
+
+    concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
+
+    arrays = tensorstore_open_multi_zarrs(paths, data_vars, concat_axes, context, write)
+
+    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+    return ds.copy(data=new_data)
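A hypothetical usage sketch of open_zarrs (store paths are illustrative): xarray first builds the dataset skeleton with open_mfdataset, then each data variable is re-opened with TensorStore and concatenated along the chosen dimension:

from ocf_data_sampler.load.open_tensorstore_zarrs import open_zarrs

# Concatenate two stores along "time". With the default mask_and_scale=True,
# this raises if any variable relies on CF mask/scale encoding, because the
# TensorStore-backed arrays bypass that decoding step.
ds = open_zarrs(["sat_jan.zarr", "sat_feb.zarr"], concat_dim="time")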
ocf_data_sampler/load/satellite.py

@@ -1,6 +1,7 @@
 """Satellite loader."""
 import numpy as np
 import xarray as xr
+from xarray_tensorstore import open_zarr
 
 from ocf_data_sampler.load.utils import (
     check_time_unique_increasing,
@@ -8,39 +9,7 @@ from ocf_data_sampler.load.utils import (
     make_spatial_coords_increasing,
 )
 
-
-def get_single_sat_data(zarr_path: str) -> xr.Dataset:
-    """Helper function to open a zarr from either a local or GCP path.
-
-    Args:
-        zarr_path: path to a zarr file. Wildcards (*) are supported only for local paths
-            GCS paths (gs://) do not support wildcards
-
-    Returns:
-        An xarray Dataset containing satellite data
-
-    Raises:
-        ValueError: If a wildcard (*) is used in a GCS (gs://) path
-    """
-    # Raise an error if a wildcard is used in a GCP path
-    if "gs://" in str(zarr_path) and "*" in str(zarr_path):
-        raise ValueError("Wildcard (*) paths are not supported for GCP (gs://) URLs")
-
-    # Handle multi-file dataset for local paths
-    if "*" in str(zarr_path):
-        ds = xr.open_mfdataset(
-            zarr_path,
-            engine="zarr",
-            concat_dim="time",
-            combine="nested",
-            chunks="auto",
-            join="override",
-        )
-        check_time_unique_increasing(ds.time)
-    else:
-        ds = xr.open_dataset(zarr_path, engine="zarr", chunks="auto")
-
-    return ds
+from .open_tensorstore_zarrs import open_zarrs
 
 
 def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray:
@@ -52,14 +21,11 @@ def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray:
     """
     # Open the data
     if isinstance(zarr_path, list | tuple):
-        ds = xr.combine_nested(
-            [get_single_sat_data(path) for path in zarr_path],
-            concat_dim="time",
-            combine_attrs="override",
-            join="override",
-        )
+        ds = open_zarrs(zarr_path, concat_dim="time")
     else:
-        ds = get_single_sat_data(zarr_path)
+        ds = open_zarr(zarr_path)
+
+    check_time_unique_increasing(ds.time)
 
     ds = ds.rename(
         {
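The satellite loader now follows the same split as the NWP utilities, shown here as a sketch with hypothetical paths:

from ocf_data_sampler.load.satellite import open_sat_data

# A single path opens via xarray_tensorstore.open_zarr; a list of paths is
# concatenated along "time" via open_zarrs. Note the rewritten loader no
# longer expands local wildcard paths the way get_single_sat_data did.
da = open_sat_data(["sat_jan.zarr", "sat_feb.zarr"])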
ocf_data_sampler/load/utils.py

@@ -47,7 +47,7 @@ def get_xr_data_array_from_xr_dataset(ds: xr.Dataset) -> xr.DataArray:
     Args:
        ds: xr.Dataset to extract xr.DataArray from
    """
-    datavars = list(ds.var())
+    datavars = list(ds.data_vars)
    if len(datavars) != 1:
        raise ValueError("Cannot open as xr.DataArray: dataset contains multiple variables")
    return ds[datavars[0]]
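This one-line fix matters because Dataset.var() is xarray's variance reduction; iterating its result does yield the variable names, but only after computing statistics for every variable. A small self-contained illustration:

import numpy as np
import xarray as xr

ds = xr.Dataset({"t2m": ("time", np.array([280.0, 281.5]))})

list(ds.var())       # ["t2m"], but computes the variance of each variable first
list(ds.data_vars)   # ["t2m"], with no computation at all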
ocf_data_sampler-0.3.1/ocf_data_sampler/select/dropout.py (new file)

@@ -0,0 +1,61 @@
+"""Functions for simulating dropout in time series data.
+
+This is used for the following types of data: GSP, Satellite and Site
+This is not used for NWP
+"""
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+def apply_sampled_dropout_time(
+    t0: pd.Timestamp,
+    dropout_timedeltas: list[pd.Timedelta],
+    dropout_frac: float|list[float],
+    da: xr.DataArray,
+) -> xr.DataArray:
+    """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
+
+    Args:
+        t0: The forecast init-time
+        dropout_timedeltas: List of timedeltas relative to t0 to pick from
+        dropout_frac: Either a probability that dropout will be applied.
+            This should be between 0 and 1 inclusive.
+            Or a list of probabilities for each of the corresponding timedeltas
+        da: Xarray DataArray with 'time_utc' coordinate
+    """
+    if isinstance(dropout_frac, list):
+        # checking if len match
+        if len(dropout_frac) != len(dropout_timedeltas):
+            raise ValueError("Lengths of dropout_frac and dropout_timedeltas should match")
+
+
+
+
+        dropout_time = t0 + np.random.choice(dropout_timedeltas,p=dropout_frac)
+
+        return da.where(da.time_utc <= dropout_time)
+
+
+
+    # old logic
+    else:
+        # sample dropout time
+        if dropout_frac > 0 and len(dropout_timedeltas) == 0:
+            raise ValueError("To apply dropout, dropout_timedeltas must be provided")
+
+
+        if not (0 <= dropout_frac <= 1):
+            raise ValueError("dropout_frac must be between 0 and 1 inclusive")
+
+        if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
+            dropout_time = None
+        else:
+            dropout_time = t0 + np.random.choice(dropout_timedeltas)
+
+        # apply dropout time
+        if dropout_time is None:
+            return da
+        # This replaces the times after the dropout with NaNs
+        return da.where(da.time_utc <= dropout_time)
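A short sketch of the new list-form behaviour; all timestamps and probabilities below are illustrative:

import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.select.dropout import apply_sampled_dropout_time

times = pd.date_range("2024-06-01 00:00", periods=6, freq="30min")
da = xr.DataArray(np.arange(6.0), coords={"time_utc": times}, dims="time_utc")

# List form: dropout is always applied; the cutoff is t0 - 30min with p=0.7
# or t0 - 60min with p=0.3. Values after the sampled time become NaN.
da_dropped = apply_sampled_dropout_time(
    t0=times[-1],
    dropout_timedeltas=[pd.Timedelta("-30min"), pd.Timedelta("-60min")],
    dropout_frac=[0.7, 0.3],
    da=da,
)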
ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py

@@ -270,6 +270,8 @@ class PVNetUKRegionalDataset(AbstractPVNetUKDataset):
     def __getitem__(self, idx: int) -> NumpySample:
         # Get the coordinates of the sample
 
+        idx = int(idx)
+
         if idx >= len(self):
             raise ValueError(f"Index {idx} out of range for dataset of length {len(self)}")
 
ocf_data_sampler.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.2.38
+Version: 0.3.1
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
@@ -44,6 +44,7 @@ Requires-Dist: pyproj
 Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
+Requires-Dist: xarray-tensorstore==0.1.5
 
 # ocf-data-sampler
 
ocf_data_sampler.egg-info/SOURCES.txt

@@ -17,6 +17,7 @@ ocf_data_sampler/data/uk_gsp_locations_20250109.csv
 ocf_data_sampler/load/__init__.py
 ocf_data_sampler/load/gsp.py
 ocf_data_sampler/load/load_dataset.py
+ocf_data_sampler/load/open_tensorstore_zarrs.py
 ocf_data_sampler/load/satellite.py
 ocf_data_sampler/load/site.py
 ocf_data_sampler/load/utils.py
ocf_data_sampler.egg-info/requires.txt

@@ -12,3 +12,4 @@ pyproj
 pyaml_env
 pyresample
 h5netcdf
+xarray-tensorstore==0.1.5
pyproject.toml

@@ -35,6 +35,7 @@ dependencies = [
     "pyaml_env",
     "pyresample",
     "h5netcdf",
+    "xarray-tensorstore==0.1.5",
 ]
 
 [dependency-groups]
ocf_data_sampler-0.2.38/ocf_data_sampler/load/nwp/providers/utils.py (deleted)

@@ -1,43 +0,0 @@
-"""Utility functions for the NWP data processing."""
-
-import xarray as xr
-
-
-def open_zarr_paths(
-    zarr_path: str | list[str], time_dim: str = "init_time", public: bool = False,
-) -> xr.Dataset:
-    """Opens the NWP data.
-
-    Args:
-        zarr_path: Path to the zarr(s) to open
-        time_dim: Name of the time dimension
-        public: Whether the data is public or private
-
-    Returns:
-        The opened Xarray Dataset
-    """
-    general_kwargs = {
-        "engine": "zarr",
-        "chunks": "auto",
-        "decode_timedelta": True,
-    }
-
-    if public:
-        # note this only works for s3 zarr paths at the moment
-        general_kwargs["storage_options"] = {"anon": True}
-
-    if type(zarr_path) in [list, tuple] or "*" in str(zarr_path):  # Multi-file dataset
-        ds = xr.open_mfdataset(
-            zarr_path,
-            concat_dim=time_dim,
-            combine="nested",
-            **general_kwargs,
-        ).sortby(time_dim)
-    else:
-        ds = xr.open_dataset(
-            zarr_path,
-            consolidated=True,
-            mode="r",
-            **general_kwargs,
-        )
-    return ds
ocf_data_sampler-0.2.38/ocf_data_sampler/select/dropout.py (deleted)

@@ -1,47 +0,0 @@
-"""Functions for simulating dropout in time series data.
-
-This is used for the following types of data: GSP, Satellite and Site
-This is not used for NWP
-"""
-
-import numpy as np
-import pandas as pd
-import xarray as xr
-
-
-def apply_sampled_dropout_time(
-    t0: pd.Timestamp,
-    dropout_timedeltas: list[pd.Timedelta],
-    dropout_frac: float,
-    da: xr.DataArray,
-) -> xr.DataArray:
-    """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
-
-    Args:
-        t0: The forecast init-time
-        dropout_timedeltas: List of timedeltas relative to t0 to pick from
-        dropout_frac: Probability that dropout will be applied.
-            This should be between 0 and 1 inclusive
-        da: Xarray DataArray with 'time_utc' coordinate
-    """
-    # sample dropout time
-    if dropout_frac > 0 and len(dropout_timedeltas) == 0:
-        raise ValueError("To apply dropout, dropout_timedeltas must be provided")
-
-    for t in dropout_timedeltas:
-        if t > pd.Timedelta("0min"):
-            raise ValueError("Dropout timedeltas must be negative")
-
-    if not (0 <= dropout_frac <= 1):
-        raise ValueError("dropout_frac must be between 0 and 1 inclusive")
-
-    if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
-        dropout_time = None
-    else:
-        dropout_time = t0 + np.random.choice(dropout_timedeltas)
-
-    # apply dropout time
-    if dropout_time is None:
-        return da
-    # This replaces the times after the dropout with NaNs
-    return da.where(da.time_utc <= dropout_time)