PyPI - ocf-data-sampler - Versions diffs - 0.5.13__tar.gz → 0.5.15__tar.gz - Mend

ocf-data-sampler 0.5.13.tar.gz → 0.5.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (71) hide show

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.13
+Version: 0.5.15
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/ocf_data_sampler/config/model.py RENAMED Viewed

@@ -90,7 +90,7 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
-    dropout_fraction: float|list[float] = Field(
+    dropout_fraction: float | list[float] = Field(
         default=0,
         description="Either a float(Chance of dropout being applied to each sample) or a list of "
         "floats (probability that dropout of the corresponding timedelta is applied)",
@@ -106,31 +106,22 @@ class DropoutMixin(Base):
     @field_validator("dropout_fraction")
-    def dropout_fractions(cls, dropout_frac: float|list[float]) -> float|list[float]:
+    def dropout_fractions(cls, dropout_frac: float | list[float]) -> float | list[float]:
         """Validate 'dropout_frac'."""
-        from math import isclose
-        if isinstance(dropout_frac, float):
-            if not (dropout_frac <= 1):
-                raise ValueError("Input should be less than or equal to 1")
-            elif not (dropout_frac >= 0):
-                raise ValueError("Input should be greater than or equal to 0")
+        if isinstance(dropout_frac, float | int):
+            if not (0<= dropout_frac <= 1):
+                raise ValueError("Dropout fractions must be in range [0, 1]")
         elif isinstance(dropout_frac, list):
             if not dropout_frac:
                 raise ValueError("List cannot be empty")
-            if not all(isinstance(i, float) for i in dropout_frac):
-                raise ValueError("All elements in the list must be floats")
             if not all(0 <= i <= 1 for i in dropout_frac):
-                raise ValueError("Each float in the list must be between 0 and 1")
-            if not isclose(sum(dropout_frac), 1.0, rel_tol=1e-9):
-                raise ValueError("Sum of all floats in the list must be 1.0")
+                raise ValueError("All dropout fractions must be in range [0, 1]")
+            if not (0 <= sum(dropout_frac) <= 1):
+                raise ValueError("The sum of dropout fractions must be in range [0, 1]")
-        else:
-            raise TypeError("Must be either a float or a list of floats")
         return dropout_frac
@@ -172,23 +163,6 @@ class NormalisationConstantsMixin(Base):
     """Normalisation constants for multiple channels."""
     normalisation_constants: dict[str, NormalisationValues]
-    @property
-    def channel_means(self) -> dict[str, float]:
-        """Return the channel means."""
-        return {
-            channel: norm_values.mean
-            for channel, norm_values in self.normalisation_constants.items()
-        }
-    @property
-    def channel_stds(self) -> dict[str, float]:
-        """Return the channel standard deviations."""
-        return {
-            channel: norm_values.std
-            for channel, norm_values in self.normalisation_constants.items()
-        }
 class Satellite(TimeWindowMixin, DropoutMixin, SpatialWindowMixin, NormalisationConstantsMixin):
     """Satellite configuration model."""

ocf_data_sampler-0.5.15/ocf_data_sampler/select/dropout.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Functions for simulating dropout in time series data.
+This is used for the following types of data: GSP, Satellite and Site
+This is not used for NWP
+"""
+import numpy as np
+import pandas as pd
+import xarray as xr
+def apply_history_dropout(
+    t0: pd.Timestamp,
+    dropout_timedeltas: list[pd.Timedelta],
+    dropout_frac: float | list[float],
+    da: xr.DataArray,
+) -> xr.DataArray:
+    """Apply randomly sampled dropout to the historical part of some sequence data.
+    Dropped out data is replaced with NaNs
+    Args:
+        t0: The forecast init-time.
+        dropout_timedeltas: List of timedeltas relative to t0 to pick from
+        dropout_frac: The probabilit(ies) that each dropout timedelta will be applied. This should
+            be between 0 and 1 inclusive.
+        da: Xarray DataArray with 'time_utc' coordinate
+    """
+    if len(dropout_timedeltas)==0:
+        return da
+    if isinstance(dropout_frac, float | int):
+        if not (0<=dropout_frac<=1):
+            raise ValueError("`dropout_frac` must be in range [0, 1]")
+        # Create list with equal chance for all dropout timedeltas
+        n = len(dropout_timedeltas)
+        dropout_frac = [dropout_frac/n for _ in range(n)]
+    else:
+        if not 0<=sum(dropout_frac)<=1:
+            raise ValueError("The sum of `dropout_frac` must be in range [0, 1]")
+        if len(dropout_timedeltas)!=len(dropout_frac):
+            raise ValueError("`dropout_timedeltas` and `dropout_frac` must have the same length")
+        dropout_frac = [*dropout_frac] # Make copy of the list so we can append to it
+    dropout_timedeltas = [*dropout_timedeltas] # Make copy of the list so we can append to it
+    # Add chance of no dropout
+    dropout_frac.append(1-sum(dropout_frac))
+    dropout_timedeltas.append(None)
+    timedelta_choice = np.random.choice(dropout_timedeltas, p=dropout_frac)
+    if timedelta_choice is None:
+        return da
+    else:
+        return da.where((da.time_utc <= timedelta_choice + t0) | (da.time_utc> t0))

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py RENAMED Viewed

@@ -137,10 +137,10 @@ class AbstractPVNetUKDataset(Dataset):
             for nwp_key, da_nwp in dataset_dict["nwp"].items():
                 # Standardise and convert to NumpyBatch
-                da_channel_means = self.means_dict["nwp"][nwp_key]
-                da_channel_stds = self.stds_dict["nwp"][nwp_key]
+                channel_means = self.means_dict["nwp"][nwp_key]
+                channel_stds = self.stds_dict["nwp"][nwp_key]
-                da_nwp = (da_nwp - da_channel_means) / da_channel_stds
+                da_nwp = (da_nwp - channel_means) / channel_stds
                 nwp_numpy_modalities[nwp_key] = convert_nwp_to_numpy_sample(da_nwp)
@@ -151,17 +151,17 @@ class AbstractPVNetUKDataset(Dataset):
             da_sat = dataset_dict["sat"]
             # Standardise and convert to NumpyBatch
-            da_channel_means = self.means_dict["sat"]
-            da_channel_stds = self.stds_dict["sat"]
+            channel_means = self.means_dict["sat"]
+            channel_stds = self.stds_dict["sat"]
-            da_sat = (da_sat - da_channel_means) / da_channel_stds
+            da_sat = (da_sat - channel_means) / channel_stds
             numpy_modalities.append(convert_satellite_to_numpy_sample(da_sat))
         if "gsp" in dataset_dict:
             gsp_config = self.config.input_data.gsp
             da_gsp = dataset_dict["gsp"]
-            da_gsp = da_gsp / da_gsp.effective_capacity_mwp
+            da_gsp = da_gsp / da_gsp.effective_capacity_mwp.values
             # Convert to NumpyBatch
             numpy_modalities.append(

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/datasets/site.py RENAMED Viewed

@@ -82,10 +82,10 @@ def process_and_combine_datasets(
             # Standardise and convert to NumpyBatch
-            da_channel_means = means_dict["nwp"][nwp_key]
-            da_channel_stds = stds_dict["nwp"][nwp_key]
+            channel_means = means_dict["nwp"][nwp_key]
+            channel_stds = stds_dict["nwp"][nwp_key]
-            da_nwp = (da_nwp - da_channel_means) / da_channel_stds
+            da_nwp = (da_nwp - channel_means) / channel_stds
             nwp_numpy_modalities[nwp_key] = convert_nwp_to_numpy_sample(da_nwp)
@@ -96,16 +96,16 @@ def process_and_combine_datasets(
         da_sat = dataset_dict["sat"]
         # Standardise and convert to NumpyBatch
-        da_channel_means = means_dict["sat"]
-        da_channel_stds = stds_dict["sat"]
+        channel_means = means_dict["sat"]
+        channel_stds = stds_dict["sat"]
-        da_sat = (da_sat - da_channel_means) / da_channel_stds
+        da_sat = (da_sat - channel_means) / channel_stds
         numpy_modalities.append(convert_satellite_to_numpy_sample(da_sat))
     if "site" in dataset_dict:
         da_sites = dataset_dict["site"]
-        da_sites = da_sites / da_sites.capacity_kwp
+        da_sites = da_sites / da_sites.capacity_kwp.values
         # Convert to NumpyBatch
         numpy_modalities.append(convert_site_to_numpy_sample(da_sites))

ocf_data_sampler-0.5.15/ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Utility function for converting normalisation constants in the config to arrays."""
+import numpy as np
+from ocf_data_sampler.config import Configuration
+def config_normalization_values_to_dicts(
+    config: Configuration,
+) -> tuple[dict[str, np.ndarray | dict[str, np.ndarray]]]:
+    """Construct numpy arrays of mean and std values from the config normalisation constants.
+    Args:
+        config: Data configuration.
+    Returns:
+        Means dict
+        Stds dict
+    """
+    means_dict = {}
+    stds_dict = {}
+    if config.input_data.nwp is not None:
+        means_dict["nwp"] = {}
+        stds_dict["nwp"] = {}
+        for nwp_key in config.input_data.nwp:
+            nwp_config = config.input_data.nwp[nwp_key]
+            means_list = []
+            stds_list = []
+            for channel in list(nwp_config.channels):
+                # These accumulated channels are diffed and renamed
+                if channel in nwp_config.accum_channels:
+                    channel =f"diff_{channel}"
+                means_list.append(nwp_config.normalisation_constants[channel].mean)
+                stds_list.append(nwp_config.normalisation_constants[channel].std)
+            means_dict["nwp"][nwp_key] = np.array(means_list)[None, :, None, None]
+            stds_dict["nwp"][nwp_key] = np.array(stds_list)[None, :, None, None]
+    if config.input_data.satellite is not None:
+        sat_config = config.input_data.satellite
+        means_list = []
+        stds_list = []
+        for channel in list(config.input_data.satellite.channels):
+            means_list.append(sat_config.normalisation_constants[channel].mean)
+            stds_list.append(sat_config.normalisation_constants[channel].std)
+        # Convert to array and expand dimensions so we can normalise the 4D sat and NWP sources
+        means_dict["sat"] = np.array(means_list)[None, :, None, None]
+        stds_dict["sat"] = np.array(stds_list)[None, :, None, None]
+    return means_dict, stds_dict

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py RENAMED Viewed

@@ -1,10 +1,9 @@
 """Slice datasets by time."""
 import pandas as pd
-import xarray as xr
 from ocf_data_sampler.config import Configuration
-from ocf_data_sampler.select.dropout import apply_sampled_dropout_time
+from ocf_data_sampler.select.dropout import apply_history_dropout
 from ocf_data_sampler.select.select_time_slice import select_time_slice, select_time_slice_nwp
 from ocf_data_sampler.utils import minutes
@@ -52,7 +51,7 @@ def slice_datasets_by_time(
         )
         # Apply the randomly sampled dropout
-        sliced_datasets_dict["sat"] = apply_sampled_dropout_time(
+        sliced_datasets_dict["sat"] = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(sat_config.dropout_timedeltas_minutes),
             dropout_frac=sat_config.dropout_fraction,
@@ -62,59 +61,44 @@ def slice_datasets_by_time(
     if "gsp" in datasets_dict:
         gsp_config = config.input_data.gsp
-        da_gsp_past = select_time_slice(
+        da_gsp = select_time_slice(
             datasets_dict["gsp"],
             t0,
             time_resolution=minutes(gsp_config.time_resolution_minutes),
             interval_start=minutes(gsp_config.interval_start_minutes),
-            interval_end=minutes(0),
+            interval_end=minutes(gsp_config.interval_end_minutes),
         )
         # Dropout on the past GSP, but not the future GSP
-        da_gsp_past = apply_sampled_dropout_time(
+        da_gsp = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(gsp_config.dropout_timedeltas_minutes),
             dropout_frac=gsp_config.dropout_fraction,
-            da=da_gsp_past,
-        )
-        da_gsp_future = select_time_slice(
-            datasets_dict["gsp"],
-            t0,
-            time_resolution=minutes(gsp_config.time_resolution_minutes),
-            interval_start=minutes(gsp_config.time_resolution_minutes),
-            interval_end=minutes(gsp_config.interval_end_minutes),
+            da=da_gsp,
         )
-        sliced_datasets_dict["gsp"] = xr.concat([da_gsp_past, da_gsp_future], dim="time_utc")
+        sliced_datasets_dict["gsp"] = da_gsp
     if "site" in datasets_dict:
         site_config = config.input_data.site
-        da_site_past = select_time_slice(
+        da_site = select_time_slice(
             datasets_dict["site"],
             t0,
             time_resolution=minutes(site_config.time_resolution_minutes),
             interval_start=minutes(site_config.interval_start_minutes),
-            interval_end=minutes(0),
+            interval_end=minutes(site_config.interval_end_minutes),
         )
         # Apply the randomly sampled dropout on the past site not the future
-        da_site_past = apply_sampled_dropout_time(
+        da_site = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(site_config.dropout_timedeltas_minutes),
             dropout_frac=site_config.dropout_fraction,
-            da=da_site_past,
+            da=da_site,
         )
-        da_site_future = select_time_slice(
-            datasets_dict["site"],
-            t0,
-            time_resolution=minutes(site_config.time_resolution_minutes),
-            interval_start=minutes(site_config.time_resolution_minutes),
-            interval_end=minutes(site_config.interval_end_minutes),
-        )
+        sliced_datasets_dict["site"] = da_site
-        sliced_datasets_dict["site"] = xr.concat([da_site_past, da_site_future], dim="time_utc")
     return sliced_datasets_dict

{ocf_data_sampler-0.5.13 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.13
+Version: 0.5.15
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License

ocf_data_sampler-0.5.13/ocf_data_sampler/select/dropout.py DELETED Viewed

@@ -1,61 +0,0 @@
-"""Functions for simulating dropout in time series data.
-This is used for the following types of data: GSP, Satellite and Site
-This is not used for NWP
-"""
-import numpy as np
-import pandas as pd
-import xarray as xr
-def apply_sampled_dropout_time(
-    t0: pd.Timestamp,
-    dropout_timedeltas: list[pd.Timedelta],
-    dropout_frac: float|list[float],
-    da: xr.DataArray,
-) -> xr.DataArray:
-    """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
-    Args:
-        t0: The forecast init-time
-        dropout_timedeltas: List of timedeltas relative to t0 to pick from
-        dropout_frac: Either a probability that dropout will be applied.
-            This should be between 0 and 1 inclusive.
-            Or a list of probabilities for each of the corresponding timedeltas
-        da: Xarray DataArray with 'time_utc' coordinate
-    """
-    if  isinstance(dropout_frac, list):
-        # checking if len match
-        if len(dropout_frac) != len(dropout_timedeltas):
-            raise ValueError("Lengths of dropout_frac and dropout_timedeltas should match")
-        dropout_time = t0 + np.random.choice(dropout_timedeltas,p=dropout_frac)
-        return da.where(da.time_utc <= dropout_time)
-    # old logic
-    else:
-        # sample dropout time
-        if dropout_frac > 0 and len(dropout_timedeltas) == 0:
-            raise ValueError("To apply dropout, dropout_timedeltas must be provided")
-        if not (0 <= dropout_frac <= 1):
-            raise ValueError("dropout_frac must be between 0 and 1 inclusive")
-        if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
-            dropout_time = None
-        else:
-            dropout_time = t0 + np.random.choice(dropout_timedeltas)
-        # apply dropout time
-        if dropout_time is None:
-            return da
-        # This replaces the times after the dropout with NaNs
-        return da.where(da.time_utc <= dropout_time)

ocf_data_sampler-0.5.13/ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py DELETED Viewed

@@ -1,57 +0,0 @@
-"""Utility function for converting channel dictionaries to xarray DataArrays."""
-import xarray as xr
-from ocf_data_sampler.config import Configuration
-def channel_dict_to_dataarray(channel_dict: dict[str, float]) -> xr.DataArray:
-    """Converts a dictionary of channel values to a DataArray.
-    Args:
-        channel_dict: Dictionary mapping channel names (str) to their values (float).
-    Returns:
-        xr.DataArray: A 1D DataArray with channels as coordinates.
-    """
-    return xr.DataArray(
-        list(channel_dict.values()),
-        coords={"channel": list(channel_dict.keys())},
-    )
-def config_normalization_values_to_dicts(
-    config: Configuration,
-) -> tuple[dict[str, xr.DataArray | dict[str, xr.DataArray]]]:
-    """Construct DataArrays of mean and std values from the config normalisation constants.
-    Args:
-        config: Data configuration.
-    Returns:
-        Means dict
-        Stds dict
-    """
-    means_dict = {}
-    stds_dict = {}
-    if config.input_data.nwp is not None:
-        means_dict["nwp"] = {}
-        stds_dict["nwp"] = {}
-        for nwp_key in config.input_data.nwp:
-            # Standardise and convert to NumpyBatch
-            means_dict["nwp"][nwp_key] = channel_dict_to_dataarray(
-                config.input_data.nwp[nwp_key].channel_means,
-            )
-            stds_dict["nwp"][nwp_key] = channel_dict_to_dataarray(
-                config.input_data.nwp[nwp_key].channel_stds,
-            )
-    if config.input_data.satellite is not None:
-        means_dict["sat"] = channel_dict_to_dataarray(config.input_data.satellite.channel_means)
-        stds_dict["sat"] = channel_dict_to_dataarray(config.input_data.satellite.channel_stds)
-    return means_dict, stds_dict