ocf-data-sampler 0.1.11__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ocf-data-sampler may be problematic.
- ocf_data_sampler/config/load.py +3 -3
- ocf_data_sampler/config/model.py +73 -61
- ocf_data_sampler/config/save.py +5 -4
- ocf_data_sampler/constants.py +140 -12
- ocf_data_sampler/load/gsp.py +6 -5
- ocf_data_sampler/load/load_dataset.py +5 -6
- ocf_data_sampler/load/nwp/nwp.py +17 -5
- ocf_data_sampler/load/nwp/providers/ecmwf.py +6 -7
- ocf_data_sampler/load/nwp/providers/gfs.py +36 -0
- ocf_data_sampler/load/nwp/providers/icon.py +46 -0
- ocf_data_sampler/load/nwp/providers/ukv.py +4 -5
- ocf_data_sampler/load/nwp/providers/utils.py +3 -1
- ocf_data_sampler/load/satellite.py +9 -10
- ocf_data_sampler/load/site.py +10 -6
- ocf_data_sampler/load/utils.py +21 -16
- ocf_data_sampler/numpy_sample/collate.py +10 -9
- ocf_data_sampler/numpy_sample/datetime_features.py +3 -5
- ocf_data_sampler/numpy_sample/gsp.py +12 -14
- ocf_data_sampler/numpy_sample/nwp.py +12 -12
- ocf_data_sampler/numpy_sample/satellite.py +9 -9
- ocf_data_sampler/numpy_sample/site.py +5 -8
- ocf_data_sampler/numpy_sample/sun_position.py +16 -21
- ocf_data_sampler/sample/base.py +15 -17
- ocf_data_sampler/sample/site.py +13 -20
- ocf_data_sampler/sample/uk_regional.py +29 -35
- ocf_data_sampler/select/dropout.py +16 -14
- ocf_data_sampler/select/fill_time_periods.py +15 -5
- ocf_data_sampler/select/find_contiguous_time_periods.py +88 -75
- ocf_data_sampler/select/geospatial.py +63 -54
- ocf_data_sampler/select/location.py +16 -51
- ocf_data_sampler/select/select_spatial_slice.py +105 -89
- ocf_data_sampler/select/select_time_slice.py +71 -58
- ocf_data_sampler/select/spatial_slice_for_dataset.py +7 -6
- ocf_data_sampler/select/time_slice_for_dataset.py +17 -16
- ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +126 -118
- ocf_data_sampler/torch_datasets/datasets/site.py +135 -101
- ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +6 -2
- ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +23 -22
- ocf_data_sampler/torch_datasets/utils/validate_channels.py +23 -19
- ocf_data_sampler/utils.py +3 -1
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.16.dist-info}/METADATA +7 -18
- ocf_data_sampler-0.1.16.dist-info/RECORD +56 -0
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.16.dist-info}/WHEEL +1 -1
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.16.dist-info}/top_level.txt +1 -1
- scripts/refactor_site.py +62 -33
- utils/compute_icon_mean_stddev.py +72 -0
- ocf_data_sampler-0.1.11.dist-info/LICENSE +0 -21
- ocf_data_sampler-0.1.11.dist-info/RECORD +0 -82
- tests/__init__.py +0 -0
- tests/config/test_config.py +0 -113
- tests/config/test_load.py +0 -7
- tests/config/test_save.py +0 -28
- tests/conftest.py +0 -319
- tests/load/test_load_gsp.py +0 -15
- tests/load/test_load_nwp.py +0 -21
- tests/load/test_load_satellite.py +0 -17
- tests/load/test_load_sites.py +0 -14
- tests/numpy_sample/test_collate.py +0 -21
- tests/numpy_sample/test_datetime_features.py +0 -37
- tests/numpy_sample/test_gsp.py +0 -38
- tests/numpy_sample/test_nwp.py +0 -13
- tests/numpy_sample/test_satellite.py +0 -40
- tests/numpy_sample/test_sun_position.py +0 -81
- tests/select/test_dropout.py +0 -69
- tests/select/test_fill_time_periods.py +0 -28
- tests/select/test_find_contiguous_time_periods.py +0 -202
- tests/select/test_location.py +0 -67
- tests/select/test_select_spatial_slice.py +0 -154
- tests/select/test_select_time_slice.py +0 -275
- tests/test_sample/test_base.py +0 -164
- tests/test_sample/test_site_sample.py +0 -165
- tests/test_sample/test_uk_regional_sample.py +0 -136
- tests/torch_datasets/test_merge_and_fill_utils.py +0 -40
- tests/torch_datasets/test_pvnet_uk.py +0 -154
- tests/torch_datasets/test_site.py +0 -226
- tests/torch_datasets/test_validate_channels_utils.py +0 -78
ocf_data_sampler/sample/base.py
CHANGED
```diff
@@ -1,11 +1,10 @@
-"""
+"""Base class for handling flat/nested data structures with NWP consideration."""
 
-import numpy as np
-import torch
-
-from typing import TypeAlias
 from abc import ABC, abstractmethod
+from typing import TypeAlias
 
+import numpy as np
+import torch
 
 NumpySample: TypeAlias = dict[str, np.ndarray | dict[str, np.ndarray]]
 NumpyBatch: TypeAlias = dict[str, np.ndarray | dict[str, np.ndarray]]
@@ -13,39 +12,38 @@ TensorBatch: TypeAlias = dict[str, torch.Tensor | dict[str, torch.Tensor]]
 
 
 class SampleBase(ABC):
-    """
-    Abstract base class for all sample types
-    Provides core data storage functionality
-    """
+    """Abstract base class for all sample types."""
 
     @abstractmethod
     def to_numpy(self) -> NumpySample:
-        """Convert sample data to numpy format"""
+        """Convert sample data to numpy format."""
         raise NotImplementedError
 
     @abstractmethod
     def plot(self) -> None:
+        """Create a visualisation of the data."""
         raise NotImplementedError
 
     @abstractmethod
     def save(self, path: str) -> None:
+        """Saves the sample to disk in the implementations' required format."""
         raise NotImplementedError
 
     @classmethod
     @abstractmethod
-    def load(cls, path: str) ->
+    def load(cls, path: str) -> "SampleBase":
+        """Load a sample from disk from the implementations' format."""
        raise NotImplementedError
 
 
 def batch_to_tensor(batch: NumpyBatch) -> TensorBatch:
-    """
-
+    """Recursively converts numpy arrays in nested dict to torch tensors.
+
     Args:
         batch: NumpyBatch with data in numpy arrays
     Returns:
         TensorBatch with data in torch tensors
     """
-
     for k, v in batch.items():
         if isinstance(v, dict):
             batch[k] = batch_to_tensor(v)
@@ -58,12 +56,12 @@ def batch_to_tensor(batch: NumpyBatch) -> TensorBatch:
 
 
 def copy_batch_to_device(batch: TensorBatch, device: torch.device) -> TensorBatch:
-    """Recursively copies tensors in nested dict to specified device
+    """Recursively copies tensors in nested dict to specified device.
 
     Args:
         batch: Nested dict with tensors to move
         device: Device to move tensors to
-
+
     Returns:
         A dict with tensors moved to the new device
     """
@@ -71,7 +69,7 @@ def copy_batch_to_device(batch: TensorBatch, device: torch.device) -> TensorBatch:
 
     for k, v in batch.items():
         if isinstance(v, dict):
-            batch_copy[k] = copy_batch_to_device(v, device)
+            batch_copy[k] = copy_batch_to_device(v, device)
         elif isinstance(v, torch.Tensor):
             batch_copy[k] = v.to(device)
         else:
```
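The `base.py` changes are a tidy-up: docstrings gain summaries and full stops, imports are re-ordered, and `load` gains an explicit `"SampleBase"` return annotation, while the `batch_to_tensor` / `copy_batch_to_device` helpers keep their behaviour. A minimal usage sketch of those two helpers, assuming version 0.1.16; the keys and shapes below are invented for illustration:

```python
import numpy as np
import torch

from ocf_data_sampler.sample.base import batch_to_tensor, copy_batch_to_device

# A toy NumpyBatch: flat entries convert directly, nested dicts convert recursively
batch = {
    "gsp": np.random.rand(4, 12),
    "nwp": {"ukv": np.random.rand(4, 2, 8, 8)},
}

tensor_batch = batch_to_tensor(batch)
tensor_batch = copy_batch_to_device(tensor_batch, torch.device("cpu"))
assert isinstance(tensor_batch["nwp"]["ukv"], torch.Tensor)
```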
ocf_data_sampler/sample/site.py
CHANGED
```diff
@@ -1,44 +1,37 @@
-"""PVNet Site sample implementation for netCDF data handling and conversion"""
+"""PVNet Site sample implementation for netCDF data handling and conversion."""
 
 import xarray as xr
-
 from typing_extensions import override
 
-from ocf_data_sampler.sample.base import
+from ocf_data_sampler.sample.base import NumpySample, SampleBase
 from ocf_data_sampler.torch_datasets.datasets.site import convert_netcdf_to_numpy_sample
 
 
 class SiteSample(SampleBase):
-    """Handles PVNet site specific netCDF operations"""
+    """Handles PVNet site specific netCDF operations."""
 
-    def __init__(self, data: xr.Dataset):
-
+    def __init__(self, data: xr.Dataset) -> None:
+        """Initializes the SiteSample object with the given xarray Dataset."""
         if not isinstance(data, xr.Dataset):
             raise TypeError(f"Data must be xarray Dataset - Found type {type(data)}")
-
         self._data = data
 
     @override
-    def to_numpy(self) -> NumpySample:
+    def to_numpy(self) -> NumpySample:
         return convert_netcdf_to_numpy_sample(self._data)
 
+    @override
     def save(self, path: str) -> None:
-
-
-        Args:
-            path: Path to save the netCDF file
-        """
+        # Saves as NetCDF
         self._data.to_netcdf(path, mode="w", engine="h5netcdf")
 
     @classmethod
-
-
-
-        Args:
-            path: Path to load the netCDF file from
-        """
+    @override
+    def load(cls, path: str) -> "SiteSample":
+        # Loads from NetCDF
         return cls(xr.open_dataset(path))
 
-
+    @override
     def plot(self) -> None:
+        # TODO - placeholder for now
         raise NotImplementedError("Plotting not yet implemented for SiteSample")
```
ocf_data_sampler/sample/uk_regional.py
CHANGED
```diff
@@ -1,75 +1,69 @@
-"""PVNet UK Regional sample implementation for dataset handling and visualisation"""
-
-from typing_extensions import override
+"""PVNet UK Regional sample implementation for dataset handling and visualisation."""
 
 import torch
+from typing_extensions import override
 
-from ocf_data_sampler.sample.base import SampleBase, NumpySample
 from ocf_data_sampler.numpy_sample import (
-    NWPSampleKey,
     GSPSampleKey,
-
+    NWPSampleKey,
+    SatelliteSampleKey,
 )
+from ocf_data_sampler.sample.base import NumpySample, SampleBase
 
 
 class UKRegionalSample(SampleBase):
-    """Handles UK Regional PVNet data operations"""
+    """Handles UK Regional PVNet data operations."""
 
-    def __init__(self, data: NumpySample):
+    def __init__(self, data: NumpySample) -> None:
+        """Initialises UK Regional sample with data."""
         self._data = data
 
     @override
     def to_numpy(self) -> NumpySample:
         return self._data
 
+    @override
     def save(self, path: str) -> None:
-
-
-        Args:
-            path: Path to save the sample data to
-        """
+        # Saves to pickle format
         torch.save(self._data, path)
 
     @classmethod
-
-
-
-        Args:
-            path: Path to load the sample data from
-        """
+    @override
+    def load(cls, path: str) -> "UKRegionalSample":
+        # Loads from .pt format
         # TODO: We should move away from using torch.load(..., weights_only=False)
         return cls(torch.load(path, weights_only=False))
 
-
+    @override
     def plot(self) -> None:
-        """Creates visualisations for NWP, GSP, solar position, and satellite data"""
         from matplotlib import pyplot as plt
 
         fig, axes = plt.subplots(2, 2, figsize=(12, 8))
-
+
         if NWPSampleKey.nwp in self._data:
-            first_nwp =
-            if
-                axes[0, 1].imshow(first_nwp[
-                title =
+            first_nwp = next(iter(self._data[NWPSampleKey.nwp].values()))
+            if "nwp" in first_nwp:
+                axes[0, 1].imshow(first_nwp["nwp"][0])
+                title = "NWP (First Channel)"
                 if NWPSampleKey.channel_names in first_nwp:
                     channel_names = first_nwp[NWPSampleKey.channel_names]
                     if channel_names:
-                        title = f
+                        title = f"NWP: {channel_names[0]}"
                 axes[0, 1].set_title(title)
 
         if GSPSampleKey.gsp in self._data:
             axes[0, 0].plot(self._data[GSPSampleKey.gsp])
-            axes[0, 0].set_title(
-
-            if
-
-
-
-
+            axes[0, 0].set_title("GSP Generation")
+
+        if "solar_azimuth" in self._data and "solar_elevation" in self._data:
+            axes[1, 1].plot(self._data["solar_azimuth"], label="Azimuth")
+            axes[1, 1].plot(self._data["solar_elevation"], label="Elevation")
+            axes[1, 1].set_title("Solar Position")
+            axes[1, 1].legend()
 
         if SatelliteSampleKey.satellite_actual in self._data:
             axes[1, 0].imshow(self._data[SatelliteSampleKey.satellite_actual])
-            axes[1, 0].set_title(
-
+            axes[1, 0].set_title("Satellite Data")
+
         plt.tight_layout()
         plt.show()
```
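`UKRegionalSample.plot` now also draws solar azimuth/elevation when present, and `save`/`load` remain thin wrappers around `torch.save`/`torch.load`. A hedged sketch of the `.pt` round trip, using a stand-in payload rather than a full UK Regional sample; the `"gsp"` key is assumed here for illustration:

```python
import numpy as np

from ocf_data_sampler.sample.uk_regional import UKRegionalSample

sample = UKRegionalSample({"gsp": np.random.rand(12)})  # stand-in payload
sample.save("sample.pt")                                # torch.save under the hood
restored = UKRegionalSample.load("sample.pt")           # torch.load(..., weights_only=False)
assert (restored.to_numpy()["gsp"] == sample.to_numpy()["gsp"]).all()
```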
ocf_data_sampler/select/dropout.py
CHANGED
```diff
@@ -1,8 +1,9 @@
-"""Functions for simulating dropout in time series data
+"""Functions for simulating dropout in time series data.
 
 This is used for the following types of data: GSP, Satellite and Site
 This is not used for NWP
 """
+
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -13,22 +14,23 @@ def draw_dropout_time(
     dropout_timedeltas: list[pd.Timedelta],
     dropout_frac: float,
 ) -> pd.Timestamp:
-    """Randomly pick a dropout time from a list of timedeltas
-
+    """Randomly pick a dropout time from a list of timedeltas.
+
     Args:
         t0: The forecast init-time
         dropout_timedeltas: List of timedeltas relative to t0 to pick from
-        dropout_frac: Probability that dropout will be applied.
-            inclusive
+        dropout_frac: Probability that dropout will be applied.
+            This should be between 0 and 1 inclusive
     """
-
-
-    assert len(dropout_timedeltas) > 0, "To apply dropout dropout_timedeltas must be provided"
+    if dropout_frac > 0 and len(dropout_timedeltas) == 0:
+        raise ValueError("To apply dropout, dropout_timedeltas must be provided")
 
     for t in dropout_timedeltas:
-
+        if t > pd.Timedelta("0min"):
+            raise ValueError("Dropout timedeltas must be negative")
 
-
+    if not (0 <= dropout_frac <= 1):
+        raise ValueError("dropout_frac must be between 0 and 1 inclusive")
 
     if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
         dropout_time = t0
@@ -41,11 +43,11 @@ def draw_dropout_time(
 def apply_dropout_time(
     ds: xr.DataArray,
     dropout_time: pd.Timestamp,
-
-    """Apply dropout time to the data
-
+) -> xr.DataArray:
+    """Apply dropout time to the data.
+
     Args:
-        ds: Xarray DataArray with 'time_utc'
+        ds: Xarray DataArray with 'time_utc' coordinate
         dropout_time: Time after which data is set to NaN
     """
     # This replaces the times after the dropout with NaNs
```
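The dropout helpers swap their `assert` checks for explicit `ValueError`s: timedeltas must be negative and `dropout_frac` must lie within [0, 1]. A small sketch of `draw_dropout_time` with made-up times:

```python
import pandas as pd

from ocf_data_sampler.select.dropout import draw_dropout_time

t0 = pd.Timestamp("2024-01-01 12:00")

# Timedeltas must be negative and dropout_frac within [0, 1];
# invalid inputs now raise ValueError rather than tripping an assert
dropout_time = draw_dropout_time(
    t0=t0,
    dropout_timedeltas=[pd.Timedelta("-30min"), pd.Timedelta("-60min")],
    dropout_frac=0.5,
)
# With probability 0.5 this is t0 itself; otherwise t0 minus one of the timedeltas
print(dropout_time)
```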
ocf_data_sampler/select/fill_time_periods.py
CHANGED
```diff
@@ -1,13 +1,23 @@
-"""Fill time periods between start and end dates
+"""Fill time periods between specified start and end dates."""
 
-import pandas as pd
 import numpy as np
+import pandas as pd
 
 
 def fill_time_periods(time_periods: pd.DataFrame, freq: pd.Timedelta) -> pd.DatetimeIndex:
-    """
-
+    """Create range of timestamps between given start and end times.
+
+    Each of the continuous periods (i.e. each row of the input DataFrame) is filled with the
+    specified frequency.
+
+    Args:
+        time_periods: DataFrame with columns 'start_dt' and 'end_dt'
+        freq: Frequency to fill time periods with
+    """
     start_dts = pd.to_datetime(time_periods["start_dt"].values).ceil(freq)
     end_dts = pd.to_datetime(time_periods["end_dt"].values)
-    date_ranges = [
+    date_ranges = [
+        pd.date_range(start_dt, end_dt, freq=freq)
+        for start_dt, end_dt in zip(start_dts, end_dts, strict=False)
+    ]
     return pd.DatetimeIndex(np.concatenate(date_ranges))
```
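`fill_time_periods` gains a proper docstring and a multi-line comprehension, but its behaviour is unchanged: each `start_dt` is ceiled to the frequency and each row is expanded into a date range. A worked sketch with arbitrary dates:

```python
import pandas as pd

from ocf_data_sampler.select.fill_time_periods import fill_time_periods

periods = pd.DataFrame(
    {
        "start_dt": [pd.Timestamp("2024-01-01 00:00"), pd.Timestamp("2024-01-02 06:00")],
        "end_dt": [pd.Timestamp("2024-01-01 02:00"), pd.Timestamp("2024-01-02 07:00")],
    },
)

timestamps = fill_time_periods(periods, freq=pd.Timedelta("30min"))
# -> DatetimeIndex: 00:00, 00:30, ..., 02:00, then 06:00, 06:30, 07:00
```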
ocf_data_sampler/select/find_contiguous_time_periods.py
CHANGED
```diff
@@ -1,9 +1,12 @@
-"""Get contiguous time periods
+"""Get contiguous time periods."""
 
 import numpy as np
 import pandas as pd
+
 from ocf_data_sampler.load.utils import check_time_unique_increasing
 
+ZERO_TDELTA = pd.Timedelta(0)
+
 
 def find_contiguous_time_periods(
     datetimes: pd.DatetimeIndex,
@@ -14,20 +17,20 @@ def find_contiguous_time_periods(
 
     Args:
         datetimes: pd.DatetimeIndex. Must be sorted.
-        min_seq_length: Sequences of min_seq_length or shorter will be discarded.
-            would be set to the `total_seq_length` of each machine learning example.
+        min_seq_length: Sequences of min_seq_length or shorter will be discarded.
         max_gap_duration: If any pair of consecutive `datetimes` is more than `max_gap_duration`
             apart, then this pair of `datetimes` will be considered a "gap" between two contiguous
-            sequences.
-            the timeseries.
+            sequences.
 
     Returns:
-        pd.DataFrame where each row represents a single time period.
-
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
     # Sanity checks.
-
-
+    if len(datetimes) == 0:
+        raise ValueError("No datetimes to use")
+    if min_seq_length <= 1:
+        raise ValueError(f"{min_seq_length=} must be greater than 1")
     check_time_unique_increasing(datetimes)
 
     # Find indices of gaps larger than max_gap:
@@ -43,77 +46,75 @@ def find_contiguous_time_periods(
     # Capture the last segment of dt_index.
     segment_boundaries = np.concatenate((segment_boundaries, [len(datetimes)]))
 
-    periods: list[
+    periods: list[list[pd.Timestamp]] = []
     start_i = 0
     for next_start_i in segment_boundaries:
         n_timesteps = next_start_i - start_i
         if n_timesteps > min_seq_length:
             end_i = next_start_i - 1
-
-            periods.append(period)
+            periods.append([datetimes[start_i], datetimes[end_i]])
         start_i = next_start_i
 
-
-
-
+    if len(periods) == 0:
+        raise ValueError(
+            f"Did not find any periods from {datetimes}. {min_seq_length=} {max_gap_duration=}",
+        )
 
-    return pd.DataFrame(periods)
+    return pd.DataFrame(periods, columns=["start_dt", "end_dt"])
 
 
 def trim_contiguous_time_periods(
-    contiguous_time_periods: pd.DataFrame,
+    contiguous_time_periods: pd.DataFrame,
     interval_start: pd.Timedelta,
     interval_end: pd.Timedelta,
 ) -> pd.DataFrame:
-    """
+    """Trims contiguous time periods to account for history requirements and forecast horizons.
 
     Args:
-        contiguous_time_periods: DataFrame where each row represents a single time period.
-            DataFrame must have `start_dt` and `end_dt` columns.
+        contiguous_time_periods: pd.DataFrame where each row represents a single time period.
+            The pd.DataFrame must have `start_dt` and `end_dt` columns.
         interval_start: The start of the interval with respect to t0
         interval_end: The end of the interval with respect to t0
 
-
     Returns:
-        The contiguous_time_periods DataFrame with the `start_dt` and `end_dt` columns updated.
+        The contiguous_time_periods pd.DataFrame with the `start_dt` and `end_dt` columns updated.
     """
-
-
-
-
+    # Make a copy so the data is not edited in place.
+    trimmed_time_periods = contiguous_time_periods.copy()
+    trimmed_time_periods["start_dt"] -= interval_start
+    trimmed_time_periods["end_dt"] -= interval_end
 
-    valid_mask =
-    contiguous_time_periods = contiguous_time_periods.loc[valid_mask]
-
-    return contiguous_time_periods
+    valid_mask = trimmed_time_periods["start_dt"] <= trimmed_time_periods["end_dt"]
 
+    return trimmed_time_periods.loc[valid_mask]
 
 
 def find_contiguous_t0_periods(
-
-
-
-
-
+    datetimes: pd.DatetimeIndex,
+    interval_start: pd.Timedelta,
+    interval_end: pd.Timedelta,
+    time_resolution: pd.Timedelta,
+) -> pd.DataFrame:
     """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
 
     Args:
-        datetimes: pd.DatetimeIndex
+        datetimes: pd.DatetimeIndex
         interval_start: The start of the interval with respect to t0
         interval_end: The end of the interval with respect to t0
-
-
+        time_resolution: The sample frequency of the timeseries
 
     Returns:
         pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
+    check_time_unique_increasing(datetimes)
+
     total_duration = interval_end - interval_start
-
+
     contiguous_time_periods = find_contiguous_time_periods(
         datetimes=datetimes,
-        min_seq_length=int(total_duration /
-        max_gap_duration=
+        min_seq_length=int(total_duration / time_resolution) + 1,
+        max_gap_duration=time_resolution,
     )
 
     contiguous_t0_periods = trim_contiguous_time_periods(
@@ -122,7 +123,11 @@ def find_contiguous_t0_periods(
         interval_end=interval_end,
     )
 
-
+    if len(contiguous_t0_periods) == 0:
+        raise ValueError(
+            f"No contiguous time periods found for {datetimes}. "
+            f"{interval_start=} {interval_end=} {time_resolution=}",
+        )
 
     return contiguous_t0_periods
 
@@ -131,54 +136,59 @@ def find_contiguous_t0_periods_nwp(
     init_times: pd.DatetimeIndex,
     interval_start: pd.Timedelta,
     max_staleness: pd.Timedelta,
-    max_dropout: pd.Timedelta =
-    first_forecast_step: pd.Timedelta =
-
+    max_dropout: pd.Timedelta = ZERO_TDELTA,
+    first_forecast_step: pd.Timedelta = ZERO_TDELTA,
 ) -> pd.DataFrame:
-    """Get all time periods from the NWP init
+    """Get all time periods from the NWP init-times which are valid as t0 datetimes.
 
     Args:
         init_times: The initialisation times of the available forecasts
-        interval_start: The start of the
-        max_staleness: Up to how long after an init
-            init
-
-
-
-
+        interval_start: The start of the time interval with respect to t0
+        max_staleness: Up to how long after an init-time are we willing to use the forecast.
+            Each init-time will only be used up to this t0 time regardless of the forecast valid
+            time.
+        max_dropout: What is the maximum amount of dropout that will be used.
+            This must be <= max_staleness.
+        first_forecast_step: The timedelta of the first step of the forecast.
+            By default we assume the first valid time of the forecast
+            is the same as its init-time.
 
     Returns:
-        pd.DataFrame where each row represents a single time period.
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
     # Sanity checks.
-
-
-
-
-
+    if len(init_times) == 0:
+        raise ValueError("No init-times to use")
+
+    check_time_unique_increasing(init_times)
+
+    if max_staleness < pd.Timedelta(0):
+        raise ValueError("The max staleness must be positive")
+    if not (pd.Timedelta(0) <= max_dropout <= max_staleness):
+        raise ValueError("The max dropout must be between 0 and the max staleness")
 
-
+    history_drop_buffer = max(first_forecast_step - interval_start, max_dropout)
 
     # Store contiguous periods
-    contiguous_periods = []
+    contiguous_periods: list[list[pd.Timestamp]] = []
 
-    # Begin the first period allowing for the time to the first_forecast_step, the length of the
+    # Begin the first period allowing for the time to the first_forecast_step, the length of the
     # interval sampled from before t0, and the dropout
-    start_this_period = init_times[0] +
+    start_this_period = init_times[0] + history_drop_buffer
 
     # The first forecast is valid up to the max staleness
     end_this_period = init_times[0] + max_staleness
 
     for dt_init in init_times[1:]:
-        # If the previous init
-        # considering dropout) then the contiguous period breaks
-        # Else if the previous init
+        # If the previous init-time becomes stale before the next init-time becomes valid (whilst
+        # also considering dropout) then the contiguous period breaks
+        # Else if the previous init-time becomes stale before the fist step of the next forecast
         # then this also causes a break in the contiguous period
-        if
+        if end_this_period < dt_init + max(max_dropout, first_forecast_step):
             contiguous_periods.append([start_this_period, end_this_period])
             # The new period begins with the same conditions as the first period
-            start_this_period = dt_init +
+            start_this_period = dt_init + history_drop_buffer
         end_this_period = dt_init + max_staleness
 
     contiguous_periods.append([start_this_period, end_this_period])
@@ -189,11 +199,13 @@ def find_contiguous_t0_periods_nwp(
 def intersection_of_multiple_dataframes_of_periods(
     time_periods: list[pd.DataFrame],
 ) -> pd.DataFrame:
-    """Find the intersection of
+    """Find the intersection of list of time periods.
 
-
+    Consecutively updates intersection of time periods.
+    See the docstring of intersection_of_2_dataframes_of_periods() for further details.
     """
-
+    if len(time_periods) == 0:
+        raise ValueError("No time periods to intersect")
     intersection = time_periods[0]
     for time_period in time_periods[1:]:
         intersection = intersection_of_2_dataframes_of_periods(intersection, time_period)
@@ -209,7 +221,8 @@ def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
     A typical use-case is that each pd.DataFrame represents all the time periods where
     a `DataSource` has contiguous, valid data.
 
-
+    Graphical representation of two pd.DataFrames of time periods and their intersection,
+    as follows:
 
     ----------------------> TIME ->---------------------
     a: |-----| |----| |----------| |-----------|
@@ -217,9 +230,9 @@ def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
     intersection: |--| |-| |--| |---|
 
     Args:
-        a: pd.DataFrame where each row represents a time period.
+        a: pd.DataFrame where each row represents a time period. The pd.DataFrame has
             two columns: start_dt and end_dt.
-        b: pd.DataFrame where each row represents a time period.
+        b: pd.DataFrame where each row represents a time period. The pd.DataFrame has
             two columns: start_dt and end_dt.
 
     Returns:
@@ -238,7 +251,7 @@ def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
     # and `a` must always end after `b` starts:
 
     # TODO: <= and >= because we should allow overlap time periods of length 1. e.g.
-    # a: |----| or |---|
+    # a: |----| or |---|
     # b: |--| |---|
     # These aren't allowed if we use < and >.
 
```
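The refactor here replaces asserts with `ValueError`s, names the `ZERO_TDELTA` default, and threads an explicit `time_resolution` argument through `find_contiguous_t0_periods`. A sketch of the t0-period workflow with invented inputs; the gap in the index below deliberately splits the data into two periods:

```python
import pandas as pd

from ocf_data_sampler.select.find_contiguous_time_periods import (
    find_contiguous_t0_periods,
    intersection_of_multiple_dataframes_of_periods,
)

# Two half-hourly runs separated by a 12-hour gap
datetimes = pd.DatetimeIndex(
    list(pd.date_range("2024-01-01 00:00", "2024-01-01 12:00", freq="30min"))
    + list(pd.date_range("2024-01-02 00:00", "2024-01-02 12:00", freq="30min")),
)

periods = find_contiguous_t0_periods(
    datetimes=datetimes,
    interval_start=pd.Timedelta("-1h"),  # one hour of history before t0
    interval_end=pd.Timedelta("2h"),     # two hours of forecast after t0
    time_resolution=pd.Timedelta("30min"),
)
# Two rows: one valid t0 window per contiguous run; raises ValueError if none remain
valid = intersection_of_multiple_dataframes_of_periods([periods, periods])
print(valid)
```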