ocf-data-sampler 0.0.25__tar.gz → 0.0.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ocf-data-sampler has been flagged as possibly problematic.
- {ocf_data_sampler-0.0.25/ocf_data_sampler.egg-info → ocf_data_sampler-0.0.27}/PKG-INFO +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/model.py +85 -122
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/load_dataset.py +6 -6
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/find_contiguous_time_periods.py +40 -75
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/select_time_slice.py +24 -33
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/spatial_slice_for_dataset.py +4 -4
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/time_slice_for_dataset.py +18 -17
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/process_and_combine.py +13 -14
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/pvnet_uk_regional.py +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/site.py +10 -10
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/valid_time_periods.py +20 -12
- ocf_data_sampler-0.0.25/ocf_data_sampler/time_functions.py → ocf_data_sampler-0.0.27/ocf_data_sampler/utils.py +1 -2
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27/ocf_data_sampler.egg-info}/PKG-INFO +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/SOURCES.txt +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/pyproject.toml +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/config/test_config.py +23 -14
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/conftest.py +7 -5
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_find_contiguous_time_periods.py +8 -8
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_select_time_slice.py +31 -43
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/torch_datasets/test_pvnet_uk_regional.py +4 -4
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/torch_datasets/test_site.py +2 -2
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/LICENSE +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/MANIFEST.in +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/README.md +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/load.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/save.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/constants.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/data/uk_gsp_locations.csv +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/utils.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/utils.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/sun_position.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/dropout.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/fill_time_periods.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/geospatial.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/location.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/requires.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/top_level.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/scripts/refactor_site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/setup.cfg +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_sites.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_sun_position.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_dropout.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_fill_time_periods.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_location.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_select_spatial_slice.py +0 -0
ocf_data_sampler/config/model.py:

```diff
@@ -15,6 +15,7 @@ from typing import Dict, List, Optional
 from typing_extensions import Self
 
 from pydantic import BaseModel, Field, RootModel, field_validator, ValidationInfo, model_validator
+
 from ocf_data_sampler.constants import NWP_PROVIDERS
 
 logger = logging.getLogger(__name__)
```
```diff
@@ -34,26 +35,50 @@ class Base(BaseModel):
 class General(Base):
     """General pydantic model"""
 
-    name: str = Field("example", description="The name of this configuration file.")
+    name: str = Field("example", description="The name of this configuration file")
     description: str = Field(
         "example configuration", description="Description of this configuration file"
     )
 
 
-class DataSourceMixin(Base):
-    """Mixin class, to add forecast and history minutes"""
+class TimeWindowMixin(Base):
+    """Mixin class, to add interval start, end and resolution minutes"""
 
-    forecast_minutes: int = Field(
+    time_resolution_minutes: int = Field(
         ...,
-        ge=0,
-        description="how many minutes to forecast in the future. ",
+        gt=0,
+        description="The temporal resolution of the data in minutes",
     )
-    history_minutes: int = Field(
+
+    interval_start_minutes: int = Field(
         ...,
-        ge=0,
-        description="how many historic minutes to use. ",
+        description="Data interval starts at `t0 + interval_start_minutes`",
     )
 
+    interval_end_minutes: int = Field(
+        ...,
+        description="Data interval ends at `t0 + interval_end_minutes`",
+    )
+
+    @model_validator(mode='after')
+    def check_interval_range(cls, values):
+        if values.interval_start_minutes > values.interval_end_minutes:
+            raise ValueError('interval_start_minutes must be <= interval_end_minutes')
+        return values
+
+    @field_validator("interval_start_minutes")
+    def interval_start_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
+        if v % info.data["time_resolution_minutes"] != 0:
+            raise ValueError("interval_start_minutes must be divisible by time_resolution_minutes")
+        return v
+
+    @field_validator("interval_end_minutes")
+    def interval_end_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
+        if v % info.data["time_resolution_minutes"] != 0:
+            raise ValueError("interval_end_minutes must be divisible by time_resolution_minutes")
+        return v
+
+
 
 # noinspection PyMethodParameters
 class DropoutMixin(Base):
```
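The interval semantics above ("data interval starts at `t0 + interval_start_minutes`") replace the old history/forecast pair, so history is now expressed as a negative start. A minimal sketch of the new validation behaviour, re-declaring the fields and validators from the hunk in a standalone pydantic v2 model (the surrounding harness is assumed, not part of the package):

```python
# Sketch: how the new TimeWindowMixin fields validate (pydantic v2 assumed).
import pydantic
from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator


class TimeWindowMixin(BaseModel):
    time_resolution_minutes: int = Field(..., gt=0)
    interval_start_minutes: int = Field(...)
    interval_end_minutes: int = Field(...)

    @model_validator(mode="after")
    def check_interval_range(self):
        # The interval must be non-empty and ordered
        if self.interval_start_minutes > self.interval_end_minutes:
            raise ValueError("interval_start_minutes must be <= interval_end_minutes")
        return self

    @field_validator("interval_start_minutes", "interval_end_minutes")
    def divisible_by_resolution(cls, v: int, info: ValidationInfo) -> int:
        # Both interval bounds must land on whole timesteps
        if v % info.data["time_resolution_minutes"] != 0:
            raise ValueError("intervals must be divisible by time_resolution_minutes")
        return v


# A 2-hour history window and 8-hour forecast window at 30-minute resolution
ok = TimeWindowMixin(
    time_resolution_minutes=30, interval_start_minutes=-120, interval_end_minutes=480
)

# -45 is not divisible by 30, so this raises a ValidationError
try:
    TimeWindowMixin(
        time_resolution_minutes=30, interval_start_minutes=-45, interval_end_minutes=480
    )
except pydantic.ValidationError as e:
    print(e)
```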
```diff
@@ -65,7 +90,12 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
 
-    dropout_fraction: float = Field(0, description="Chance of dropout being applied to each sample")
+    dropout_fraction: float = Field(
+        default=0,
+        description="Chance of dropout being applied to each sample",
+        ge=0,
+        le=1,
+    )
 
     @field_validator("dropout_timedeltas_minutes")
     def dropout_timedeltas_minutes_negative(cls, v: List[int]) -> List[int]:
```
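Moving the bounds into `Field(ge=0, le=1)` lets pydantic enforce them declaratively, which is why the assert-based `dropout_fraction_valid` validator disappears in the next hunk. A small standalone sketch (only the field declaration comes from the diff):

```python
# Sketch: declarative bounds on dropout_fraction via Field(ge=0, le=1).
import pydantic
from pydantic import BaseModel, Field


class DropoutFraction(BaseModel):
    dropout_fraction: float = Field(default=0, ge=0, le=1)


print(DropoutFraction().dropout_fraction)              # 0 — the default
print(DropoutFraction(dropout_fraction=0.3).dropout_fraction)

try:
    DropoutFraction(dropout_fraction=1.5)              # rejected by le=1
except pydantic.ValidationError as e:
    print(e.errors()[0]["type"])                       # 'less_than_equal'
```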
```diff
@@ -75,12 +105,6 @@ class DropoutMixin(Base):
             assert m <= 0, "Dropout timedeltas must be negative"
         return v
 
-    @field_validator("dropout_fraction")
-    def dropout_fraction_valid(cls, v: float) -> float:
-        """Validate 'dropout_fraction'"""
-        assert 0 <= v <= 1, "Dropout fraction must be between 0 and 1"
-        return v
-
     @model_validator(mode="after")
     def dropout_instructions_consistent(self) -> Self:
         if self.dropout_fraction == 0:
```
```diff
@@ -92,93 +116,51 @@ class DropoutMixin(Base):
         return self
 
 
-
-class TimeResolutionMixin(Base):
-    """Time resolution mix in"""
+class SpatialWindowMixin(Base):
+    """Mixin class, to add path and image size"""
 
-    time_resolution_minutes: int = Field(
+    image_size_pixels_height: int = Field(
         ...,
-        description="The temporal resolution of the data in minutes",
+        ge=0,
+        description="The number of pixels of the height of the region of interest",
     )
 
-
-class Site(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
-    """Site configuration model"""
-
-    file_path: str = Field(
+    image_size_pixels_width: int = Field(
         ...,
-        description="The NetCDF files holding the power timeseries.",
-    )
-    metadata_file_path: str = Field(
-        ...,
-        description="The CSV files describing power system",
+        ge=0,
+        description="The number of pixels of the width of the region of interest",
     )
 
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        """Check forecast length requested will give stable number of timesteps"""
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        """Check history length requested will give stable number of timesteps"""
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    # TODO validate the netcdf for sites
-    # TODO validate the csv for metadata
 
-class Satellite(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class Satellite(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
     """Satellite configuration model"""
-
-
-    satellite_zarr_path: str | tuple[str] | list[str] = Field(
-        ...,
-        description="The path or list of paths which hold the satellite zarr",
-    )
-    satellite_channels: list[str] = Field(
-        ..., description="the satellite channels that are used"
-    )
-    satellite_image_size_pixels_height: int = Field(
+
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The number of pixels of the height of the region of interest"
-        " for non-HRV satellite channels.",
+        description="The path or list of paths which hold the data zarr",
     )
 
-    satellite_image_size_pixels_width: int = Field(
-        ...,
-        description="The number of pixels of the width of the region "
-        "of interest for non-HRV satellite channels.",
-    )
-
-    live_delay_minutes: int = Field(
-        ..., description="The expected delay in minutes of the satellite data"
+    channels: list[str] = Field(
+        ..., description="the satellite channels that are used"
     )
 
 
 # noinspection PyMethodParameters
-class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class NWP(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
     """NWP configuration model"""
-
-    nwp_zarr_path: str | tuple[str] | list[str] = Field(
+
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The path which holds the NWP zarr",
+        description="The path or list of paths which hold the data zarr",
     )
-    nwp_channels: list[str] = Field(
+
+    channels: list[str] = Field(
         ..., description="the channels used in the nwp data"
     )
-    nwp_accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
-    nwp_image_size_pixels_height: int = Field(..., description="The size of NWP spacial crop in pixels")
-    nwp_image_size_pixels_width: int = Field(..., description="The size of NWP spacial crop in pixels")
 
-    nwp_provider: str = Field(..., description="The provider of the NWP data")
+    provider: str = Field(..., description="The provider of the NWP data")
+
+    accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
 
     max_staleness_minutes: Optional[int] = Field(
         None,
```
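The source-specific prefixes (`satellite_*`, `nwp_*`) are gone, so configs written against 0.0.25 need their keys renamed. A hedged sketch of constructing the new `Satellite` model (the import path matches this package's layout; the values and channel names are illustrative only):

```python
# Sketch: the renamed Satellite fields (values illustrative, not validated here).
from ocf_data_sampler.config.model import Satellite

sat = Satellite(
    zarr_path="satellite.zarr",        # was: satellite_zarr_path
    channels=["IR_016", "VIS006"],     # was: satellite_channels
    image_size_pixels_height=24,       # was: satellite_image_size_pixels_height
    image_size_pixels_width=24,        # was: satellite_image_size_pixels_width
    time_resolution_minutes=5,
    interval_start_minutes=-60,        # replaces history_minutes=60
    interval_end_minutes=0,            # note: live_delay_minutes was dropped entirely
)
```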
```diff
@@ -188,32 +170,15 @@ class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
     )
 
 
-    @field_validator("nwp_provider")
-    def validate_provider(cls, v: str) -> str:
-        """Validate 'nwp_provider'"""
+    @field_validator("provider")
+    def validate_provider(cls, v: str) -> str:
+        """Validate 'provider'"""
         if v.lower() not in NWP_PROVIDERS:
             message = f"NWP provider {v} is not in {NWP_PROVIDERS}"
             logger.warning(message)
             raise Exception(message)
         return v
 
-    # Todo: put into time mixin when moving intervals there
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
 
 class MultiNWP(RootModel):
     """Configuration for multiple NWPs"""
```
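A short sketch of the renamed validator in use. `NWP_PROVIDERS` comes from `ocf_data_sampler.constants`; judging by the providers shipped in this release (`ecmwf.py`, `ukv.py`), it contains at least those two names, though that is an assumption:

```python
# Sketch: provider validation against NWP_PROVIDERS (contents assumed; values illustrative).
from ocf_data_sampler.config.model import NWP

nwp = NWP(
    zarr_path="ukv.zarr",
    channels=["t", "dswrf"],
    provider="ukv",                    # checked (lower-cased) against NWP_PROVIDERS
    image_size_pixels_height=24,
    image_size_pixels_width=24,
    time_resolution_minutes=60,
    interval_start_minutes=-120,
    interval_end_minutes=480,
)

# provider="foo" would raise: Exception("NWP provider foo is not in ...")
```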
```diff
@@ -241,34 +206,32 @@ class MultiNWP(RootModel):
         return self.root.items()
 
 
-
-class GSP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class GSP(TimeWindowMixin, DropoutMixin):
     """GSP configuration model"""
 
-    gsp_zarr_path: str = Field(..., description="The path which holds the GSP zarr")
+    zarr_path: str = Field(..., description="The path which holds the GSP zarr")
 
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
 
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
+class Site(TimeWindowMixin, DropoutMixin):
+    """Site configuration model"""
+
+    file_path: str = Field(
+        ...,
+        description="The NetCDF files holding the power timeseries.",
+    )
+    metadata_file_path: str = Field(
+        ...,
+        description="The CSV files describing power system",
+    )
+
+    # TODO validate the netcdf for sites
+    # TODO validate the csv for metadata
+
 
 
 # noinspection PyPep8Naming
 class InputData(Base):
-    """
-    Input data model.
-    """
+    """Input data model"""
 
     satellite: Optional[Satellite] = None
     nwp: Optional[MultiNWP] = None
```
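Putting the reshaped models together, a minimal sketch of a full `Configuration` under the new schema, assuming `InputData` also exposes `gsp` and `site` fields alongside `satellite` and `nwp` (as the loader code further down suggests); paths and values are illustrative:

```python
# Sketch: assembling the new-style Configuration (values illustrative).
from ocf_data_sampler.config.model import GSP, Configuration, InputData, Site

config = Configuration(
    input_data=InputData(
        gsp=GSP(
            zarr_path="gsp.zarr",
            time_resolution_minutes=30,
            interval_start_minutes=-120,
            interval_end_minutes=480,
        ),
        site=Site(
            file_path="sites.nc",
            metadata_file_path="site_metadata.csv",
            time_resolution_minutes=15,
            interval_start_minutes=-60,
            interval_end_minutes=480,
        ),
    )
)
print(config.general.name)  # "example" — General() defaults apply
```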
```diff
@@ -280,4 +243,4 @@ class Configuration(Base):
     """Configuration model for the dataset"""
 
     general: General = General()
-    input_data: InputData = InputData()
\ No newline at end of file
+    input_data: InputData = InputData()
```
ocf_data_sampler/load/load_dataset.py:

```diff
@@ -20,8 +20,8 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
     datasets_dict = {}
 
     # Load GSP data unless the path is None
-    if in_config.gsp and in_config.gsp.gsp_zarr_path:
-        da_gsp = open_gsp(zarr_path=in_config.gsp.gsp_zarr_path).compute()
+    if in_config.gsp and in_config.gsp.zarr_path:
+        da_gsp = open_gsp(zarr_path=in_config.gsp.zarr_path).compute()
 
         # Remove national GSP
         datasets_dict["gsp"] = da_gsp.sel(gsp_id=slice(1, None))
```
```diff
@@ -32,9 +32,9 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
         datasets_dict["nwp"] = {}
         for nwp_source, nwp_config in in_config.nwp.items():
 
-            da_nwp = open_nwp(nwp_config.nwp_zarr_path, provider=nwp_config.nwp_provider)
+            da_nwp = open_nwp(nwp_config.zarr_path, provider=nwp_config.provider)
 
-            da_nwp = da_nwp.sel(channel=list(nwp_config.nwp_channels))
+            da_nwp = da_nwp.sel(channel=list(nwp_config.channels))
 
             datasets_dict["nwp"][nwp_source] = da_nwp
 
```
```diff
@@ -42,9 +42,9 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
     if in_config.satellite:
         sat_config = config.input_data.satellite
 
-        da_sat = open_sat_data(sat_config.satellite_zarr_path)
+        da_sat = open_sat_data(sat_config.zarr_path)
 
-        da_sat = da_sat.sel(channel=list(sat_config.satellite_channels))
+        da_sat = da_sat.sel(channel=list(sat_config.channels))
 
         datasets_dict["sat"] = da_sat
 
```
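The loader simply follows the renamed config attributes, so callers are unchanged. A sketch of how the pieces connect; `load_yaml_configuration` is an assumed helper in `ocf_data_sampler.config.load`, and the path is illustrative:

```python
# Sketch: loading a config and building the dataset dict (helper name assumed).
from ocf_data_sampler.config.load import load_yaml_configuration
from ocf_data_sampler.load.load_dataset import get_dataset_dict

config = load_yaml_configuration("config.yaml")  # path illustrative
datasets_dict = get_dataset_dict(config)
# Keys mirror the configured sources, e.g. "gsp", "nwp" (one entry per provider), "sat"
```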
ocf_data_sampler/select/find_contiguous_time_periods.py:

```diff
@@ -63,16 +63,16 @@ def find_contiguous_time_periods(
 
 def trim_contiguous_time_periods(
     contiguous_time_periods: pd.DataFrame,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
+    interval_start: pd.Timedelta,
+    interval_end: pd.Timedelta,
 ) -> pd.DataFrame:
     """Trim the contiguous time periods to allow for history and forecast durations.
 
     Args:
         contiguous_time_periods: DataFrame where each row represents a single time period. The
             DataFrame must have `start_dt` and `end_dt` columns.
-        history_duration: Length of the historical slice used for a sample
-        forecast_duration: Length of the forecast slice used for a sample
+        interval_start: The start of the interval with respect to t0
+        interval_end: The end of the interval with respect to t0
 
 
     Returns:
```
```diff
@@ -80,8 +80,8 @@ def trim_contiguous_time_periods(
     """
     contiguous_time_periods = contiguous_time_periods.copy()
 
-    contiguous_time_periods["start_dt"] += history_duration
-    contiguous_time_periods["end_dt"] -= forecast_duration
+    contiguous_time_periods["start_dt"] -= interval_start
+    contiguous_time_periods["end_dt"] -= interval_end
 
     valid_mask = contiguous_time_periods["start_dt"] <= contiguous_time_periods["end_dt"]
     contiguous_time_periods = contiguous_time_periods.loc[valid_mask]
```
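A worked sketch of the new trimming arithmetic. Note that `interval_start` is typically negative (history before t0), so subtracting it moves `start_dt` later, while subtracting a positive `interval_end` pulls `end_dt` earlier; values below are illustrative:

```python
# Sketch: trimming a contiguous period so every t0 inside it supports the full interval.
import pandas as pd

periods = pd.DataFrame(
    {
        "start_dt": [pd.Timestamp("2024-01-01 00:00")],
        "end_dt": [pd.Timestamp("2024-01-01 12:00")],
    }
)

interval_start = pd.Timedelta(minutes=-120)  # 2h of history before t0
interval_end = pd.Timedelta(minutes=480)     # 8h of forecast after t0

periods["start_dt"] -= interval_start        # 00:00 -> 02:00
periods["end_dt"] -= interval_end            # 12:00 -> 04:00
print(periods)                               # valid t0 range: 02:00 to 04:00
```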
```diff
@@ -92,16 +92,16 @@ def trim_contiguous_time_periods(
 
 def find_contiguous_t0_periods(
     datetimes: pd.DatetimeIndex,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
+    interval_start: pd.Timedelta,
+    interval_end: pd.Timedelta,
     sample_period_duration: pd.Timedelta,
 ) -> pd.DataFrame:
     """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
 
     Args:
         datetimes: pd.DatetimeIndex. Must be sorted.
-        history_duration: Length of the historical slice used for a sample
-        forecast_duration: Length of the forecast slice used for a sample
+        interval_start: The start of the interval with respect to t0
+        interval_end: The end of the interval with respect to t0
         sample_period_duration: The sample frequency of the timeseries
 
 
```
```diff
@@ -109,7 +109,7 @@ def find_contiguous_t0_periods(
         pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
-    total_duration = history_duration + forecast_duration
+    total_duration = interval_end - interval_start
 
     contiguous_time_periods = find_contiguous_time_periods(
         datetimes=datetimes,
```
```diff
@@ -119,8 +119,8 @@ def find_contiguous_t0_periods(
 
     contiguous_t0_periods = trim_contiguous_time_periods(
         contiguous_time_periods=contiguous_time_periods,
-        history_duration=history_duration,
-        forecast_duration=forecast_duration,
+        interval_start=interval_start,
+        interval_end=interval_end,
     )
 
     assert len(contiguous_t0_periods) > 0
```
```diff
@@ -128,92 +128,57 @@ def find_contiguous_t0_periods(
     return contiguous_t0_periods
 
 
-def _find_contiguous_t0_periods_nwp(
-    ds,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
-    max_staleness: pd.Timedelta | None = None,
-    max_dropout: pd.Timedelta = pd.Timedelta(0),
-    time_dim: str = "init_time_utc",
-    end_buffer: pd.Timedelta = pd.Timedelta(0),
-):
-
-    assert "step" in ds.coords
-    # It is possible to use up to this amount of max staleness for the dataset and slice
-    # required
-    possible_max_staleness = (
-        pd.Timedelta(ds["step"].max().item())
-        - forecast_duration
-        - end_buffer
-    )
-
-    # If max_staleness is set to None we set it based on the max step ahead of the input
-    # forecast data
-    if max_staleness is None:
-        max_staleness = possible_max_staleness
-    else:
-        # Make sure the max acceptable staleness isn't longer than the max possible
-        assert max_staleness <= possible_max_staleness
-        max_staleness = max_staleness
-
-    contiguous_time_periods = find_contiguous_t0_periods_nwp(
-        datetimes=pd.DatetimeIndex(ds[time_dim]),
-        history_duration=history_duration,
-        max_staleness=max_staleness,
-        max_dropout=max_dropout,
-    )
-    return contiguous_time_periods
-
-
-
 def find_contiguous_t0_periods_nwp(
-    datetimes: pd.DatetimeIndex,
-    history_duration: pd.Timedelta,
+    init_times: pd.DatetimeIndex,
+    interval_start: pd.Timedelta,
     max_staleness: pd.Timedelta,
     max_dropout: pd.Timedelta = pd.Timedelta(0),
+    first_forecast_step: pd.Timedelta = pd.Timedelta(0),
+
 ) -> pd.DataFrame:
     """Get all time periods from the NWP init times which are valid as t0 datetimes.
 
     Args:
-        datetimes: The init times of the forecast data
-        history_duration: Length of the historical slice used for a sample
-        max_staleness: Up to how long after an init time are we willing to use the forecast. Each
-            init time will only be used up to this t0 time regardless of the forecast
-            valid time.
+        init_times: The initialisation times of the available forecasts
+        interval_start: The start of the desired data interval with respect to t0
+        max_staleness: Up to how long after an init time are we willing to use the forecast. Each
+            init time will only be used up to this t0 time regardless of the forecast valid time.
         max_dropout: What is the maximum amount of dropout that will be used. This must be <=
             max_staleness.
+        first_forecast_step: The timedelta of the first step of the forecast. By default we assume
+            the first valid time of the forecast is the same as its init time.
 
     Returns:
         pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
     # Sanity checks.
-    assert len(datetimes) > 0
-    assert datetimes.is_monotonic_increasing
-    assert datetimes.is_unique
-    assert history_duration >= pd.Timedelta(0)
+    assert len(init_times) > 0
+    assert init_times.is_monotonic_increasing
+    assert init_times.is_unique
     assert max_staleness >= pd.Timedelta(0)
-    assert max_dropout <= max_staleness
+    assert pd.Timedelta(0) <= max_dropout <= max_staleness
 
-    hist_drop_buffer = max(history_duration, max_dropout)
+    hist_drop_buffer = max(first_forecast_step-interval_start, max_dropout)
 
     # Store contiguous periods
    contiguous_periods = []
 
-    #
-    start_this_period = datetimes[0] + hist_drop_buffer
+    # Begin the first period allowing for the time to the first_forecast_step, the length of the
+    # interval sampled from before t0, and the dropout
+    start_this_period = init_times[0] + hist_drop_buffer
 
     # The first forecast is valid up to the max staleness
-    end_this_period = datetimes[0] + max_staleness
-
-    for dt_init in datetimes[1:]:
-        # If the previous init time becomes stale before the next init becomes valid whilst also
-        # considering dropout
-        #
-        if end_this_period < dt_init + max_dropout:
+    end_this_period = init_times[0] + max_staleness
+
+    for dt_init in init_times[1:]:
+        # If the previous init time becomes stale before the next init becomes valid (whilst also
+        # considering dropout) then the contiguous period breaks
+        # Else if the previous init time becomes stale before the fist step of the next forecast
+        # then this also causes a break in the contiguous period
+        if (end_this_period < dt_init + max(max_dropout, first_forecast_step)):
             contiguous_periods.append([start_this_period, end_this_period])
 
-            # And start a new period
+            # The new period begins with the same conditions as the first period
             start_this_period = dt_init + hist_drop_buffer
         end_this_period = dt_init + max_staleness
 
```
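A worked sketch of the staleness logic above with three-hourly init times. Assumptions: 6h max staleness, 1h of history before t0, no dropout, and the first forecast step at the init time itself; all values are illustrative:

```python
# Sketch: contiguous t0 periods from 3-hourly init times with 6h max staleness.
import pandas as pd

init_times = pd.date_range("2024-01-01 00:00", "2024-01-01 09:00", freq="3h")
interval_start = pd.Timedelta(minutes=-60)       # 1h of history needed before t0
max_staleness = pd.Timedelta(hours=6)
max_dropout = pd.Timedelta(0)
first_forecast_step = pd.Timedelta(0)

# How far after an init time the earliest usable t0 sits
hist_drop_buffer = max(first_forecast_step - interval_start, max_dropout)

start = init_times[0] + hist_drop_buffer         # 01:00 — need 1h of data behind t0
end = init_times[0] + max_staleness              # 06:00 from the first init alone

for dt_init in init_times[1:]:
    if end < dt_init + max(max_dropout, first_forecast_step):
        break                                    # a gap would end the period here
    end = dt_init + max_staleness                # each new init extends the period

print(start, end)                                # 01:00 .. 15:00 — one contiguous period
```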