PyPI - ocf-data-sampler - Versions diffs - 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl - Mend

ocf-data-sampler 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (28)

ocf_data_sampler/config/model.py +84 -87
ocf_data_sampler/load/load_dataset.py +55 -0
ocf_data_sampler/load/nwp/providers/ecmwf.py +5 -2
ocf_data_sampler/load/site.py +30 -0
ocf_data_sampler/numpy_batch/__init__.py +1 -0
ocf_data_sampler/numpy_batch/site.py +29 -0
ocf_data_sampler/select/__init__.py +8 -1
ocf_data_sampler/select/dropout.py +2 -1
ocf_data_sampler/select/geospatial.py +43 -1
ocf_data_sampler/select/select_spatial_slice.py +8 -2
ocf_data_sampler/select/spatial_slice_for_dataset.py +53 -0
ocf_data_sampler/select/time_slice_for_dataset.py +124 -0
ocf_data_sampler/time_functions.py +11 -0
ocf_data_sampler/torch_datasets/process_and_combine.py +153 -0
ocf_data_sampler/torch_datasets/pvnet_uk_regional.py +8 -418
ocf_data_sampler/torch_datasets/site.py +196 -0
ocf_data_sampler/torch_datasets/valid_time_periods.py +108 -0
{ocf_data_sampler-0.0.24.dist-info → ocf_data_sampler-0.0.26.dist-info}/METADATA +1 -1
{ocf_data_sampler-0.0.24.dist-info → ocf_data_sampler-0.0.26.dist-info}/RECORD +28 -16
{ocf_data_sampler-0.0.24.dist-info → ocf_data_sampler-0.0.26.dist-info}/WHEEL +1 -1
{ocf_data_sampler-0.0.24.dist-info → ocf_data_sampler-0.0.26.dist-info}/top_level.txt +1 -0
scripts/refactor_site.py +50 -0
tests/config/test_config.py +9 -6
tests/conftest.py +62 -0
tests/load/test_load_sites.py +14 -0
tests/torch_datasets/test_pvnet_uk_regional.py +4 -4
tests/torch_datasets/test_site.py +85 -0
{ocf_data_sampler-0.0.24.dist-info → ocf_data_sampler-0.0.26.dist-info}/LICENSE +0 -0

ocf_data_sampler/config/model.py CHANGED Viewed

@@ -14,7 +14,7 @@ import logging
 from typing import Dict, List, Optional
 from typing_extensions import Self
-from pydantic import BaseModel, Field, RootModel, field_validator, ValidationInfo, model_validator
+from pydantic import BaseModel, Field, RootModel, field_validator, model_validator
 from ocf_data_sampler.constants import NWP_PROVIDERS
 logger = logging.getLogger(__name__)
@@ -34,27 +34,12 @@ class Base(BaseModel):
 class General(Base):
     """General pydantic model"""
-    name: str = Field("example", description="The name of this configuration file.")
+    name: str = Field("example", description="The name of this configuration file")
     description: str = Field(
         "example configuration", description="Description of this configuration file"
     )
-class DataSourceMixin(Base):
-    """Mixin class, to add forecast and history minutes"""
-    forecast_minutes: int = Field(
-        ...,
-        ge=0,
-        description="how many minutes to forecast in the future. ",
-    )
-    history_minutes: int = Field(
-        ...,
-        ge=0,
-        description="how many historic minutes to use. ",
-    )
 # noinspection PyMethodParameters
 class DropoutMixin(Base):
     """Mixin class, to add dropout minutes"""
@@ -65,7 +50,12 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
-    dropout_fraction: float = Field(0, description="Chance of dropout being applied to each sample")
+    dropout_fraction: float = Field(
+        default=0,
+        description="Chance of dropout being applied to each sample",
+        ge=0,
+        le=1,
+    )
     @field_validator("dropout_timedeltas_minutes")
     def dropout_timedeltas_minutes_negative(cls, v: List[int]) -> List[int]:
@@ -75,12 +65,6 @@ class DropoutMixin(Base):
                 assert m <= 0, "Dropout timedeltas must be negative"
         return v
-    @field_validator("dropout_fraction")
-    def dropout_fraction_valid(cls, v: float) -> float:
-        """Validate 'dropout_fraction'"""
-        assert 0 <= v <= 1, "Dropout fraction must be between 0 and 1"
-        return v
     @model_validator(mode="after")
     def dropout_instructions_consistent(self) -> Self:
         if self.dropout_fraction == 0:
@@ -93,36 +77,67 @@ class DropoutMixin(Base):
 # noinspection PyMethodParameters
-class TimeResolutionMixin(Base):
+class TimeWindowMixin(Base):
     """Time resolution mix in"""
     time_resolution_minutes: int = Field(
         ...,
+        gt=0,
         description="The temporal resolution of the data in minutes",
     )
+    forecast_minutes: int = Field(
+        ...,
+        ge=0,
+        description="how many minutes to forecast in the future",
+    )
+    history_minutes: int = Field(
+        ...,
+        ge=0,
+        description="how many historic minutes to use",
+    )
-class Satellite(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
-    """Satellite configuration model"""
+    @field_validator("forecast_minutes")
+    def forecast_minutes_divide_by_time_resolution(cls, v, values) -> int:
+        if v % values.data["time_resolution_minutes"] != 0:
+            message = "Forecast duration must be divisible by time resolution"
+            logger.error(message)
+            raise Exception(message)
+        return v
-    # Todo: remove 'satellite' from names
-    satellite_zarr_path: str | tuple[str] | list[str] = Field(
+    @field_validator("history_minutes")
+    def history_minutes_divide_by_time_resolution(cls, v, values) -> int:
+        if v % values.data["time_resolution_minutes"] != 0:
+            message = "History duration must be divisible by time resolution"
+            logger.error(message)
+            raise Exception(message)
+        return v
+class SpatialWindowMixin(Base):
+    """Mixin class, to add image size"""
+    image_size_pixels_height: int = Field(
         ...,
-        description="The path or list of paths which hold the satellite zarr",
-    )
-    satellite_channels: list[str] = Field(
-        ..., description="the satellite channels that are used"
+        description="The number of pixels of the height of the region of interest",
     )
-    satellite_image_size_pixels_height: int = Field(
+    image_size_pixels_width: int = Field(
         ...,
-        description="The number of pixels of the height of the region of interest"
-        " for non-HRV satellite channels.",
+        description="The number of pixels of the width of the region of interest",
     )
-    satellite_image_size_pixels_width: int = Field(
+class Satellite(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
+    """Satellite configuration model"""
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The number of pixels of the width of the region "
-        "of interest for non-HRV satellite channels.",
+        description="The path or list of paths which hold the data zarr",
+    )
+    channels: list[str] = Field(
+        ..., description="the satellite channels that are used"
     )
     live_delay_minutes: int = Field(
@@ -131,21 +146,21 @@ class Satellite(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
 # noinspection PyMethodParameters
-class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class NWP(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
     """NWP configuration model"""
-    nwp_zarr_path: str | tuple[str] | list[str] = Field(
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The path which holds the NWP zarr",
+        description="The path or list of paths which hold the data zarr",
     )
-    nwp_channels: list[str] = Field(
+    channels: list[str] = Field(
         ..., description="the channels used in the nwp data"
     )
-    nwp_accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
-    nwp_image_size_pixels_height: int = Field(..., description="The size of NWP spacial crop in pixels")
-    nwp_image_size_pixels_width: int = Field(..., description="The size of NWP spacial crop in pixels")
-    nwp_provider: str = Field(..., description="The provider of the NWP data")
+    provider: str = Field(..., description="The provider of the NWP data")
+    accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
     max_staleness_minutes: Optional[int] = Field(
         None,
@@ -154,33 +169,15 @@ class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
         " the maximum forecast horizon of the NWP and the requested forecast length.",
     )
-    @field_validator("nwp_provider")
-    def validate_nwp_provider(cls, v: str) -> str:
-        """Validate 'nwp_provider'"""
+    @field_validator("provider")
+    def validate_provider(cls, v: str) -> str:
+        """Validate 'provider'"""
         if v.lower() not in NWP_PROVIDERS:
             message = f"NWP provider {v} is not in {NWP_PROVIDERS}"
             logger.warning(message)
             raise Exception(message)
         return v
-    # Todo: put into time mixin when moving intervals there
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
 class MultiNWP(RootModel):
     """Configuration for multiple NWPs"""
@@ -208,27 +205,26 @@ class MultiNWP(RootModel):
         return self.root.items()
-# noinspection PyMethodParameters
-class GSP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class GSP(TimeWindowMixin, DropoutMixin):
     """GSP configuration model"""
-    gsp_zarr_path: str = Field(..., description="The path which holds the GSP zarr")
+    zarr_path: str = Field(..., description="The path which holds the GSP zarr")
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
+class Site(TimeWindowMixin, DropoutMixin):
+    """Site configuration model"""
+    file_path: str = Field(
+        ...,
+        description="The NetCDF files holding the power timeseries.",
+    )
+    metadata_file_path: str = Field(
+        ...,
+        description="The CSV files describing power system",
+    )
+    # TODO validate the netcdf for sites
+    # TODO validate the csv for metadata
 # noinspection PyPep8Naming
@@ -240,10 +236,11 @@ class InputData(Base):
     satellite: Optional[Satellite] = None
     nwp: Optional[MultiNWP] = None
     gsp: Optional[GSP] = None
+    site: Optional[Site] = None
 class Configuration(Base):
     """Configuration model for the dataset"""
     general: General = General()
-    input_data: InputData = InputData()
+    input_data: InputData = InputData()

ocf_data_sampler/load/load_dataset.py ADDED Viewed

@@ -0,0 +1,55 @@
+""" Loads all data sources """
+import xarray as xr
+from ocf_data_sampler.config import Configuration
+from ocf_data_sampler.load.gsp import open_gsp
+from ocf_data_sampler.load.nwp import open_nwp
+from ocf_data_sampler.load.satellite import open_sat_data
+from ocf_data_sampler.load.site import open_site
+def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
+    """Construct dictionary of all of the input data sources
+    Args:
+        config: Configuration file
+    """
+    in_config = config.input_data
+    datasets_dict = {}
+    # Load GSP data unless the path is None
+    if in_config.gsp and in_config.gsp.zarr_path:
+        da_gsp = open_gsp(zarr_path=in_config.gsp.zarr_path).compute()
+        # Remove national GSP
+        datasets_dict["gsp"] = da_gsp.sel(gsp_id=slice(1, None))
+    # Load NWP data if in config
+    if in_config.nwp:
+        datasets_dict["nwp"] = {}
+        for nwp_source, nwp_config in in_config.nwp.items():
+            da_nwp = open_nwp(nwp_config.zarr_path, provider=nwp_config.provider)
+            da_nwp = da_nwp.sel(channel=list(nwp_config.channels))
+            datasets_dict["nwp"][nwp_source] = da_nwp
+    # Load satellite data if in config
+    if in_config.satellite:
+        sat_config = config.input_data.satellite
+        da_sat = open_sat_data(sat_config.zarr_path)
+        da_sat = da_sat.sel(channel=list(sat_config.channels))
+        datasets_dict["sat"] = da_sat
+    if in_config.site:
+        da_sites = open_site(in_config.site)
+        datasets_dict["site"] = da_sites
+    return datasets_dict

ocf_data_sampler/load/nwp/providers/ecmwf.py CHANGED Viewed

@@ -9,7 +9,6 @@ from ocf_data_sampler.load.utils import (
 )
 def open_ifs(zarr_path: Path | str | list[Path] | list[str]) -> xr.DataArray:
     """
     Opens the ECMWF IFS NWP data
@@ -27,10 +26,14 @@ def open_ifs(zarr_path: Path | str | list[Path] | list[str]) -> xr.DataArray:
     ds = ds.rename(
         {
             "init_time": "init_time_utc",
-            "variable": "channel",
         }
     )
+    # LEGACY SUPPORT
+    # rename variable to channel if it exists
+    if "variable" in ds:
+        ds = ds.rename({"variable": "channel"})
     # Check the timestamps are unique and increasing
     check_time_unique_increasing(ds.init_time_utc)

ocf_data_sampler/load/site.py ADDED Viewed

@@ -0,0 +1,30 @@
+import pandas as pd
+import xarray as xr
+import numpy as np
+from ocf_data_sampler.config.model import Site
+def open_site(sites_config: Site) -> xr.DataArray:
+    # Load site generation xr.Dataset
+    data_ds = xr.open_dataset(sites_config.file_path)
+    # Load site metadata
+    metadata_df = pd.read_csv(sites_config.metadata_file_path, index_col="site_id")
+    # Add coordinates
+    ds = data_ds.assign_coords(
+        latitude=(metadata_df.latitude.to_xarray()),
+        longitude=(metadata_df.longitude.to_xarray()),
+        capacity_kwp=data_ds.capacity_kwp,
+    )
+    # Sanity checks
+    assert np.isfinite(data_ds.capacity_kwp.values).all()
+    assert (data_ds.capacity_kwp.values > 0).all()
+    assert metadata_df.index.is_unique
+    return ds.generation_kw

ocf_data_sampler/numpy_batch/__init__.py CHANGED Viewed

@@ -4,4 +4,5 @@ from .gsp import convert_gsp_to_numpy_batch, GSPBatchKey
 from .nwp import convert_nwp_to_numpy_batch, NWPBatchKey
 from .satellite import convert_satellite_to_numpy_batch, SatelliteBatchKey
 from .sun_position import make_sun_position_numpy_batch
+from .site import convert_site_to_numpy_batch

ocf_data_sampler/numpy_batch/site.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""Convert site to Numpy Batch"""
+import xarray as xr
+class SiteBatchKey:
+    generation = "site"
+    site_capacity_kwp = "site_capacity_kwp"
+    site_time_utc = "site_time_utc"
+    site_t0_idx = "site_t0_idx"
+    site_solar_azimuth = "site_solar_azimuth"
+    site_solar_elevation = "site_solar_elevation"
+    site_id = "site_id"
+def convert_site_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> dict:
+    """Convert from Xarray to NumpyBatch"""
+    example = {
+        SiteBatchKey.generation: da.values,
+        SiteBatchKey.site_capacity_kwp: da.isel(time_utc=0)["capacity_kwp"].values,
+        SiteBatchKey.site_time_utc: da["time_utc"].values.astype(float),
+    }
+    if t0_idx is not None:
+        example[SiteBatchKey.site_t0_idx] = t0_idx
+    return example

ocf_data_sampler/select/__init__.py CHANGED Viewed

@@ -1 +1,8 @@
+from .fill_time_periods import fill_time_periods
+from .find_contiguous_time_periods import (
+    find_contiguous_t0_periods,
+    intersection_of_multiple_dataframes_of_periods,
+)
+from .location import Location
+from .spatial_slice_for_dataset import slice_datasets_by_space
+from .time_slice_for_dataset import slice_datasets_by_time

ocf_data_sampler/select/dropout.py CHANGED Viewed

@@ -1,3 +1,4 @@
+""" Functions for simulating dropout in time series data """
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -5,7 +6,7 @@ import xarray as xr
 def draw_dropout_time(
         t0: pd.Timestamp,
-        dropout_timedeltas: list[pd.Timedelta] | None,
+        dropout_timedeltas: list[pd.Timedelta] | pd.Timedelta | None,
         dropout_frac: float = 0,
     ):

ocf_data_sampler/select/geospatial.py CHANGED Viewed

@@ -55,6 +55,23 @@ def lon_lat_to_osgb(
     return _lon_lat_to_osgb(xx=x, yy=y)
+def lon_lat_to_geostationary_area_coords(
+    longitude: Union[Number, np.ndarray],
+    latitude: Union[Number, np.ndarray],
+    xr_data: xr.DataArray,
+) -> tuple[Union[Number, np.ndarray], Union[Number, np.ndarray]]:
+    """Loads geostationary area and transformation from lat-lon to geostationary coords
+    Args:
+        longitude: longitude
+        latitude: latitude
+        xr_data: xarray object with geostationary area
+    Returns:
+        Geostationary coords: x, y
+    """
+    return coordinates_to_geostationary_area_coords(longitude, latitude, xr_data, WGS84)
 def osgb_to_geostationary_area_coords(
     x: Union[Number, np.ndarray],
     y: Union[Number, np.ndarray],
@@ -70,6 +87,31 @@ def osgb_to_geostationary_area_coords(
     Returns:
         Geostationary coords: x, y
     """
+    return coordinates_to_geostationary_area_coords(x, y, xr_data, OSGB36)
+def coordinates_to_geostationary_area_coords(
+    x: Union[Number, np.ndarray],
+    y: Union[Number, np.ndarray],
+    xr_data: xr.DataArray,
+    crs_from: int
+) -> tuple[Union[Number, np.ndarray], Union[Number, np.ndarray]]:
+    """Loads geostationary area and transformation from respective coordinates to geostationary coords
+        Args:
+            x: osgb east-west, or longitude
+            y: osgb north-south, or latitude
+            xr_data: xarray object with geostationary area
+            crs_from: the coordinate system of x, y
+        Returns:
+            Geostationary coords: x, y
+        """
+    assert crs_from in [OSGB36, WGS84], f"Unrecognized coordinate system: {crs_from}"
     # Only load these if using geostationary projection
     import pyresample
@@ -80,7 +122,7 @@ def osgb_to_geostationary_area_coords(
     )
     geostationary_crs = geostationary_area_definition.crs
     osgb_to_geostationary = pyproj.Transformer.from_crs(
-        crs_from=OSGB36, crs_to=geostationary_crs, always_xy=True
+        crs_from=crs_from, crs_to=geostationary_crs, always_xy=True
     ).transform
     return osgb_to_geostationary(xx=x, yy=y)

ocf_data_sampler/select/select_spatial_slice.py CHANGED Viewed

@@ -8,6 +8,7 @@ import xarray as xr
 from ocf_data_sampler.select.location import Location
 from ocf_data_sampler.select.geospatial import (
     lon_lat_to_osgb,
+    lon_lat_to_geostationary_area_coords,
     osgb_to_geostationary_area_coords,
     osgb_to_lon_lat,
     spatial_coord_type,
@@ -101,7 +102,7 @@ def _get_idx_of_pixel_closest_to_poi(
 def _get_idx_of_pixel_closest_to_poi_geostationary(
     da: xr.DataArray,
-    center_osgb: Location,
+    center: Location,
 ) -> Location:
     """
     Return x and y index location of pixel at center of region of interest.
@@ -116,7 +117,12 @@ def _get_idx_of_pixel_closest_to_poi_geostationary(
     _, x_dim, y_dim = spatial_coord_type(da)
-    x, y = osgb_to_geostationary_area_coords(x=center_osgb.x, y=center_osgb.y, xr_data=da)
+    if center.coordinate_system == 'osgb':
+        x, y = osgb_to_geostationary_area_coords(x=center.x, y=center.y, xr_data=da)
+    elif center.coordinate_system == 'lon_lat':
+        x, y = lon_lat_to_geostationary_area_coords(longitude=center.x, latitude=center.y, xr_data=da)
+    else:
+        x,y = center.x, center.y
     center_geostationary = Location(x=x, y=y, coordinate_system="geostationary")
     # Check that the requested point lies within the data

ocf_data_sampler/select/spatial_slice_for_dataset.py ADDED Viewed

@@ -0,0 +1,53 @@
+""" Functions for selecting data around a given location """
+from ocf_data_sampler.config import Configuration
+from ocf_data_sampler.select.location import Location
+from ocf_data_sampler.select.select_spatial_slice import select_spatial_slice_pixels
+def slice_datasets_by_space(
+    datasets_dict: dict,
+    location: Location,
+    config: Configuration,
+) -> dict:
+    """Slice the dictionary of input data sources around a given location
+    Args:
+        datasets_dict: Dictionary of the input data sources
+        location: The location to sample around
+        config: Configuration object.
+    """
+    assert set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp", "site"})
+    sliced_datasets_dict = {}
+    if "nwp" in datasets_dict:
+        sliced_datasets_dict["nwp"] = {}
+        for nwp_key, nwp_config in config.input_data.nwp.items():
+            sliced_datasets_dict["nwp"][nwp_key] = select_spatial_slice_pixels(
+                datasets_dict["nwp"][nwp_key],
+                location,
+                height_pixels=nwp_config.image_size_pixels_height,
+                width_pixels=nwp_config.image_size_pixels_width,
+            )
+    if "sat" in datasets_dict:
+        sat_config = config.input_data.satellite
+        sliced_datasets_dict["sat"] = select_spatial_slice_pixels(
+            datasets_dict["sat"],
+            location,
+            height_pixels=sat_config.image_size_pixels_height,
+            width_pixels=sat_config.image_size_pixels_width,
+        )
+    if "gsp" in datasets_dict:
+        sliced_datasets_dict["gsp"] = datasets_dict["gsp"].sel(gsp_id=location.id)
+    if "site" in datasets_dict:
+        sliced_datasets_dict["site"] = datasets_dict["site"].sel(site_id=location.id)
+    return sliced_datasets_dict

ocf-data-sampler 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

Potentially problematic release.

ocf-data-sampler 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl