ocf-data-sampler 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of ocf-data-sampler might be problematic.
Files changed (35)
  1. ocf_data_sampler/config/model.py +34 -0
  2. ocf_data_sampler/load/load_dataset.py +55 -0
  3. ocf_data_sampler/load/nwp/providers/ecmwf.py +5 -2
  4. ocf_data_sampler/load/site.py +30 -0
  5. ocf_data_sampler/numpy_batch/__init__.py +4 -3
  6. ocf_data_sampler/numpy_batch/gsp.py +12 -12
  7. ocf_data_sampler/numpy_batch/nwp.py +14 -14
  8. ocf_data_sampler/numpy_batch/satellite.py +8 -8
  9. ocf_data_sampler/numpy_batch/site.py +29 -0
  10. ocf_data_sampler/select/__init__.py +8 -1
  11. ocf_data_sampler/select/dropout.py +2 -1
  12. ocf_data_sampler/select/geospatial.py +43 -1
  13. ocf_data_sampler/select/select_spatial_slice.py +8 -2
  14. ocf_data_sampler/select/spatial_slice_for_dataset.py +53 -0
  15. ocf_data_sampler/select/time_slice_for_dataset.py +124 -0
  16. ocf_data_sampler/time_functions.py +11 -0
  17. ocf_data_sampler/torch_datasets/process_and_combine.py +153 -0
  18. ocf_data_sampler/torch_datasets/pvnet_uk_regional.py +8 -418
  19. ocf_data_sampler/torch_datasets/site.py +196 -0
  20. ocf_data_sampler/torch_datasets/valid_time_periods.py +108 -0
  21. {ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/METADATA +1 -1
  22. ocf_data_sampler-0.0.25.dist-info/RECORD +66 -0
  23. {ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/WHEEL +1 -1
  24. {ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/top_level.txt +1 -0
  25. scripts/refactor_site.py +50 -0
  26. tests/conftest.py +62 -0
  27. tests/load/test_load_sites.py +14 -0
  28. tests/numpy_batch/test_gsp.py +1 -2
  29. tests/numpy_batch/test_nwp.py +1 -3
  30. tests/numpy_batch/test_satellite.py +1 -3
  31. tests/numpy_batch/test_sun_position.py +7 -7
  32. tests/torch_datasets/test_pvnet_uk_regional.py +4 -6
  33. tests/torch_datasets/test_site.py +85 -0
  34. ocf_data_sampler-0.0.23.dist-info/RECORD +0 -54
  35. {ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/LICENSE +0 -0
ocf_data_sampler/torch_datasets/site.py ADDED
@@ -0,0 +1,196 @@
+ """Torch dataset for sites"""
+ import logging
+
+ import pandas as pd
+ import xarray as xr
+ from torch.utils.data import Dataset
+
+ from ocf_data_sampler.config import Configuration, load_yaml_configuration
+ from ocf_data_sampler.load.load_dataset import get_dataset_dict
+ from ocf_data_sampler.select import (
+     Location,
+     fill_time_periods,
+     find_contiguous_t0_periods,
+     intersection_of_multiple_dataframes_of_periods,
+     slice_datasets_by_time, slice_datasets_by_space
+ )
+ from ocf_data_sampler.time_functions import minutes
+ from ocf_data_sampler.torch_datasets.process_and_combine import process_and_combine_datasets, compute
+ from ocf_data_sampler.torch_datasets.valid_time_periods import find_valid_time_periods
+
+ xr.set_options(keep_attrs=True)
+
+
+ def find_valid_t0_and_site_ids(
+     datasets_dict: dict,
+     config: Configuration,
+ ) -> pd.DataFrame:
+     """Find the t0 times where all of the requested input data is available
+
+     The idea is to
+     1. Get valid time period for nwp and satellite
+     2. For each site location, find valid periods for that location
+
+     Args:
+         datasets_dict: A dictionary of input datasets
+         config: Configuration file
+     """
+
+     # 1. Get valid time period for nwp and satellite
+     datasets_nwp_and_sat_dict = {"nwp": datasets_dict["nwp"], "sat": datasets_dict["sat"]}
+     valid_time_periods = find_valid_time_periods(datasets_nwp_and_sat_dict, config)
+
+     # 2. Now lets loop over each location in system id and find the valid periods
+     # Should we have a different option if there are not nans
+     sites = datasets_dict["site"]
+     site_ids = sites.site_id.values
+     site_config = config.input_data.site
+     valid_t0_and_site_ids = []
+     for site_id in site_ids:
+         site = sites.sel(site_id=site_id)
+
+         # drop any nan values
+         # not sure this is right?
+         site = site.dropna(dim='time_utc')
+
+         # Get the valid time periods for this location
+         time_periods = find_contiguous_t0_periods(
+             pd.DatetimeIndex(site["time_utc"]),
+             sample_period_duration=minutes(site_config.time_resolution_minutes),
+             history_duration=minutes(site_config.history_minutes),
+             forecast_duration=minutes(site_config.forecast_minutes),
+         )
+         valid_time_periods_per_site = intersection_of_multiple_dataframes_of_periods(
+             [valid_time_periods, time_periods]
+         )
+
+         # Fill out the contiguous time periods to get the t0 times
+         valid_t0_times_per_site = fill_time_periods(
+             valid_time_periods_per_site,
+             freq=minutes(site_config.time_resolution_minutes)
+         )
+
+         valid_t0_per_site = pd.DataFrame(index=valid_t0_times_per_site)
+         valid_t0_per_site['site_id'] = site_id
+         valid_t0_and_site_ids.append(valid_t0_per_site)
+
+     valid_t0_and_site_ids = pd.concat(valid_t0_and_site_ids)
+     valid_t0_and_site_ids.index.name = 't0'
+     valid_t0_and_site_ids.reset_index(inplace=True)
+
+     return valid_t0_and_site_ids
+
+
+ def get_locations(site_xr: xr.Dataset):
+     """Get list of locations of all sites"""
+
+     locations = []
+     for site_id in site_xr.site_id.values:
+         site = site_xr.sel(site_id=site_id)
+         location = Location(
+             id=site_id,
+             x=site.longitude.values,
+             y=site.latitude.values,
+             coordinate_system="lon_lat"
+         )
+         locations.append(location)
+
+     return locations
+
+
+ class SitesDataset(Dataset):
+     def __init__(
+         self,
+         config_filename: str,
+         start_time: str | None = None,
+         end_time: str | None = None,
+     ):
+         """A torch Dataset for creating PVNet Site samples
+
+         Args:
+             config_filename: Path to the configuration file
+             start_time: Limit the init-times to be after this
+             end_time: Limit the init-times to be before this
+         """
+
+         config = load_yaml_configuration(config_filename)
+
+         datasets_dict = get_dataset_dict(config)
+
+         # get all locations
+         self.locations = get_locations(datasets_dict['site'])
+
+         # Get t0 times where all input data is available
+         valid_t0_and_site_ids = find_valid_t0_and_site_ids(datasets_dict, config)
+
+         # Filter t0 times to given range
+         if start_time is not None:
+             valid_t0_and_site_ids \
+                 = valid_t0_and_site_ids[valid_t0_and_site_ids['t0'] >= pd.Timestamp(start_time)]
+
+         if end_time is not None:
+             valid_t0_and_site_ids \
+                 = valid_t0_and_site_ids[valid_t0_and_site_ids['t0'] <= pd.Timestamp(end_time)]
+
+
+         # Assign coords and indices to self
+         self.valid_t0_and_site_ids = valid_t0_and_site_ids
+
+         # Assign config and input data to self
+         self.datasets_dict = datasets_dict
+         self.config = config
+
+     def __len__(self):
+         return len(self.valid_t0_and_site_ids)
+
+     def _get_sample(self, t0: pd.Timestamp, location: Location) -> dict:
+         """Generate the PVNet sample for given coordinates
+
+         Args:
+             t0: init-time for sample
+             location: location for sample
+         """
+         sample_dict = slice_datasets_by_space(self.datasets_dict, location, self.config)
+         sample_dict = slice_datasets_by_time(sample_dict, t0, self.config)
+         sample_dict = compute(sample_dict)
+
+         sample = process_and_combine_datasets(sample_dict, self.config, t0, location, sun_position_key='site')
+
+         return sample
+
+     def get_location_from_site_id(self, site_id):
+         """Get location from system id"""
+
+         locations = [loc for loc in self.locations if loc.id == site_id]
+         if len(locations) == 0:
+             raise ValueError(f"Location not found for site_id {site_id}")
+
+         if len(locations) > 1:
+             logging.warning(f"Multiple locations found for site_id {site_id}, but will take the first")
+
+         return locations[0]
+
+     def __getitem__(self, idx):
+
+         # Get the coordinates of the sample
+         t0, site_id = self.valid_t0_and_site_ids.iloc[idx]
+
+         # get location from site id
+         location = self.get_location_from_site_id(site_id)
+
+         # Generate the sample
+         return self._get_sample(t0, location)
+
+     def get_sample(self, t0: pd.Timestamp, site_id: int) -> dict:
+         """Generate a sample for a given site id and t0.
+
+         Useful for users to generate samples by t0 and site id
+
+         Args:
+             t0: init-time for sample
+             site_id: site id as int
+         """
+
+         location = self.get_location_from_site_id(site_id)
+
+         return self._get_sample(t0, location)
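
For orientation, a minimal usage sketch of the new SitesDataset; the YAML path and timestamps below are hypothetical placeholders, not part of the package:

    import pandas as pd
    from ocf_data_sampler.torch_datasets.site import SitesDataset

    # "site_config.yaml" is a hypothetical configuration file path
    dataset = SitesDataset("site_config.yaml", start_time="2023-01-01", end_time="2023-01-02")

    print(len(dataset))  # number of valid (t0, site_id) pairs
    sample = dataset[0]  # indexed access via __getitem__

    # or build a sample for a chosen site and init-time directly
    sample = dataset.get_sample(t0=pd.Timestamp("2023-01-01 12:00"), site_id=1)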
ocf_data_sampler/torch_datasets/valid_time_periods.py ADDED
@@ -0,0 +1,108 @@
+ import numpy as np
+ import pandas as pd
+
+ from ocf_data_sampler.config import Configuration
+ from ocf_data_sampler.select.find_contiguous_time_periods import find_contiguous_t0_periods_nwp, \
+     find_contiguous_t0_periods, intersection_of_multiple_dataframes_of_periods
+ from ocf_data_sampler.time_functions import minutes
+
+
+ def find_valid_time_periods(
+     datasets_dict: dict,
+     config: Configuration,
+ ):
+     """Find the t0 times where all of the requested input data is available
+
+     Args:
+         datasets_dict: A dictionary of input datasets
+         config: Configuration file
+     """
+
+     assert set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"})
+
+     contiguous_time_periods: dict[str: pd.DataFrame] = {}  # Used to store contiguous time periods from each data source
+
+     if "nwp" in datasets_dict:
+         for nwp_key, nwp_config in config.input_data.nwp.items():
+
+             da = datasets_dict["nwp"][nwp_key]
+
+             if nwp_config.dropout_timedeltas_minutes is None:
+                 max_dropout = minutes(0)
+             else:
+                 max_dropout = minutes(np.max(np.abs(nwp_config.dropout_timedeltas_minutes)))
+
+             if nwp_config.max_staleness_minutes is None:
+                 max_staleness = None
+             else:
+                 max_staleness = minutes(nwp_config.max_staleness_minutes)
+
+             # The last step of the forecast is lost if we have to diff channels
+             if len(nwp_config.nwp_accum_channels) > 0:
+                 end_buffer = minutes(nwp_config.time_resolution_minutes)
+             else:
+                 end_buffer = minutes(0)
+
+             # This is the max staleness we can use considering the max step of the input data
+             max_possible_staleness = (
+                 pd.Timedelta(da["step"].max().item())
+                 - minutes(nwp_config.forecast_minutes)
+                 - end_buffer
+             )
+
+             # Default to use max possible staleness unless specified in config
+             if max_staleness is None:
+                 max_staleness = max_possible_staleness
+             else:
+                 # Make sure the max acceptable staleness isn't longer than the max possible
+                 assert max_staleness <= max_possible_staleness
+
+             time_periods = find_contiguous_t0_periods_nwp(
+                 datetimes=pd.DatetimeIndex(da["init_time_utc"]),
+                 history_duration=minutes(nwp_config.history_minutes),
+                 max_staleness=max_staleness,
+                 max_dropout=max_dropout,
+             )
+
+             contiguous_time_periods[f'nwp_{nwp_key}'] = time_periods
+
+     if "sat" in datasets_dict:
+         sat_config = config.input_data.satellite
+
+         time_periods = find_contiguous_t0_periods(
+             pd.DatetimeIndex(datasets_dict["sat"]["time_utc"]),
+             sample_period_duration=minutes(sat_config.time_resolution_minutes),
+             history_duration=minutes(sat_config.history_minutes),
+             forecast_duration=minutes(sat_config.forecast_minutes),
+         )
+
+         contiguous_time_periods['sat'] = time_periods
+
+     if "gsp" in datasets_dict:
+         gsp_config = config.input_data.gsp
+
+         time_periods = find_contiguous_t0_periods(
+             pd.DatetimeIndex(datasets_dict["gsp"]["time_utc"]),
+             sample_period_duration=minutes(gsp_config.time_resolution_minutes),
+             history_duration=minutes(gsp_config.history_minutes),
+             forecast_duration=minutes(gsp_config.forecast_minutes),
+         )
+
+         contiguous_time_periods['gsp'] = time_periods
+
+     # just get the values (not the keys)
+     contiguous_time_periods_values = list(contiguous_time_periods.values())
+
+     # Find joint overlapping contiguous time periods
+     if len(contiguous_time_periods_values) > 1:
+         valid_time_periods = intersection_of_multiple_dataframes_of_periods(
+             contiguous_time_periods_values
+         )
+     else:
+         valid_time_periods = contiguous_time_periods_values[0]
+
+     # check there are some valid time periods
+     if len(valid_time_periods) == 0:
+         raise ValueError(f"No valid time periods found, {contiguous_time_periods=}")
+
+     return valid_time_periods
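
The staleness bookkeeping above is easiest to follow with concrete numbers; a small worked sketch, with invented values, where minutes mirrors ocf_data_sampler.time_functions.minutes:

    import pandas as pd

    def minutes(m):  # mirrors ocf_data_sampler.time_functions.minutes
        return pd.Timedelta(minutes=m)

    max_step = pd.Timedelta("48h")   # longest step present in the NWP data
    forecast = minutes(12 * 60)      # forecast_minutes = 720
    end_buffer = minutes(60)         # one step lost to diffing accumulated channels

    # An init-time may be this stale and still cover the full forecast horizon
    max_possible_staleness = max_step - forecast - end_buffer
    print(max_possible_staleness)    # 1 days 11:00:00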
{ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ocf_data_sampler
- Version: 0.0.23
+ Version: 0.0.25
  Summary: Sample from weather data for renewable energy prediction
  Author: James Fulton, Peter Dudfield, and the Open Climate Fix team
  Author-email: info@openclimatefix.org
ocf_data_sampler-0.0.25.dist-info/RECORD ADDED
@@ -0,0 +1,66 @@
+ ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ ocf_data_sampler/constants.py,sha256=tUwHrsGShqIn5Izze4i32_xB6X0v67rvQwIYB-P5PJQ,3355
+ ocf_data_sampler/time_functions.py,sha256=R6ZlVEe6h4UlJeUW7paZYAMWveOv9MTjMsoISCwnsiE,284
+ ocf_data_sampler/config/__init__.py,sha256=YXnAkgHViHB26hSsjiv32b6EbpG-A1kKTkARJf0_RkY,212
+ ocf_data_sampler/config/load.py,sha256=4f7vPHAIAmd-55tPxoIzn7F_TI_ue4NxkDcLPoVWl0g,943
+ ocf_data_sampler/config/model.py,sha256=5GO8SF_4iOZhCAyIJyENSl0dnDRIWrURgqwslrVWke8,9462
+ ocf_data_sampler/config/save.py,sha256=wKdctbv0dxIIiQtcRHLRxpWQVhEFQ_FCWg-oNaRLIps,1093
+ ocf_data_sampler/data/uk_gsp_locations.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
+ ocf_data_sampler/load/__init__.py,sha256=MjgfxilTzyz1RYFoBEeAXmE9hyjknLvdmlHPmlAoiQY,44
+ ocf_data_sampler/load/gsp.py,sha256=Gcr1JVUOPKhFRDCSHtfPDjxx0BtyyEhXrZvGEKLPJ5I,759
+ ocf_data_sampler/load/load_dataset.py,sha256=R4RAIVLVx6CHA6Qs61kD9sx834I_GMGAn6G7ZgwFMUA,1627
+ ocf_data_sampler/load/satellite.py,sha256=3KlA1fx4SwxdzM-jC1WRaONXO0D6m0WxORnEnwUnZrA,2967
+ ocf_data_sampler/load/site.py,sha256=ROif2XXIIgBz-JOOiHymTq1CMXswJ3AzENU9DJmYpcU,782
+ ocf_data_sampler/load/utils.py,sha256=EQGvVWlGMoSOdbDYuMfVAa0v6wmAOPmHIAemdrTB5v4,1406
+ ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
+ ocf_data_sampler/load/nwp/nwp.py,sha256=O4QnajEZem8BvBgTcYYDBhRhgqPYuJkolHmpMRmrXEA,610
+ ocf_data_sampler/load/nwp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=2iR1Iy542lo51rC6XFLV-3pbUE68dWjlHa6TVJzx3ac,1280
+ ocf_data_sampler/load/nwp/providers/ukv.py,sha256=79Bm7q-K_GJPYMy62SUIZbRWRF4-tIaB1dYPEgLD9vo,1207
+ ocf_data_sampler/load/nwp/providers/utils.py,sha256=Sy2exG1wpXLLhMXYdsfR-DZMR3txG1_bBmBdchlc-yA,848
+ ocf_data_sampler/numpy_batch/__init__.py,sha256=8MgRF29rK9bKP4b4iHakaoGwBKUcjWZ-VFKjCcq53QA,336
+ ocf_data_sampler/numpy_batch/gsp.py,sha256=QjQ25JmtufvdiSsxUkBTPhxouYGWPnnWze8pXr_aBno,960
+ ocf_data_sampler/numpy_batch/nwp.py,sha256=dAehfRo5DL2Yb20ifHHl5cU1QOrm3ZOpQmN39fSUOw8,1255
+ ocf_data_sampler/numpy_batch/satellite.py,sha256=3NoE_ElzMHwO60apqJeFAwI6J7eIxD0OWTyAVl-uJi8,903
+ ocf_data_sampler/numpy_batch/site.py,sha256=lJYMEot50UgSBnSOgADQMjUhky1YyWKYqwNsisyYV6w,789
+ ocf_data_sampler/numpy_batch/sun_position.py,sha256=zw2bjtcjsm_tvKk0r_MZmgfYUJLHuLjLly2sMjwP3XI,1606
+ ocf_data_sampler/select/__init__.py,sha256=E4AJulEbO2K-o0UlG1fgaEteuf_1ZFjHTvrotXSb4YU,332
+ ocf_data_sampler/select/dropout.py,sha256=HCx5Wzk8Oh2Z9vV94Jy-ALJsHtGduwvMaQOleQXp5z0,1142
+ ocf_data_sampler/select/fill_time_periods.py,sha256=iTtMjIPFYG5xtUYYedAFBLjTWWUa7t7WQ0-yksWf0-E,440
+ ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=6ioB8LeFpFNBMgKDxrgG3zqzNjkBF_jlV9yye2ZYT2E,11925
+ ocf_data_sampler/select/geospatial.py,sha256=4xL-9y674jjoaXeqE52NHCHVfknciE4OEGsZtn9DvP4,4911
+ ocf_data_sampler/select/location.py,sha256=26Y5ZjfFngShBwXieuWSoOA-RLaRzci4TTmcDk3Wg7U,2015
+ ocf_data_sampler/select/select_spatial_slice.py,sha256=WNxwur9Q5oetvogATw8-hNejDuEwrXHzuZIovFDjNJA,11488
+ ocf_data_sampler/select/select_time_slice.py,sha256=41cch1fQr59fZgv7UHsNGc3OvoynrixT3bmr3_1d7cU,6628
+ ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=Nrc3j8DR5MM4BPPp9IQwaIMpoyOkc6AADMnfOjg-170,1791
+ ocf_data_sampler/select/time_slice_for_dataset.py,sha256=A9fxvurbM0JSRkrjyg5Lr70_Mj6t5OO7HFqHUZel9q4,4220
+ ocf_data_sampler/torch_datasets/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ ocf_data_sampler/torch_datasets/process_and_combine.py,sha256=Lovc2UM3-HgUy2BoQEIr0gQTz3USW6ACRWo-iTgxjHs,4993
+ ocf_data_sampler/torch_datasets/pvnet_uk_regional.py,sha256=TpHALGU7hpo3iLbvD0nkoY6zu94Vq99W1V1qSGEcIW8,5552
+ ocf_data_sampler/torch_datasets/site.py,sha256=1k0fWXYwAAIWG5DX_j3tgNfY8gglfPGLNzNlZd8EnJs,6631
+ ocf_data_sampler/torch_datasets/valid_time_periods.py,sha256=dNJkBH5wdsFUjoFSmthU3yTqar6OPE77WsRQUebm-PY,4163
+ scripts/refactor_site.py,sha256=asZ27hQ4IyXgCCUaFJqcz1ObBNcV2W3ywqHBpSXA_fc,1728
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/conftest.py,sha256=ZRktySCynj3NBbFRR4EFNLRLFMErkQsC-qQlmQzhbRg,7360
+ tests/config/test_config.py,sha256=G_PD_pXib0zdRBPUIn0jjwJ9VyoKaO_TanLN1Mh5Ca4,5055
+ tests/load/test_load_gsp.py,sha256=aT_nqaSXmUTcdHzuTT7AmXJr3R31k4OEN-Fv3eLxlQE,424
+ tests/load/test_load_nwp.py,sha256=3qyyDkB1q9t3tyAwogfotNrxqUOpXXimco1CImoEWGg,753
+ tests/load/test_load_satellite.py,sha256=STX5AqqmOAgUgE9R1xyq_sM3P1b8NKdGjO-hDhayfxM,524
+ tests/load/test_load_sites.py,sha256=T9lSEnGPI8FQISudVYHHNTHeplNS62Vrx48jaZ6J_Jo,364
+ tests/numpy_batch/test_gsp.py,sha256=VANXV32K8aLX4dCdhCUnDorJmyNN-Bjc7Wc1N-RzWEk,548
+ tests/numpy_batch/test_nwp.py,sha256=Fnj7cR-VR2Z0kMu8SrgnIayjxWnPWrYFjWSjMmnrh4Y,1445
+ tests/numpy_batch/test_satellite.py,sha256=8a4ZwMLpsOmYKmwI1oW_su_hwkCNYMEJAEfa0dbsx1k,1179
+ tests/numpy_batch/test_sun_position.py,sha256=FYQ7KtlN0V5LlEjgI-cKjTMtGHUCxiMvxkRYTdMAgEE,2485
+ tests/select/test_dropout.py,sha256=kiycl7RxAQYMCZJlokmx6Da5h_oBpSs8Is8pmSW4gOU,2413
+ tests/select/test_fill_time_periods.py,sha256=o59f2YRe5b0vJrG3B0aYZkYeHnpNk4s6EJxdXZluNQg,907
+ tests/select/test_find_contiguous_time_periods.py,sha256=G6tJRJd0DMfH9EdfzlKWsmfTbtMwOf3w-2filjJzuIQ,5998
+ tests/select/test_location.py,sha256=_WZk2FPYeJ-nIfCJS6Sp_yaVEEo7m31DmMFoZzgyCts,2712
+ tests/select/test_select_spatial_slice.py,sha256=7EX9b6g-pMdACQx3yefjs5do2s-Rho2UmKevV4oglsU,5147
+ tests/select/test_select_time_slice.py,sha256=XC1J3DBBDnt81jcba5u-Hnd0yKv8GIQErLm-OECV6rs,10147
+ tests/torch_datasets/test_pvnet_uk_regional.py,sha256=u3taw6p3oozM0_7cEEhCYbImAQPRldRhpruqSyV08Vg,2675
+ tests/torch_datasets/test_site.py,sha256=5hdUP64neCDWEo2NMSd-MhbpuQjQvD6NOvhZ1DlMmo8,2733
+ ocf_data_sampler-0.0.25.dist-info/LICENSE,sha256=F-Q3UFCR-BECSocV55BFDpn4YKxve9PKrm-lTt6o_Tg,1073
+ ocf_data_sampler-0.0.25.dist-info/METADATA,sha256=p3SKEM4gRy0Z4LTcRWlgTrpjQ-QV89ar69tM9EwhudU,5269
+ ocf_data_sampler-0.0.25.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ ocf_data_sampler-0.0.25.dist-info/top_level.txt,sha256=Faob6N6cFdPc5eUpCTYcXgCaNhi4XLLteUL5W5ayYmg,31
+ ocf_data_sampler-0.0.25.dist-info/RECORD,,
{ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.2.0)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

{ocf_data_sampler-0.0.23.dist-info → ocf_data_sampler-0.0.25.dist-info}/top_level.txt CHANGED
@@ -1,2 +1,3 @@
  ocf_data_sampler
+ scripts
  tests
1
+ """ Helper functions for refactoring legacy site data """
2
+
3
+
4
+ def legacy_format(data_ds, metadata_df):
5
+ """This formats old legacy data to the new format.
6
+
7
+ 1. This renames the columns in the metadata
8
+ 2. Re-formats the site data from data variables named by the site_id to
9
+ a data array with a site_id dimension. Also adds capacity_kwp to the dataset as a time series for each site_id
10
+ """
11
+
12
+ if "system_id" in metadata_df.columns:
13
+ metadata_df["site_id"] = metadata_df["system_id"]
14
+
15
+ if "capacity_megawatts" in metadata_df.columns:
16
+ metadata_df["capacity_kwp"] = metadata_df["capacity_megawatts"] * 1000
17
+
18
+ # only site data has the site_id as data variables.
19
+ # We want to join them all together and create another coordinate called site_id
20
+ if "0" in data_ds:
21
+ gen_df = data_ds.to_dataframe()
22
+ gen_da = xr.DataArray(
23
+ data=gen_df.values,
24
+ coords=(
25
+ ("time_utc", gen_df.index.values),
26
+ ("site_id", metadata_df["site_id"]),
27
+ ),
28
+ name="generation_kw",
29
+ )
30
+
31
+ capacity_df = gen_df
32
+ for col in capacity_df.columns:
33
+ capacity_df[col] = metadata_df[metadata_df["site_id"].astype(str) == col][
34
+ "capacity_kwp"
35
+ ].iloc[0]
36
+ capacity_da = xr.DataArray(
37
+ data=capacity_df.values,
38
+ coords=(
39
+ ("time_utc", gen_df.index.values),
40
+ ("site_id", metadata_df["site_id"]),
41
+ ),
42
+ name="capacity_kwp",
43
+ )
44
+ data_ds = xr.Dataset(
45
+ {
46
+ "generation_kw": gen_da,
47
+ "capacity_kwp": capacity_da,
48
+ }
49
+ )
50
+ return data_ds
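
Note that the script as released references xr without importing it, so legacy_format only runs when pasted into a module that imports xarray. A self-contained sketch of calling it under that assumption; the legacy inputs are invented to match the column names the function expects:

    import pandas as pd
    import xarray as xr  # missing from the released script, but required by legacy_format

    # Legacy layout: one data variable per site, named by the stringified site id
    times = pd.date_range("2023-01-01", periods=4, freq="30min")
    data_ds = xr.Dataset({"0": ("time_utc", [1.0, 2.0, 3.0, 4.0])}, coords={"time_utc": times})
    metadata_df = pd.DataFrame({"system_id": [0], "capacity_megawatts": [0.5]})

    new_ds = legacy_format(data_ds, metadata_df)  # assumes the function is defined alongside
    print(new_ds["capacity_kwp"].values)  # 0.5 MW -> 500.0 kW at every timestamp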
tests/conftest.py CHANGED
@@ -6,6 +6,8 @@ import pytest
  import xarray as xr
  import tempfile

+ from ocf_data_sampler.config.model import Site
+
  _top_test_directory = os.path.dirname(os.path.realpath(__file__))

  @pytest.fixture()
@@ -197,6 +199,66 @@ def ds_uk_gsp():
      })


+ @pytest.fixture(scope="session")
+ def data_sites() -> Site:
+     """
+     Make fake data for sites
+     Returns: filename for netcdf file, and csv metadata
+     """
+     times = pd.date_range("2023-01-01 00:00", "2023-01-02 00:00", freq="30min")
+     site_ids = list(range(0,10))
+     capacity_kwp_1d = np.array([0.1,1.1,4,6,8,9,15,2,3,4])
+     # these are quite specific for the fake satellite data
+     longitude = np.arange(-4, -3, 0.1)
+     latitude = np.arange(51, 52, 0.1)
+
+     generation = np.random.uniform(0, 200, size=(len(times), len(site_ids))).astype(np.float32)
+
+     # repeat capacity in new dims len(times) times
+     capacity_kwp = (np.tile(capacity_kwp_1d, len(times))).reshape(len(times),10)
+
+     coords = (
+         ("time_utc", times),
+         ("site_id", site_ids),
+     )
+
+     da_cap = xr.DataArray(
+         capacity_kwp,
+         coords=coords,
+     )
+
+     da_gen = xr.DataArray(
+         generation,
+         coords=coords,
+     )
+
+     # metadata
+     meta_df = pd.DataFrame(columns=[], data = [])
+     meta_df['site_id'] = site_ids
+     meta_df['capacity_kwp'] = capacity_kwp_1d
+     meta_df['longitude'] = longitude
+     meta_df['latitude'] = latitude
+
+     generation = xr.Dataset({
+         "capacity_kwp": da_cap,
+         "generation_kw": da_gen,
+     })
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         filename = tmpdir + "/sites.netcdf"
+         filename_csv = tmpdir + "/sites_metadata.csv"
+         generation.to_netcdf(filename)
+         meta_df.to_csv(filename_csv)
+
+         site = Site(file_path=filename,
+                     metadata_file_path=filename_csv,
+                     time_resolution_minutes=30,
+                     forecast_minutes=60,
+                     history_minutes=30)
+
+         yield site
+
+
  @pytest.fixture(scope="session")
  def uk_gsp_zarr_path(ds_uk_gsp):

tests/load/test_load_sites.py ADDED
@@ -0,0 +1,14 @@
+ from ocf_data_sampler.load.site import open_site
+ import xarray as xr
+
+
+ def test_open_site(data_sites):
+     da = open_site(data_sites)
+
+     assert isinstance(da, xr.DataArray)
+     assert da.dims == ("time_utc", "site_id")
+
+     assert "capacity_kwp" in da.coords
+     assert "latitude" in da.coords
+     assert "longitude" in da.coords
+     assert da.shape == (49, 10)
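
The asserted (49, 10) shape follows directly from the data_sites fixture above: 24 hours sampled half-hourly with inclusive endpoints gives 49 timestamps for the 10 sites, as a quick standalone check confirms:

    import pandas as pd
    len(pd.date_range("2023-01-01 00:00", "2023-01-02 00:00", freq="30min"))  # 49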
tests/numpy_batch/test_gsp.py CHANGED
@@ -1,7 +1,6 @@
  from ocf_data_sampler.load.gsp import open_gsp

- from ocf_data_sampler.numpy_batch import convert_gsp_to_numpy_batch
- from ocf_data_sampler.numpy_batch.gsp import GSPBatchKey
+ from ocf_data_sampler.numpy_batch import convert_gsp_to_numpy_batch, GSPBatchKey

  def test_convert_gsp_to_numpy_batch(uk_gsp_zarr_path):

tests/numpy_batch/test_nwp.py CHANGED
@@ -4,9 +4,7 @@ import xarray as xr

  import pytest

- from ocf_data_sampler.numpy_batch import convert_nwp_to_numpy_batch
-
- from ocf_data_sampler.numpy_batch.nwp import NWPBatchKey
+ from ocf_data_sampler.numpy_batch import convert_nwp_to_numpy_batch, NWPBatchKey

  @pytest.fixture(scope="module")
  def da_nwp_like():
tests/numpy_batch/test_satellite.py CHANGED
@@ -5,9 +5,7 @@ import xarray as xr

  import pytest

- from ocf_data_sampler.numpy_batch import convert_satellite_to_numpy_batch
-
- from ocf_data_sampler.numpy_batch.satellite import SatelliteBatchKey
+ from ocf_data_sampler.numpy_batch import convert_satellite_to_numpy_batch, SatelliteBatchKey


  @pytest.fixture(scope="module")
tests/numpy_batch/test_sun_position.py CHANGED
@@ -6,7 +6,7 @@ from ocf_data_sampler.numpy_batch.sun_position import (
      calculate_azimuth_and_elevation, make_sun_position_numpy_batch
  )

- from ocf_data_sampler.numpy_batch.gsp import GSPBatchKey
+ from ocf_data_sampler.numpy_batch import GSPBatchKey


  @pytest.mark.parametrize("lat", [0, 5, 10, 23.5])
@@ -71,11 +71,11 @@ def test_make_sun_position_numpy_batch():

      batch = make_sun_position_numpy_batch(datetimes, lon, lat, key_prefix="gsp")

-     assert GSPBatchKey.gsp_solar_elevation in batch
-     assert GSPBatchKey.gsp_solar_azimuth in batch
+     assert GSPBatchKey.solar_elevation in batch
+     assert GSPBatchKey.solar_azimuth in batch

      # The solar coords are normalised in the function
-     assert (batch[GSPBatchKey.gsp_solar_elevation]>=0).all()
-     assert (batch[GSPBatchKey.gsp_solar_elevation]<=1).all()
-     assert (batch[GSPBatchKey.gsp_solar_azimuth]>=0).all()
-     assert (batch[GSPBatchKey.gsp_solar_azimuth]<=1).all()
+     assert (batch[GSPBatchKey.solar_elevation]>=0).all()
+     assert (batch[GSPBatchKey.solar_elevation]<=1).all()
+     assert (batch[GSPBatchKey.solar_azimuth]>=0).all()
+     assert (batch[GSPBatchKey.solar_azimuth]<=1).all()
tests/torch_datasets/test_pvnet_uk_regional.py CHANGED
@@ -3,9 +3,7 @@ import tempfile

  from ocf_data_sampler.torch_datasets.pvnet_uk_regional import PVNetUKRegionalDataset
  from ocf_data_sampler.config import load_yaml_configuration, save_yaml_configuration
- from ocf_data_sampler.numpy_batch.nwp import NWPBatchKey
- from ocf_data_sampler.numpy_batch.gsp import GSPBatchKey
- from ocf_data_sampler.numpy_batch.satellite import SatelliteBatchKey
+ from ocf_data_sampler.numpy_batch import NWPBatchKey, GSPBatchKey, SatelliteBatchKey


  @pytest.fixture()
@@ -39,7 +37,7 @@ def test_pvnet(pvnet_config_filename):

      for key in [
          NWPBatchKey.nwp, SatelliteBatchKey.satellite_actual, GSPBatchKey.gsp,
-         GSPBatchKey.gsp_solar_azimuth, GSPBatchKey.gsp_solar_elevation,
+         GSPBatchKey.solar_azimuth, GSPBatchKey.solar_elevation,
      ]:
          assert key in sample

@@ -54,8 +52,8 @@ def test_pvnet(pvnet_config_filename):
      # 3 hours of 30 minute data (inclusive)
      assert sample[GSPBatchKey.gsp].shape == (7,)
      # Solar angles have same shape as GSP data
-     assert sample[GSPBatchKey.gsp_solar_azimuth].shape == (7,)
-     assert sample[GSPBatchKey.gsp_solar_elevation].shape == (7,)
+     assert sample[GSPBatchKey.solar_azimuth].shape == (7,)
+     assert sample[GSPBatchKey.solar_elevation].shape == (7,)

  def test_pvnet_no_gsp(pvnet_config_filename):