ocf-data-sampler 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocf_data_sampler/config/model.py +6 -0
- ocf_data_sampler/data/uk_gsp_locations_20250109.csv +333 -0
- ocf_data_sampler/load/gsp.py +37 -8
- ocf_data_sampler/load/load_dataset.py +8 -3
- ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +18 -12
- {ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/METADATA +1 -1
- {ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/RECORD +11 -9
- scripts/download_gsp_location_data.py +95 -0
- /ocf_data_sampler/data/{uk_gsp_locations.csv → uk_gsp_locations_20220314.csv} +0 -0
- {ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/WHEEL +0 -0
- {ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/top_level.txt +0 -0
ocf_data_sampler/load/gsp.py
CHANGED
@@ -6,25 +6,54 @@ import pandas as pd
 import xarray as xr
 
 
-def open_gsp(zarr_path: str) -> xr.DataArray:
+def get_gsp_boundaries(version: str) -> pd.DataFrame:
+    """Get the GSP boundaries for a given version.
+
+    Args:
+        version: Version of the GSP boundaries to use. Options are "20220314" or "20250109".
+
+    Returns:
+        pd.DataFrame: The GSP boundaries
+    """
+    if version not in ["20220314", "20250109"]:
+        raise ValueError(
+            "Invalid version. Options are '20220314' or '20250109'.",
+        )
+
+    return pd.read_csv(
+        files("ocf_data_sampler.data").joinpath(f"uk_gsp_locations_{version}.csv"),
+        index_col="gsp_id",
+    )
+
+
+def open_gsp(zarr_path: str, boundaries_version: str = "20220314") -> xr.DataArray:
     """Open the GSP data.
 
     Args:
         zarr_path: Path to the GSP zarr data
+        boundaries_version: Version of the GSP boundaries to use. Options are "20220314" or
+            "20250109".
 
     Returns:
         xr.DataArray: The opened GSP data
     """
-    ds = xr.open_zarr(zarr_path)
-
-    ds = ds.rename({"datetime_gmt": "time_utc"})
-
     # Load UK GSP locations
-    df_gsp_loc = pd.read_csv(
-        files("ocf_data_sampler.data").joinpath("uk_gsp_locations.csv"),
-        index_col="gsp_id",
+    df_gsp_loc = get_gsp_boundaries(boundaries_version)
+
+    # Open the GSP generation data
+    ds = (
+        xr.open_zarr(zarr_path)
+        .rename({"datetime_gmt": "time_utc"})
     )
 
+    if not (ds.gsp_id.isin(df_gsp_loc.index)).all():
+        raise ValueError(
+            "Some GSP IDs in the GSP generation data are not available in the locations file.",
+        )
+
+    # Select the locations by the GSP IDs in the generation data
+    df_gsp_loc = df_gsp_loc.loc[ds.gsp_id.values]
+
     # Add locations and capacities as coordinates for each GSP and datetime
     ds = ds.assign_coords(
         x_osgb=(df_gsp_loc.x_osgb.to_xarray()),
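Taken together, the new helper makes the boundary set an explicit choice: get_gsp_boundaries validates the version string and loads the packaged CSV, and open_gsp now fails fast if the generation data contains GSP IDs missing from that file. A minimal usage sketch, assuming a local zarr store at the placeholder path "gsp.zarr" (only get_gsp_boundaries and open_gsp come from this release; everything else here is illustrative):

import pandas as pd

from ocf_data_sampler.load.gsp import get_gsp_boundaries, open_gsp

# Load the 2025 boundary set; the result is a DataFrame indexed by gsp_id
df_boundaries: pd.DataFrame = get_gsp_boundaries("20250109")
print(df_boundaries[["x_osgb", "y_osgb"]].head())

# Open the GSP generation data against the matching boundary version.
# "gsp.zarr" is a placeholder path, not a file shipped with the package.
da_gsp = open_gsp(zarr_path="gsp.zarr", boundaries_version="20250109")
print(da_gsp.coords["x_osgb"])  # per-GSP OSGB easting attached by open_gsp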
ocf_data_sampler/load/load_dataset.py
CHANGED
@@ -6,8 +6,10 @@ from ocf_data_sampler.config import InputData
 from ocf_data_sampler.load import open_gsp, open_nwp, open_sat_data, open_site
 
 
-def get_dataset_dict(input_config: InputData, gsp_ids: list[int] | None = None)\
-        -> dict[str, dict[xr.DataArray] | xr.DataArray]:
+def get_dataset_dict(
+    input_config: InputData,
+    gsp_ids: list[int] | None = None,
+) -> dict[str, dict[xr.DataArray] | xr.DataArray]:
     """Construct dictionary of all of the input data sources.
 
     Args:
@@ -19,7 +21,10 @@ def get_dataset_dict(input_config: InputData, gsp_ids: list[int] | None = None)\
     # Load GSP data unless the path is None
     if input_config.gsp and input_config.gsp.zarr_path:
 
-        da_gsp = open_gsp(zarr_path=input_config.gsp.zarr_path).compute()
+        da_gsp = open_gsp(
+            zarr_path=input_config.gsp.zarr_path,
+            boundaries_version=input_config.gsp.boundaries_version,
+        ).compute()
 
         if gsp_ids is None:
             # Remove national (gsp_id=0)
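get_dataset_dict now forwards a boundaries_version read from the GSP section of the input config; the +6 lines in ocf_data_sampler/config/model.py (see the summary above) are where that field is defined. A hedged sketch of the call path, assuming a local "config.yaml" whose input_data.gsp block sets zarr_path and boundaries_version:

from ocf_data_sampler.config import load_yaml_configuration
from ocf_data_sampler.load.load_dataset import get_dataset_dict

# "config.yaml" is a placeholder for your own configuration file
config = load_yaml_configuration("config.yaml")

# The new field threads from the config into open_gsp above
print(config.input_data.gsp.boundaries_version)

# Build the dictionary of input data sources for a subset of GSP IDs
datasets = get_dataset_dict(config.input_data, gsp_ids=[1, 2, 3])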
ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py
CHANGED
@@ -1,7 +1,5 @@
 """Torch dataset for UK PVNet."""
 
-from importlib.resources import files
-
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -9,6 +7,7 @@ from torch.utils.data import Dataset
 from typing_extensions import override
 
 from ocf_data_sampler.config import Configuration, load_yaml_configuration
+from ocf_data_sampler.load.gsp import get_gsp_boundaries
 from ocf_data_sampler.load.load_dataset import get_dataset_dict
 from ocf_data_sampler.numpy_sample import (
     convert_gsp_to_numpy_sample,
@@ -47,22 +46,26 @@ def compute(xarray_dict: dict) -> dict:
     return xarray_dict
 
 
-def get_gsp_locations(gsp_ids: list[int] | None = None) -> list[Location]:
+def get_gsp_locations(
+    gsp_ids: list[int] | None = None,
+    version: str = "20220314",
+) -> list[Location]:
     """Get list of locations of all GSPs.
 
     Args:
-        gsp_ids: List of GSP IDs to include. Defaults to all
+        gsp_ids: List of GSP IDs to include. Defaults to all GSPs except national
+        version: Version of GSP boundaries to use. Defaults to "20220314"
     """
+    df_gsp_loc = get_gsp_boundaries(version)
+
+    # Default GSP IDs is all except national (gsp_id=0)
     if gsp_ids is None:
-        gsp_ids =
+        gsp_ids = df_gsp_loc.index.values
+        gsp_ids = gsp_ids[gsp_ids != 0]
 
-
+    df_gsp_loc = df_gsp_loc.loc[gsp_ids]
 
-
-    df_gsp_loc = pd.read_csv(
-        files("ocf_data_sampler.data").joinpath("uk_gsp_locations.csv"),
-        index_col="gsp_id",
-    )
+    locations = []
 
     for gsp_id in gsp_ids:
         locations.append(
@@ -108,7 +111,10 @@ class AbstractPVNetUKDataset(Dataset):
         valid_t0_times = valid_t0_times[valid_t0_times <= pd.Timestamp(end_time)]
 
         # Construct list of locations to sample from
-        self.locations = get_gsp_locations(gsp_ids)
+        self.locations = get_gsp_locations(
+            gsp_ids,
+            version=config.input_data.gsp.boundaries_version,
+        )
         self.valid_t0_times = valid_t0_times
 
         # Assign config and input data to self
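get_gsp_locations therefore no longer hard-codes the packaged CSV: it delegates to get_gsp_boundaries and defaults to every GSP except national. A short sketch of both call styles (the specific IDs are illustrative, not special values):

from ocf_data_sampler.torch_datasets.datasets.pvnet_uk import get_gsp_locations

# All regional GSPs (national gsp_id=0 is excluded) under the 2025 boundaries
locations = get_gsp_locations(version="20250109")
print(len(locations))

# An explicit subset under the default "20220314" boundaries
subset = get_gsp_locations(gsp_ids=[1, 140, 317])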
{ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/RECORD
CHANGED
@@ -2,12 +2,13 @@ ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,
 ocf_data_sampler/utils.py,sha256=DjuneGGisl08ENvPZV_lrcX4b2NCKJC1ZpXgIpxuQi4,290
 ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
 ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
-ocf_data_sampler/config/model.py,sha256=
+ocf_data_sampler/config/model.py,sha256=SyjtlSK6gzQHWUfgX3VNKYLODyiKuD0Mu4hlm9GoHeg,10427
 ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
-ocf_data_sampler/data/uk_gsp_locations.csv,sha256=
+ocf_data_sampler/data/uk_gsp_locations_20220314.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
+ocf_data_sampler/data/uk_gsp_locations_20250109.csv,sha256=XZISFatnbpO9j8LwaxNKFzQSjs6hcHFsV8a9uDDpy2E,9055334
 ocf_data_sampler/load/__init__.py,sha256=-vQP9g0UOWdVbjEGyVX_ipa7R1btmiETIKAf6aw4d78,201
-ocf_data_sampler/load/gsp.py,sha256=
-ocf_data_sampler/load/load_dataset.py,sha256=
+ocf_data_sampler/load/gsp.py,sha256=UfPxwHw2Dw2xYSO5Al28oTamgnEM_n_4bYXsqGwY5Tc,1884
+ocf_data_sampler/load/load_dataset.py,sha256=sIi0nkijR_-1fRfW5JcXNTR0ccGbpkhxb7JX_zjJ-W4,1956
 ocf_data_sampler/load/satellite.py,sha256=E7Ln7Y60Qr1RTV-_R71YoxXQM-Ca7Y1faIo3oKB2eFk,2292
 ocf_data_sampler/load/site.py,sha256=zOzlWk6pYZBB5daqG8URGksmDXWKrkutUvN8uALAIh8,1468
 ocf_data_sampler/load/utils.py,sha256=sZ0-zzconcLkVQwAkCYrqKDo98Hrh5ChdiQJv5Bh91g,2040
@@ -38,7 +39,7 @@ ocf_data_sampler/select/location.py,sha256=AZvGR8y62opiW7zACGXjoOtBEWRfSLOZIA73O
 ocf_data_sampler/select/select_spatial_slice.py,sha256=liAqIa-Amj58pOqx5r16i99HURj9oQ41j7gnPgRDQP4,8201
 ocf_data_sampler/select/select_time_slice.py,sha256=HeHbwZ0CP03x0-LaJtpbSdtpLufwVTR73p6wH6O_PS8,5513
 ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=jfJSFcR0eO1AqeH7S3KnGjsBqVZT5w3oyi784PUR6Q0,146
-ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=
+ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=ZV2FoMPxFU2aPTWipj9HhJhGfrEg9MYOJRNR8aFcmvs,12613
 ocf_data_sampler/torch_datasets/datasets/site.py,sha256=nRUlhXQQGVrTuBmE1QnwXAUsPTXz0dsezlQjwK71jIQ,17641
 ocf_data_sampler/torch_datasets/sample/__init__.py,sha256=GL84vdZl_SjHDGVyh9Uekx2XhPYuZ0dnO3l6f6KXnHI,100
 ocf_data_sampler/torch_datasets/sample/base.py,sha256=cQ1oIyhdmlotejZK8B3Cw6MNvpdnBPD8G_o2h7Ye4Vc,2206
@@ -51,9 +52,10 @@ ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py,sha256=Hvz0wH
 ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=1DN6VsWWdLvkpJxodZtBRDUgC4vJE2td_RP5J3ZqPNw,4268
 ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=xcy75cVxl0WrglnX5YUAFjXXlO2GwEBHWyqo8TDuiOA,4714
 ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul3l0EP73Ik002fStr_bhsZh9mQqEU,4735
+scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
 scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
 utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
-ocf_data_sampler-0.2.16.dist-info/METADATA,sha256=
-ocf_data_sampler-0.2.16.dist-info/WHEEL,sha256=
-ocf_data_sampler-0.2.16.dist-info/top_level.txt,sha256=
-ocf_data_sampler-0.2.16.dist-info/RECORD,,
+ocf_data_sampler-0.2.17.dist-info/METADATA,sha256=OKEhg6yBn1fCJKsWOBngnCXVSSd5G5VvOnck0J8bXxw,11581
+ocf_data_sampler-0.2.17.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ocf_data_sampler-0.2.17.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
+ocf_data_sampler-0.2.17.dist-info/RECORD,,
scripts/download_gsp_location_data.py
ADDED
@@ -0,0 +1,93 @@
+"""This script downloads the GSP location data from the Neso API and saves it to a CSV file.
+
+This script was used to create the `uk_gsp_locations_20250109.csv` file in the `data` directory.
+"""
+
+import io
+import os
+import tempfile
+import zipfile
+
+import geopandas as gpd
+import pandas as pd
+import requests
+
+# --- Configuration ---
+GSP_REGIONS_URL = (
+    "https://api.neso.energy/dataset/2810092e-d4b2-472f-b955-d8bea01f9ec0/"
+    "resource/d95e8c1b-9cd9-41dd-aacb-4b53b8c07c20/download/gsp_regions_20250109.zip"
+)
+# This is the path to the OSGB version of the boundaries. The lon-lat version can be found at:
+# Proj_4326/GSP_regions_4326_20250109.geojson
+GSP_REGIONS_GEOJSON_PATH_IN_ZIP = "Proj_27700/GSP_regions_27700_20250109.geojson"
+GSP_NAME_MAP_URL = "https://api.pvlive.uk/pvlive/api/v4/gsp_list"
+SAVE_PATH = "uk_gsp_locations_20250109.csv"
+# --- End Configuration ---
+
+
+with tempfile.TemporaryDirectory() as tmpdirname:
+
+    # Download the GSP regions
+    response_regions = requests.get(GSP_REGIONS_URL, timeout=30)
+    response_regions.raise_for_status()
+
+    # Unzip
+    with zipfile.ZipFile(io.BytesIO(response_regions.content)) as z:
+        geojson_extract_path = os.path.join(tmpdirname, GSP_REGIONS_GEOJSON_PATH_IN_ZIP)
+        z.extract(GSP_REGIONS_GEOJSON_PATH_IN_ZIP, tmpdirname)
+
+    # Load the GSP regions
+    df_bound = gpd.read_file(geojson_extract_path)
+
+    # Download the GSP name mapping
+    response_map = requests.get(GSP_NAME_MAP_URL, timeout=10)
+    response_map.raise_for_status()
+
+    # Load the GSP name mapping
+    gsp_name_map = response_map.json()
+    df_gsp_name_map = (
+        pd.DataFrame(data=gsp_name_map["data"], columns=gsp_name_map["meta"])
+        .drop("pes_id", axis=1)
+    )
+
+
+def combine_gsps(gdf: gpd.GeoDataFrame) -> gpd.GeoSeries:
+    """Combine GSPs which have been split into multiple rows."""
+    # If only one row for the GSP name then just return the row
+    if len(gdf) == 1:
+        return gdf.iloc[0]
+
+    # If multiple rows for the GSP then get union of the GSP shapes
+    else:
+        return gpd.GeoSeries(gdf.unary_union, index=["geometry"], crs=gdf.crs)
+
+
+# Combine GSPs which have been split into multiple rows
+df_bound = (
+    df_bound.groupby("GSPs")
+    .apply(combine_gsps, include_groups=False)
+    .reset_index()
+)
+
+# Add the PVLive GSP ID for each GSP
+df_bound = (
+    df_bound.merge(df_gsp_name_map, left_on="GSPs", right_on="gsp_name")
+    .drop("GSPs", axis=1)
+)
+
+# Add the national GSP - this is the union of all GSPs
+national_boundaries = gpd.GeoDataFrame(
+    [["NATIONAL", df_bound.unary_union, 0]],
+    columns=["gsp_name", "geometry", "gsp_id"],
+    crs=df_bound.crs,
+)
+
+df_bound = pd.concat([national_boundaries, df_bound], ignore_index=True)
+
+# Add the coordinates for the centroid of each GSP
+df_bound["x_osgb"] = df_bound.geometry.centroid.x
+df_bound["y_osgb"] = df_bound.geometry.centroid.y
+
+# Reorder columns, sort by gsp_id (increasing) and save
+columns = ["gsp_id", "gsp_name", "geometry", "x_osgb", "y_osgb"]
+df_bound[columns].sort_values("gsp_id").to_csv(SAVE_PATH, index=False)
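Since to_csv serialises the geometry column as WKT text, the saved file can be consumed without geopandas. A hedged sketch of reading the output back (the shapely round-trip is an assumption about how one might use the file, not part of the script):

import pandas as pd
import shapely.wkt

# Rebuild shapely geometries from their WKT string form
df = pd.read_csv("uk_gsp_locations_20250109.csv", index_col="gsp_id")
df["geometry"] = df["geometry"].apply(shapely.wkt.loads)

# gsp_id 0 is the national boundary, the union of all GSP regions
print(df.loc[0, "gsp_name"], df.loc[0, "geometry"].area)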
/ocf_data_sampler/data/{uk_gsp_locations.csv → uk_gsp_locations_20220314.csv}
File without changes

{ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/WHEEL
File without changes

{ocf_data_sampler-0.2.16.dist-info → ocf_data_sampler-0.2.17.dist-info}/top_level.txt
File without changes