ocf-data-sampler 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (78)
  1. ocf_data_sampler/config/load.py +3 -3
  2. ocf_data_sampler/config/model.py +146 -64
  3. ocf_data_sampler/config/save.py +5 -4
  4. ocf_data_sampler/load/gsp.py +6 -5
  5. ocf_data_sampler/load/load_dataset.py +5 -6
  6. ocf_data_sampler/load/nwp/nwp.py +17 -5
  7. ocf_data_sampler/load/nwp/providers/ecmwf.py +6 -7
  8. ocf_data_sampler/load/nwp/providers/gfs.py +36 -0
  9. ocf_data_sampler/load/nwp/providers/icon.py +46 -0
  10. ocf_data_sampler/load/nwp/providers/ukv.py +4 -5
  11. ocf_data_sampler/load/nwp/providers/utils.py +3 -1
  12. ocf_data_sampler/load/satellite.py +9 -10
  13. ocf_data_sampler/load/site.py +10 -6
  14. ocf_data_sampler/load/utils.py +21 -16
  15. ocf_data_sampler/numpy_sample/collate.py +10 -9
  16. ocf_data_sampler/numpy_sample/datetime_features.py +3 -5
  17. ocf_data_sampler/numpy_sample/gsp.py +12 -14
  18. ocf_data_sampler/numpy_sample/nwp.py +12 -12
  19. ocf_data_sampler/numpy_sample/satellite.py +9 -9
  20. ocf_data_sampler/numpy_sample/site.py +5 -8
  21. ocf_data_sampler/numpy_sample/sun_position.py +16 -21
  22. ocf_data_sampler/sample/base.py +15 -17
  23. ocf_data_sampler/sample/site.py +13 -20
  24. ocf_data_sampler/sample/uk_regional.py +29 -35
  25. ocf_data_sampler/select/dropout.py +16 -14
  26. ocf_data_sampler/select/fill_time_periods.py +15 -5
  27. ocf_data_sampler/select/find_contiguous_time_periods.py +88 -75
  28. ocf_data_sampler/select/geospatial.py +63 -54
  29. ocf_data_sampler/select/location.py +16 -51
  30. ocf_data_sampler/select/select_spatial_slice.py +105 -89
  31. ocf_data_sampler/select/select_time_slice.py +71 -58
  32. ocf_data_sampler/select/spatial_slice_for_dataset.py +7 -6
  33. ocf_data_sampler/select/time_slice_for_dataset.py +17 -16
  34. ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +140 -131
  35. ocf_data_sampler/torch_datasets/datasets/site.py +152 -112
  36. ocf_data_sampler/torch_datasets/utils/__init__.py +3 -0
  37. ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py +11 -0
  38. ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +6 -2
  39. ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +23 -22
  40. ocf_data_sampler/utils.py +3 -1
  41. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/METADATA +7 -18
  42. ocf_data_sampler-0.1.17.dist-info/RECORD +56 -0
  43. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/WHEEL +1 -1
  44. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/top_level.txt +1 -1
  45. scripts/refactor_site.py +63 -33
  46. utils/compute_icon_mean_stddev.py +72 -0
  47. ocf_data_sampler/constants.py +0 -222
  48. ocf_data_sampler/torch_datasets/utils/validate_channels.py +0 -82
  49. ocf_data_sampler-0.1.11.dist-info/LICENSE +0 -21
  50. ocf_data_sampler-0.1.11.dist-info/RECORD +0 -82
  51. tests/__init__.py +0 -0
  52. tests/config/test_config.py +0 -113
  53. tests/config/test_load.py +0 -7
  54. tests/config/test_save.py +0 -28
  55. tests/conftest.py +0 -319
  56. tests/load/test_load_gsp.py +0 -15
  57. tests/load/test_load_nwp.py +0 -21
  58. tests/load/test_load_satellite.py +0 -17
  59. tests/load/test_load_sites.py +0 -14
  60. tests/numpy_sample/test_collate.py +0 -21
  61. tests/numpy_sample/test_datetime_features.py +0 -37
  62. tests/numpy_sample/test_gsp.py +0 -38
  63. tests/numpy_sample/test_nwp.py +0 -13
  64. tests/numpy_sample/test_satellite.py +0 -40
  65. tests/numpy_sample/test_sun_position.py +0 -81
  66. tests/select/test_dropout.py +0 -69
  67. tests/select/test_fill_time_periods.py +0 -28
  68. tests/select/test_find_contiguous_time_periods.py +0 -202
  69. tests/select/test_location.py +0 -67
  70. tests/select/test_select_spatial_slice.py +0 -154
  71. tests/select/test_select_time_slice.py +0 -275
  72. tests/test_sample/test_base.py +0 -164
  73. tests/test_sample/test_site_sample.py +0 -165
  74. tests/test_sample/test_uk_regional_sample.py +0 -136
  75. tests/torch_datasets/test_merge_and_fill_utils.py +0 -40
  76. tests/torch_datasets/test_pvnet_uk.py +0 -154
  77. tests/torch_datasets/test_site.py +0 -226
  78. tests/torch_datasets/test_validate_channels_utils.py +0 -78
ocf_data_sampler/torch_datasets/utils/valid_time_periods.py CHANGED
@@ -1,34 +1,31 @@
+"""Functions pertaining to finding valid time periods for the input data."""
+
 import numpy as np
 import pandas as pd
 
 from ocf_data_sampler.config import Configuration
 from ocf_data_sampler.select.find_contiguous_time_periods import (
+    find_contiguous_t0_periods,
     find_contiguous_t0_periods_nwp,
-    find_contiguous_t0_periods,
     intersection_of_multiple_dataframes_of_periods,
 )
 from ocf_data_sampler.utils import minutes
 
 
-
-def find_valid_time_periods(
-    datasets_dict: dict,
-    config: Configuration,
-):
-    """Find the t0 times where all of the requested input data is available
+def find_valid_time_periods(datasets_dict: dict, config: Configuration) -> pd.DataFrame:
+    """Find the t0 times where all of the requested input data is available.
 
     Args:
         datasets_dict: A dictionary of input datasets
         config: Configuration file
     """
+    if not set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"}):
+        raise ValueError(f"Invalid keys in datasets_dict: {datasets_dict.keys()}")
 
-    assert set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"})
-
-    contiguous_time_periods: dict[str: pd.DataFrame] = {}  # Used to store contiguous time periods from each data source
-
+    # Used to store contiguous time periods from each data source
+    contiguous_time_periods: dict[str : pd.DataFrame] = {}
     if "nwp" in datasets_dict:
         for nwp_key, nwp_config in config.input_data.nwp.items():
-
             da = datasets_dict["nwp"][nwp_key]
 
             if nwp_config.dropout_timedeltas_minutes is None:
@@ -59,8 +56,12 @@ def find_valid_time_periods(
                 max_staleness = max_possible_staleness
             else:
                 # Make sure the max acceptable staleness isn't longer than the max possible
-                assert max_staleness <= max_possible_staleness
-
+                if max_staleness > max_possible_staleness:
+                    raise ValueError(
+                        f"max_staleness_minutes is too long for the input data, "
+                        f"{max_staleness=}, {max_possible_staleness=}",
+                    )
+
             # Find the first forecast step
             first_forecast_step = pd.Timedelta(da["step"].min().item())
 
@@ -69,34 +70,34 @@ def find_valid_time_periods(
                 interval_start=minutes(nwp_config.interval_start_minutes),
                 max_staleness=max_staleness,
                 max_dropout=max_dropout,
-                first_forecast_step = first_forecast_step,
+                first_forecast_step=first_forecast_step,
             )
 
-            contiguous_time_periods[f'nwp_{nwp_key}'] = time_periods
+            contiguous_time_periods[f"nwp_{nwp_key}"] = time_periods
 
     if "sat" in datasets_dict:
         sat_config = config.input_data.satellite
 
         time_periods = find_contiguous_t0_periods(
             pd.DatetimeIndex(datasets_dict["sat"]["time_utc"]),
-            sample_period_duration=minutes(sat_config.time_resolution_minutes),
+            time_resolution=minutes(sat_config.time_resolution_minutes),
             interval_start=minutes(sat_config.interval_start_minutes),
             interval_end=minutes(sat_config.interval_end_minutes),
         )
 
-        contiguous_time_periods['sat'] = time_periods
+        contiguous_time_periods["sat"] = time_periods
 
     if "gsp" in datasets_dict:
         gsp_config = config.input_data.gsp
 
         time_periods = find_contiguous_t0_periods(
             pd.DatetimeIndex(datasets_dict["gsp"]["time_utc"]),
-            sample_period_duration=minutes(gsp_config.time_resolution_minutes),
+            time_resolution=minutes(gsp_config.time_resolution_minutes),
             interval_start=minutes(gsp_config.interval_start_minutes),
             interval_end=minutes(gsp_config.interval_end_minutes),
         )
 
-        contiguous_time_periods['gsp'] = time_periods
+        contiguous_time_periods["gsp"] = time_periods
 
     # just get the values (not the keys)
     contiguous_time_periods_values = list(contiguous_time_periods.values())
@@ -104,7 +105,7 @@ def find_valid_time_periods(
     # Find joint overlapping contiguous time periods
     if len(contiguous_time_periods_values) > 1:
         valid_time_periods = intersection_of_multiple_dataframes_of_periods(
-            contiguous_time_periods_values
+            contiguous_time_periods_values,
         )
     else:
         valid_time_periods = contiguous_time_periods_values[0]
@@ -113,4 +114,4 @@ def find_valid_time_periods(
     if len(valid_time_periods) == 0:
         raise ValueError(f"No valid time periods found, {contiguous_time_periods=}")
 
-    return valid_time_periods
+    return valid_time_periods
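The substantive API change in this file is the renamed keyword of `find_contiguous_t0_periods` (`sample_period_duration` → `time_resolution`), with the asserts becoming `ValueError`s. A minimal sketch of the renamed call, assuming ocf-data-sampler 0.1.17 is installed; the timestamps are invented:

import pandas as pd

from ocf_data_sampler.select.find_contiguous_time_periods import find_contiguous_t0_periods
from ocf_data_sampler.utils import minutes

# Invented 30-minutely timestamps with a gap from 02:30 to 04:00.
datetimes = pd.date_range("2024-01-01 00:00", "2024-01-01 06:00", freq="30min").delete([5, 6, 7, 8])

periods = find_contiguous_t0_periods(
    datetimes,
    time_resolution=minutes(30),  # 0.1.17 spelling; was sample_period_duration
    interval_start=minutes(-60),  # history needed before each t0
    interval_end=minutes(60),     # future data needed after each t0
)
# As used above, the result is a DataFrame of contiguous time periods, suitable
# for intersection_of_multiple_dataframes_of_periods.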
ocf_data_sampler/utils.py CHANGED
@@ -1,8 +1,10 @@
+"""Miscellaneous helper functions."""
+
 import pandas as pd
 
 
 def minutes(minutes: int | list[float]) -> pd.Timedelta | pd.TimedeltaIndex:
-    """Timedelta minutes
+    """Timedelta minutes.
 
     Args:
         minutes: the number of minutes, single value or list
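The signature accepts a scalar or a list, which a two-line example pins down (return reprs shown as comments; standard pandas behaviour):

from ocf_data_sampler.utils import minutes

minutes(30)        # pd.Timedelta("0 days 00:30:00")
minutes([30, 60])  # pd.TimedeltaIndex(["0 days 00:30:00", "0 days 01:00:00"])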
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/METADATA CHANGED
@@ -1,10 +1,8 @@
 Metadata-Version: 2.2
-Name: ocf_data_sampler
-Version: 0.1.11
-Summary: Sample from weather data for renewable energy prediction
-Author: James Fulton, Peter Dudfield, and the Open Climate Fix team
-Author-email: info@openclimatefix.org
-Maintainer: Open Climate Fix Ltd
+Name: ocf-data-sampler
+Version: 0.1.17
+Author: James Fulton, Peter Dudfield
+Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
 
 Copyright (c) 2023 Open Climate Fix
@@ -27,21 +25,18 @@ License: MIT License
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
-Project-URL: homepage, https://github.com/openclimatefix
 Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
-Keywords: weather data,renewable energy prediction,sample weather data
+Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Operating System :: POSIX :: Linux
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: xarray
 Requires-Dist: zarr==2.18.3
 Requires-Dist: dask
+Requires-Dist: matplotlib
 Requires-Dist: ocf_blosc2
 Requires-Dist: pvlib
 Requires-Dist: pydantic
@@ -50,11 +45,6 @@ Requires-Dist: pathy
 Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.2; extra == "docs"
-Requires-Dist: mkdocs-material>=8.0; extra == "docs"
-Provides-Extra: plot
-Requires-Dist: matplotlib; extra == "plot"
 
 # ocf-data-sampler
 
@@ -77,7 +67,6 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
 
 **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
 
-
 ## FAQ
 
 If you have any questions about this or any other of our repos, don't hesitate to hop to our [Discussions Page](https://github.com/orgs/openclimatefix/discussions)!
ocf_data_sampler-0.1.17.dist-info/RECORD ADDED
@@ -0,0 +1,56 @@
+ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ocf_data_sampler/utils.py,sha256=DjuneGGisl08ENvPZV_lrcX4b2NCKJC1ZpXgIpxuQi4,290
+ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
+ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
+ocf_data_sampler/config/model.py,sha256=y8maV_1z0LL_m0J607ka_yJ0KI-0ssYDn5Ghk8aNgR0,10189
+ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
+ocf_data_sampler/data/uk_gsp_locations.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
+ocf_data_sampler/load/__init__.py,sha256=T5Zj1PGt0aiiNEN7Ra1Ac-cBsNKhphmmHy_8g7XU_w0,219
+ocf_data_sampler/load/gsp.py,sha256=keB3Nv_CNK1P6pS9Kdfc8PoZXTI1_YFN-spsvEv_Ewc,899
+ocf_data_sampler/load/load_dataset.py,sha256=0NyDxCDfgE_esKVW3s-rZEe16WB30FQ74ClWlrIo72M,1602
+ocf_data_sampler/load/satellite.py,sha256=E7Ln7Y60Qr1RTV-_R71YoxXQM-Ca7Y1faIo3oKB2eFk,2292
+ocf_data_sampler/load/site.py,sha256=zOzlWk6pYZBB5daqG8URGksmDXWKrkutUvN8uALAIh8,1468
+ocf_data_sampler/load/utils.py,sha256=Jwbr1rpEa3cefjw-OTVRaxnIHyGixYB3TlTlta0BOdU,1727
+ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
+ocf_data_sampler/load/nwp/nwp.py,sha256=0AIHQTJLUtwP2Toz_PskOTYFJXfMvGhk8faAcNvI9jk,922
+ocf_data_sampler/load/nwp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=5AzktPJgertCx6oij6aePRosPuZHGFznMxTgtkk_mgc,994
+ocf_data_sampler/load/nwp/providers/gfs.py,sha256=JSDeh4YG1wibV8--P3X-zTO8LP0dsJcpFvIyglBbhi0,979
+ocf_data_sampler/load/nwp/providers/icon.py,sha256=yYUrs5HgjU0C5pMHBB6FGn3tLjswi990IY6QCXS1Zmw,1569
+ocf_data_sampler/load/nwp/providers/ukv.py,sha256=-0v8JCLH8ypz8GMXZ6Rrx-I0LoHuHO8sXFupbC1RpM0,1013
+ocf_data_sampler/load/nwp/providers/utils.py,sha256=cJZ9JA4W_ZeTcLQ5z71w46_DJaPcW_2JMmBdjP9r3qs,835
+ocf_data_sampler/numpy_sample/__init__.py,sha256=nY5C6CcuxiWZ_jrXRzWtN7WyKXhJImSiVTIG6Rz4B_4,401
+ocf_data_sampler/numpy_sample/collate.py,sha256=I9YPcbxOwHYaDGKbzxqdV-3DFEHkzqdhAwnW7_tZH2w,1966
+ocf_data_sampler/numpy_sample/datetime_features.py,sha256=INudxHcoB_c-GvYXe08S4Up_8TU5zOJ39PWRrTKfLp8,1203
+ocf_data_sampler/numpy_sample/gsp.py,sha256=EDaQdOVEDBJGrXsq54UNBfpXTzi0ky_WpgBbmlyxOXM,1074
+ocf_data_sampler/numpy_sample/nwp.py,sha256=iBGOdLMn-F5yR3juX3l4G2oXDpvGNuUdcR6ZCZkCqZk,1037
+ocf_data_sampler/numpy_sample/satellite.py,sha256=oBlyNpO-syoyK4SSghoHqIDNyhcBqyd1L6eXSSw0k3w,1036
+ocf_data_sampler/numpy_sample/site.py,sha256=tpX7j6dTOz2YmOFIzVYqTfWvIduKlOnBcLITsuPMgxU,1250
+ocf_data_sampler/numpy_sample/sun_position.py,sha256=nkfgN6NmiLGoLSuJZrDsM-6nsIzJN75tWfN20Z7n8xo,1480
+ocf_data_sampler/sample/__init__.py,sha256=zdS73NTnxFX_j8uh9tT-IXiURB6635wbneM1koWYV1o,169
+ocf_data_sampler/sample/base.py,sha256=lnr-MNRpAxjVFJHCEvCZL86NrYy9LWnNOsLWBGDL8kc,2359
+ocf_data_sampler/sample/site.py,sha256=4aJys40CQ-2CRKo_dgvm3rINTdfyTGWQGEaXGbh58qQ,1236
+ocf_data_sampler/sample/uk_regional.py,sha256=uMtLdqZCsKttjFmhIC6JITzu2JDZh-VQdYUfbpyhgFM,2409
+ocf_data_sampler/select/__init__.py,sha256=E4AJulEbO2K-o0UlG1fgaEteuf_1ZFjHTvrotXSb4YU,332
+ocf_data_sampler/select/dropout.py,sha256=_rzXl8_4VHTY_JMjbaoWopaFCJmLdaBpqfYF4vr24tk,1638
+ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
+ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=cEXrQDzk8pXknxB0q3v5DakosagHMoLDAj302B8Xpw0,11537
+ocf_data_sampler/select/geospatial.py,sha256=CDExkl36eZOKmdJPzUr_K0Wn3axHqv5nYo-EkSiINcc,5032
+ocf_data_sampler/select/location.py,sha256=AZvGR8y62opiW7zACGXjoOtBEWRfSLOZIA73O5Deu0c,1037
+ocf_data_sampler/select/select_spatial_slice.py,sha256=qY2Ll00EPA80oBtzwMoR5nk0UIpoWZF9oXl22YwWr0Q,12341
+ocf_data_sampler/select/select_time_slice.py,sha256=q5QdgHPIXQb49uT5NwbOguY1GhjWc_o3c-2cDb5kLAo,5455
+ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
+ocf_data_sampler/select/time_slice_for_dataset.py,sha256=1DN6VsWWdLvkpJxodZtBRDUgC4vJE2td_RP5J3ZqPNw,4268
+ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=jfJSFcR0eO1AqeH7S3KnGjsBqVZT5w3oyi784PUR6Q0,146
+ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=ZkXm0IQEIzZUi8O-qJJz2PbJr9T4ZvutL424yRQUJhc,12878
+ocf_data_sampler/torch_datasets/datasets/site.py,sha256=j29cWPIcksRbge014MxR0_OgJqoskdki6KqvtoHtxpY,18023
+ocf_data_sampler/torch_datasets/utils/__init__.py,sha256=7Yt4anQVU9y27nj4Wx1tRLqbAQLbzW0ED71UL65LvxA,187
+ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py,sha256=MGylKhXxXLQC2fYv-8L_GVoYhov3LcEwC0Q21xItDSk,353
+ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
+ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=LdHgLPAYUVoCRMk2nnFdsMpygGS2kbps3h-7_bZnETw,4718
+scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
+utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
+ocf_data_sampler-0.1.17.dist-info/METADATA,sha256=RI0JClDkwWGjw7gel_j-k2B-SmMKMFLwHdDqEVP5R0U,11713
+ocf_data_sampler-0.1.17.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ocf_data_sampler-0.1.17.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
+ocf_data_sampler-0.1.17.dist-info/RECORD,,
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/top_level.txt CHANGED
@@ -1,3 +1,3 @@
 ocf_data_sampler
 scripts
-tests
+utils
scripts/refactor_site.py CHANGED
@@ -1,50 +1,80 @@
-""" Helper functions for refactoring legacy site data """
+"""Refactor legacy site data into a more structured format."""
+
+import pandas as pd
 import xarray as xr
 
-def legacy_format(data_ds, metadata_df):
-    """This formats old legacy data to the new format.
 
-    1. This renames the columns in the metadata
-    2. Re-formats the site data from data variables named by the site_id to
-    a data array with a site_id dimension. Also adds capacity_kwp to the dataset as a time series for each site_id
-    """
+def legacy_format(data_ds: xr.Dataset, metadata_df: pd.DataFrame) -> xr.Dataset:
+    """Converts old legacy site data into a more structured format.
+
+    This function does three main things:
+    1. Renames some columns in the metadata to keep things consistent.
+    2. Reshapes site data so that instead of having separate variables for each site,
+       we use a `site_id` dimension—makes life easier for analysis.
+    3. Adds `capacity_kwp` as a time series so that each site has its capacity info.
+
+    Parameters:
+        data_ds (xr.Dataset): The dataset containing legacy site data.
+        metadata_df (pd.DataFrame): A DataFrame with metadata about the sites.
 
+    Returns:
+        xr.Dataset: Reformatted dataset with `generation_kw` and `capacity_kwp`.
+    """
+    # Step 1: Rename metadata columns to match the new expected format
     if "system_id" in metadata_df.columns:
-        metadata_df["site_id"] = metadata_df["system_id"]
+        metadata_df = metadata_df.rename(columns={"system_id": "site_id"})
 
+    # Convert capacity from megawatts to kilowatts if needed
     if "capacity_megawatts" in metadata_df.columns:
         metadata_df["capacity_kwp"] = metadata_df["capacity_megawatts"] * 1000
 
-    # only site data has the site_id as data variables.
-    # We want to join them all together and create another coordinate called site_id
+    # Quick sanity check to ensure we have what we need
+    if "site_id" not in metadata_df.columns or "capacity_kwp" not in metadata_df.columns:
+        raise ValueError("Metadata is missing required columns: 'site_id' and 'capacity_kwp'.")
+
+    # Step 2: Transform the dataset
+    # Check if we actually have site data in the expected format
    if "0" in data_ds:
-        gen_df = data_ds.to_dataframe()
-        gen_da = xr.DataArray(
-            data=gen_df.values,
-            coords=(
-                ("time_utc", gen_df.index.values),
-                ("site_id", metadata_df["site_id"]),
-            ),
+        # Convert the dataset into a DataFrame so we can manipulate it more easily
+        site_data_df = data_ds.to_dataframe()
+
+        # Create a DataArray for generation data
+        generation_da = xr.DataArray(
+            data=site_data_df.values,
+            coords={
+                "time_utc": site_data_df.index.values,
+                "site_id": metadata_df["site_id"].values,
+            },
+            dims=["time_utc", "site_id"],
             name="generation_kw",
         )
 
-        capacity_df = gen_df
-        for col in capacity_df.columns:
-            capacity_df[col] = metadata_df[metadata_df["site_id"].astype(str) == col][
-                "capacity_kwp"
-            ].iloc[0]
+        # Step 3: Attach capacity information
+        # Map site_ids to their respective capacities
+        site_ids = site_data_df.columns
+        capacities = metadata_df.set_index("site_id").loc[site_ids, "capacity_kwp"]
+
+        # Broadcast capacities across all timestamps
+        capacity_df = pd.DataFrame(
+            {site_id: [capacities[site_id]] * len(site_data_df) for site_id in site_ids},
+            index=site_data_df.index,
+        )
+
+        # Create a DataArray for capacity data
         capacity_da = xr.DataArray(
            data=capacity_df.values,
-            coords=(
-                ("time_utc", gen_df.index.values),
-                ("site_id", metadata_df["site_id"]),
-            ),
+            coords={
+                "time_utc": site_data_df.index.values,
+                "site_id": metadata_df["site_id"].values,
+            },
+            dims=["time_utc", "site_id"],
             name="capacity_kwp",
         )
-        data_ds = xr.Dataset(
-            {
-                "generation_kw": gen_da,
-                "capacity_kwp": capacity_da,
-            }
-        )
-        return data_ds
+
+    # Finally, bundle everything into a single Dataset
+    data_ds = xr.Dataset({
+        "generation_kw": generation_da,
+        "capacity_kwp": capacity_da,
+    })
+
+    return data_ds
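To make the reshaping concrete, here is a hedged sketch feeding `legacy_format` a tiny synthetic legacy dataset. The site IDs and capacities are invented; note that the legacy data variables are named by site ID as strings ("0", "1", ...), so the metadata IDs below are strings too:

import numpy as np
import pandas as pd
import xarray as xr

from scripts.refactor_site import legacy_format

times = pd.date_range("2024-06-01", periods=4, freq="30min")

# Legacy layout: one data variable per site, named by the site ID.
data_ds = xr.Dataset(
    {"0": ("time_utc", np.random.rand(4)), "1": ("time_utc", np.random.rand(4))},
    coords={"time_utc": times},
)
metadata_df = pd.DataFrame({"system_id": ["0", "1"], "capacity_megawatts": [1.2, 3.4]})

new_ds = legacy_format(data_ds, metadata_df)
# new_ds["generation_kw"] and new_ds["capacity_kwp"] both have dims
# (time_utc, site_id), with capacity broadcast to every timestamp
# (1200.0 kWp and 3400.0 kWp here).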
utils/compute_icon_mean_stddev.py ADDED
@@ -0,0 +1,72 @@
+"""Script to compute normalisation constants from NWP data."""
+
+import argparse
+import glob
+import logging
+
+import numpy as np
+import xarray as xr
+
+from ocf_data_sampler.load.nwp.providers.icon import open_icon_eu
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Add argument parser
+parser = argparse.ArgumentParser(description="Compute normalization constants from NWP data")
+parser.add_argument("--data-path", type=str, required=True,
+                    help='Path pattern to zarr files (e.g., "/path/to/data/*.zarr.zip")')
+parser.add_argument("--n-samples", type=int, default=2000,
+                    help="Number of random samples to use (default: 2000)")
+
+args = parser.parse_args()
+
+zarr_files = glob.glob(args.data_path)
+n_samples = args.n_samples
+
+ds = open_icon_eu(zarr_files)
+
+n_init_times = ds.sizes["init_time_utc"]
+n_lats = ds.sizes["latitude"]
+n_longs = ds.sizes["longitude"]
+n_steps = ds.sizes["step"]
+
+random_init_times = np.random.choice(n_init_times, size=n_samples, replace=True)
+random_lats = np.random.choice(n_lats, size=n_samples, replace=True)
+random_longs = np.random.choice(n_longs, size=n_samples, replace=True)
+random_steps = np.random.choice(n_steps, size=n_samples, replace=True)
+
+samples = []
+for i in range(n_samples):
+    sample = ds.isel(init_time_utc=random_init_times[i],
+                     latitude=random_lats[i],
+                     longitude=random_longs[i],
+                     step=random_steps[i])
+    samples.append(sample)
+
+samples_stack = xr.concat(samples, dim="samples")
+
+
+available_channels = samples_stack.channel.values.tolist()
+logger.info("Available channels: %s", available_channels)
+
+ICON_EU_MEAN = {}
+ICON_EU_STD = {}
+
+for var in available_channels:
+    if var not in available_channels:
+        logger.warning("Variable '%s' not found in the channel coordinate; skipping.", var)
+        continue
+    var_data = samples_stack.sel(channel=var)
+    var_mean = float(var_data.mean().compute())
+    var_std = float(var_data.std().compute())
+
+    ICON_EU_MEAN[var] = var_mean
+    ICON_EU_STD[var] = var_std
+
+    logger.info("Processed %s: mean=%.4f, std=%.4f", var, var_mean, var_std)
+
+logger.info("\nMean values:\n%s", ICON_EU_MEAN)
+logger.info("\nStandard deviations:\n%s", ICON_EU_STD)
+
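Everything in this script runs at import time, so it is meant to be invoked directly; a hypothetical run (the glob pattern just follows the `--data-path` help text):

python utils/compute_icon_mean_stddev.py --data-path "/path/to/data/*.zarr.zip" --n-samples 2000

One quirk worth noting: the `if var not in available_channels` guard can never fire, since the loop iterates over `available_channels` itself.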
ocf_data_sampler/constants.py DELETED
@@ -1,222 +0,0 @@
-import xarray as xr
-import numpy as np
-
-
-NWP_PROVIDERS = [
-    "ukv",
-    "ecmwf",
-    "gfs"
-]
-# TODO add ICON
-
-
-def _to_data_array(d):
-    return xr.DataArray(
-        [d[k] for k in d.keys()],
-        coords={"channel": [k for k in d.keys()]},
-    ).astype(np.float32)
-
-
-class NWPStatDict(dict):
-    """Custom dictionary class to hold NWP normalization stats"""
-
-    def __getitem__(self, key):
-        if key not in NWP_PROVIDERS:
-            raise KeyError(f"{key} is not a supported NWP provider - {NWP_PROVIDERS}")
-        elif key in self.keys():
-            return super().__getitem__(key)
-        else:
-            raise KeyError(
-                f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
-            )
-
-
-# ------ UKV
-# Means and std computed WITH version_7 and higher, MetOffice values
-UKV_STD = {
-    "cdcb": 2126.99350113,
-    "lcc": 39.33210726,
-    "mcc": 41.91144559,
-    "hcc": 38.07184418,
-    "sde": 0.1029753,
-    "hcct": 18382.63958991,
-    "dswrf": 190.47216887,
-    "dlwrf": 39.45988077,
-    "h": 1075.77812282,
-    "t": 4.38818501,
-    "r": 11.45012499,
-    "dpt": 4.57250482,
-    "vis": 21578.97975625,
-    "si10": 3.94718813,
-    "wdir10": 94.08407495,
-    "prmsl": 1252.71790539,
-    "prate": 0.00021497,
-}
-
-UKV_MEAN = {
-    "cdcb": 1412.26599062,
-    "lcc": 50.08362643,
-    "mcc": 40.88984494,
-    "hcc": 29.11949682,
-    "sde": 0.00289545,
-    "hcct": -18345.97478167,
-    "dswrf": 111.28265039,
-    "dlwrf": 325.03130139,
-    "h": 2096.51991356,
-    "t": 283.64913206,
-    "r": 81.79229501,
-    "dpt": 280.54379901,
-    "vis": 32262.03285118,
-    "si10": 6.88348448,
-    "wdir10": 199.41891636,
-    "prmsl": 101321.61574029,
-    "prate": 3.45793433e-05,
-}
-
-UKV_STD = _to_data_array(UKV_STD)
-UKV_MEAN = _to_data_array(UKV_MEAN)
-
-# ------ ECMWF
-# These were calculated from 100 random init times of UK data from 2020-2023
-ECMWF_STD = {
-    "dlwrf": 15855867.0,
-    "dswrf": 13025427.0,
-    "duvrs": 1445635.25,
-    "hcc": 0.42244860529899597,
-    "lcc": 0.3791404366493225,
-    "mcc": 0.38039860129356384,
-    "prate": 9.81039775069803e-05,
-    "sd": 0.000913831521756947,
-    "sr": 16294988.0,
-    "t2m": 3.692270040512085,
-    "tcc": 0.37487083673477173,
-    "u10": 5.531515598297119,
-    "u100": 7.2320556640625,
-    "u200": 8.049470901489258,
-    "v10": 5.411230564117432,
-    "v100": 6.944501876831055,
-    "v200": 7.561611652374268,
-    "diff_dlwrf": 131942.03125,
-    "diff_dswrf": 715366.3125,
-    "diff_duvrs": 81605.25,
-    "diff_sr": 818950.6875,
-}
-
-ECMWF_MEAN = {
-    "dlwrf": 27187026.0,
-    "dswrf": 11458988.0,
-    "duvrs": 1305651.25,
-    "hcc": 0.3961029052734375,
-    "lcc": 0.44901806116104126,
-    "mcc": 0.3288780450820923,
-    "prate": 3.108070450252853e-05,
-    "sd": 8.107526082312688e-05,
-    "sr": 12905302.0,
-    "t2m": 283.48333740234375,
-    "tcc": 0.7049227356910706,
-    "u10": 1.7677178382873535,
-    "u100": 2.393547296524048,
-    "u200": 2.7963004112243652,
-    "v10": 0.985887885093689,
-    "v100": 1.4244288206100464,
-    "v200": 1.6010299921035767,
-    "diff_dlwrf": 1136464.0,
-    "diff_dswrf": 420584.6875,
-    "diff_duvrs": 48265.4765625,
-    "diff_sr": 469169.5,
-}
-
-ECMWF_STD = _to_data_array(ECMWF_STD)
-ECMWF_MEAN = _to_data_array(ECMWF_MEAN)
-
-# ------ GFS
-GFS_STD = {
-    "dlwrf": 96.305916,
-    "dswrf": 246.18533,
-    "hcc": 42.525383,
-    "lcc": 44.3732,
-    "mcc": 43.150745,
-    "prate": 0.00010159573,
-    "r": 25.440672,
-    "sde": 0.43345627,
-    "t": 22.825893,
-    "tcc": 41.030598,
-    "u10": 5.470838,
-    "u100": 6.8899174,
-    "v10": 4.7401133,
-    "v100": 6.076132,
-    "vis": 8294.022,
-    "u": 10.614556,
-    "v": 7.176398,
-}
-
-GFS_MEAN = {
-    "dlwrf": 298.342,
-    "dswrf": 168.12321,
-    "hcc": 35.272,
-    "lcc": 43.578342,
-    "mcc": 33.738823,
-    "prate": 2.8190969e-05,
-    "r": 18.359747,
-    "sde": 0.36937004,
-    "t": 278.5223,
-    "tcc": 66.841606,
-    "u10": -0.0022310058,
-    "u100": 0.0823025,
-    "v10": 0.06219831,
-    "v100": 0.0797807,
-    "vis": 19628.32,
-    "u": 11.645444,
-    "v": 0.12330122,
-}
-
-GFS_STD = _to_data_array(GFS_STD)
-GFS_MEAN = _to_data_array(GFS_MEAN)
-
-
-NWP_STDS = NWPStatDict(
-    ukv=UKV_STD,
-    ecmwf=ECMWF_STD,
-    gfs=GFS_STD
-)
-NWP_MEANS = NWPStatDict(
-    ukv=UKV_MEAN,
-    ecmwf=ECMWF_MEAN,
-    gfs=GFS_MEAN
-)
-
-# ------ Satellite
-# RSS Mean and std values from randomised 20% of 2020 imagery
-
-RSS_STD = {
-    "HRV": 0.11405209,
-    "IR_016": 0.21462157,
-    "IR_039": 0.04618041,
-    "IR_087": 0.06687243,
-    "IR_097": 0.0468558,
-    "IR_108": 0.17482725,
-    "IR_120": 0.06115861,
-    "IR_134": 0.04492306,
-    "VIS006": 0.12184761,
-    "VIS008": 0.13090034,
-    "WV_062": 0.16111417,
-    "WV_073": 0.12924142,
-}
-
-RSS_MEAN = {
-    "HRV": 0.09298719,
-    "IR_016": 0.17594202,
-    "IR_039": 0.86167645,
-    "IR_087": 0.7719318,
-    "IR_097": 0.8014212,
-    "IR_108": 0.71254843,
-    "IR_120": 0.89058584,
-    "IR_134": 0.944365,
-    "VIS006": 0.09633306,
-    "VIS008": 0.11426069,
-    "WV_062": 0.7359355,
-    "WV_073": 0.62479186,
-}
-
-RSS_STD = _to_data_array(RSS_STD)
-RSS_MEAN = _to_data_array(RSS_MEAN)
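With `constants.py` gone, downstream code can no longer import these normalisation tables from the package. For reference, a minimal sketch of the per-channel standardisation they support under 0.1.11; `nwp_da` is a stand-in array and its channel subset is arbitrary:

import numpy as np
import xarray as xr

# Works on 0.1.11; this module no longer exists in 0.1.17.
from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS

# Stand-in UKV-like data with a "channel" coordinate matching the table keys.
nwp_da = xr.DataArray(
    np.random.rand(3, 4).astype(np.float32),
    dims=("channel", "step"),
    coords={"channel": ["t", "dswrf", "lcc"]},
)

# xarray aligns the subtraction/division on the shared "channel" labels.
normed = (nwp_da - NWP_MEANS["ukv"]) / NWP_STDS["ukv"]

# NWP_MEANS["icon"] would raise KeyError: "icon" was not in NWP_PROVIDERS.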