PyPI - ocf-data-sampler - Versions diffs - 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl - Mend

ocf-data-sampler 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (10) hide show

ocf_data_sampler/constants.py CHANGED Viewed

@@ -28,6 +28,7 @@ class NWPStatDict(dict):
                 f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
             )
 # ------ UKV
 # Means and std computed WITH version_7 and higher, MetOffice values
 UKV_STD = {
@@ -49,6 +50,7 @@ UKV_STD = {
     "prmsl": 1252.71790539,
     "prate": 0.00021497,
 }
 UKV_MEAN = {
     "cdcb": 1412.26599062,
     "lcc": 50.08362643,
@@ -97,6 +99,7 @@ ECMWF_STD = {
     "diff_duvrs": 81605.25,
     "diff_sr": 818950.6875,
 }
 ECMWF_MEAN = {
     "dlwrf": 27187026.0,
     "dswrf": 11458988.0,
@@ -133,3 +136,38 @@ NWP_MEANS = NWPStatDict(
     ecmwf=ECMWF_MEAN,
 )
+# ------ Satellite
+# RSS Mean and std values from randomised 20% of 2020 imagery
+RSS_STD = {
+    "HRV": 0.11405209,
+    "IR_016": 0.21462157,
+    "IR_039": 0.04618041,
+    "IR_087": 0.06687243,
+    "IR_097": 0.0468558,
+    "IR_108": 0.17482725,
+    "IR_120": 0.06115861,
+    "IR_134": 0.04492306,
+    "VIS006": 0.12184761,
+    "VIS008": 0.13090034,
+    "WV_062": 0.16111417,
+    "WV_073": 0.12924142,
+}
+RSS_MEAN = {
+    "HRV": 0.09298719,
+    "IR_016": 0.17594202,
+    "IR_039": 0.86167645,
+    "IR_087": 0.7719318,
+    "IR_097": 0.8014212,
+    "IR_108": 0.71254843,
+    "IR_120": 0.89058584,
+    "IR_134": 0.944365,
+    "VIS006": 0.09633306,
+    "VIS008": 0.11426069,
+    "WV_062": 0.7359355,
+    "WV_073": 0.62479186,
+}
+RSS_STD = _to_data_array(RSS_STD)
+RSS_MEAN = _to_data_array(RSS_MEAN)

ocf_data_sampler/numpy_batch/nwp.py CHANGED Viewed

@@ -1,5 +1,4 @@
 """Convert NWP to NumpyBatch"""
 import pandas as pd
 import xarray as xr

ocf_data_sampler/numpy_batch/satellite.py CHANGED Viewed

@@ -13,6 +13,7 @@ class SatelliteBatchKey:
 def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> dict:
     """Convert from Xarray to NumpyBatch"""
     example = {
         SatelliteBatchKey.satellite_actual: da.values,
         SatelliteBatchKey.time_utc: da.time_utc.values.astype(float),
@@ -27,4 +28,4 @@ def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None
     if t0_idx is not None:
         example[SatelliteBatchKey.t0_idx] = t0_idx
-    return example
+    return example

ocf_data_sampler/torch_datasets/process_and_combine.py CHANGED Viewed

@@ -4,7 +4,7 @@ import xarray as xr
 from typing import Tuple
 from ocf_data_sampler.config import Configuration
-from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS
+from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS, RSS_MEAN, RSS_STD
 from ocf_data_sampler.numpy_batch import (
     convert_nwp_to_numpy_batch,
     convert_satellite_to_numpy_batch,
@@ -25,8 +25,8 @@ def process_and_combine_datasets(
     location: Location,
     target_key: str = 'gsp'
 ) -> dict:
-    """Normalize and convert data to numpy arrays"""
+    """Normalise and convert data to numpy arrays"""
     numpy_modalities = []
     if "nwp" in dataset_dict:
@@ -37,19 +37,23 @@ def process_and_combine_datasets(
             # Standardise
             provider = config.input_data.nwp[nwp_key].provider
             da_nwp = (da_nwp - NWP_MEANS[provider]) / NWP_STDS[provider]
             # Convert to NumpyBatch
             nwp_numpy_modalities[nwp_key] = convert_nwp_to_numpy_batch(da_nwp)
         # Combine the NWPs into NumpyBatch
         numpy_modalities.append({NWPBatchKey.nwp: nwp_numpy_modalities})
     if "sat" in dataset_dict:
-        # Satellite is already in the range [0-1] so no need to standardise
+        # Standardise
         da_sat = dataset_dict["sat"]
+        da_sat = (da_sat - RSS_MEAN) / RSS_STD
         # Convert to NumpyBatch
         numpy_modalities.append(convert_satellite_to_numpy_batch(da_sat))
     gsp_config = config.input_data.gsp
     if "gsp" in dataset_dict:
@@ -93,6 +97,7 @@ def process_and_combine_datasets(
     return combined_sample
 def process_and_combine_site_sample_dict(
     dataset_dict: dict,
     config: Configuration,
@@ -119,8 +124,9 @@ def process_and_combine_site_sample_dict(
             data_arrays.append((f"nwp-{provider}", da_nwp))
     if "sat" in dataset_dict:
-        # TODO add some satellite normalisation
+        # Standardise
         da_sat = dataset_dict["sat"]
+        da_sat = (da_sat - RSS_MEAN) / RSS_STD
         data_arrays.append(("satellite", da_sat))
     if "site" in dataset_dict:
@@ -143,6 +149,7 @@ def merge_dicts(list_of_dicts: list[dict]) -> dict:
         combined_dict.update(d)
     return combined_dict
 def merge_arrays(normalised_data_arrays: list[Tuple[str, xr.DataArray]]) -> xr.Dataset:
     """
     Combine a list of DataArrays into a single Dataset with unique naming conventions.

{ocf_data_sampler-0.0.37.dist-info → ocf_data_sampler-0.0.38.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ocf_data_sampler
-Version: 0.0.37
+Version: 0.0.38
 Summary: Sample from weather data for renewable energy prediction
 Author: James Fulton, Peter Dudfield, and the Open Climate Fix team
 Author-email: info@openclimatefix.org

{ocf_data_sampler-0.0.37.dist-info → ocf_data_sampler-0.0.38.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-ocf_data_sampler/constants.py,sha256=tUwHrsGShqIn5Izze4i32_xB6X0v67rvQwIYB-P5PJQ,3355
+ocf_data_sampler/constants.py,sha256=G2VfkE_-veq_0hNBQQOQCtCsfC37O5-QG9mJWEmln5s,4153
 ocf_data_sampler/utils.py,sha256=rKA0BHAyAG4f90zEcgxp25EEYrXS-aOVNzttZ6Mzv2k,250
 ocf_data_sampler/config/__init__.py,sha256=YXnAkgHViHB26hSsjiv32b6EbpG-A1kKTkARJf0_RkY,212
 ocf_data_sampler/config/load.py,sha256=4f7vPHAIAmd-55tPxoIzn7F_TI_ue4NxkDcLPoVWl0g,943
@@ -21,8 +21,8 @@ ocf_data_sampler/load/nwp/providers/utils.py,sha256=Sy2exG1wpXLLhMXYdsfR-DZMR3tx
 ocf_data_sampler/numpy_batch/__init__.py,sha256=8MgRF29rK9bKP4b4iHakaoGwBKUcjWZ-VFKjCcq53QA,336
 ocf_data_sampler/numpy_batch/collate.py,sha256=KyWdDi8AXD5YiokXXiqr2_X1SC1me1GrhnQMelg0Qx8,2202
 ocf_data_sampler/numpy_batch/gsp.py,sha256=QjQ25JmtufvdiSsxUkBTPhxouYGWPnnWze8pXr_aBno,960
-ocf_data_sampler/numpy_batch/nwp.py,sha256=dAehfRo5DL2Yb20ifHHl5cU1QOrm3ZOpQmN39fSUOw8,1255
-ocf_data_sampler/numpy_batch/satellite.py,sha256=3NoE_ElzMHwO60apqJeFAwI6J7eIxD0OWTyAVl-uJi8,903
+ocf_data_sampler/numpy_batch/nwp.py,sha256=bEvBB9xGf7B8okPBZ-eZLK4PBWA0nvmmEFiN49dgqPU,1254
+ocf_data_sampler/numpy_batch/satellite.py,sha256=VKo8eiSIcYhAdHHBUH697HMz7rBv6S9XZ6_XCZ-qG4Y,905
 ocf_data_sampler/numpy_batch/site.py,sha256=CWI0efUl8SrnGm0VNGdGwAqrmlT1XaVbJIUE2hSOz9E,744
 ocf_data_sampler/numpy_batch/sun_position.py,sha256=zw2bjtcjsm_tvKk0r_MZmgfYUJLHuLjLly2sMjwP3XI,1606
 ocf_data_sampler/select/__init__.py,sha256=E4AJulEbO2K-o0UlG1fgaEteuf_1ZFjHTvrotXSb4YU,332
@@ -36,7 +36,7 @@ ocf_data_sampler/select/select_time_slice.py,sha256=D5P_cSvnv8Qs49K5au7lPxDr9U_V
 ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=3tRrMBXr7s4CnClbVSIq7hpls3H4Y3qYTDwswcxCCCE,1763
 ocf_data_sampler/select/time_slice_for_dataset.py,sha256=LMw8KnOCKnPjD0m4UubAWERpaiQtzRKkI2cSh5a0A-M,4335
 ocf_data_sampler/torch_datasets/__init__.py,sha256=nJUa2KzVa84ZoM0PT2AbDz26ennmAYc7M7WJVfypPMs,85
-ocf_data_sampler/torch_datasets/process_and_combine.py,sha256=WwwuzxXoq8S70R-tWABXUMO854TG8GWYnNhb1IU8MRY,7526
+ocf_data_sampler/torch_datasets/process_and_combine.py,sha256=ImfU4I75x7A57KCShWj6dr62tNtJqJ0ImKRiT0hijIQ,7564
 ocf_data_sampler/torch_datasets/pvnet_uk_regional.py,sha256=QRFqbdfNchVWj4y70n-rJdFvFGvQj-WpZLdFqWjnOTw,5543
 ocf_data_sampler/torch_datasets/site.py,sha256=NYuhgm9ti9SRt1dcb_WrFYYo14NgVdOsaoPbc5FsnaA,6560
 ocf_data_sampler/torch_datasets/valid_time_periods.py,sha256=Qo65qUHtle_bW5tLTYr7empHTRv-lpjvfx_6GNJj3Xg,4371
@@ -59,10 +59,11 @@ tests/select/test_find_contiguous_time_periods.py,sha256=kOga_V7er5We7ewMARXaKdM
 tests/select/test_location.py,sha256=_WZk2FPYeJ-nIfCJS6Sp_yaVEEo7m31DmMFoZzgyCts,2712
 tests/select/test_select_spatial_slice.py,sha256=7EX9b6g-pMdACQx3yefjs5do2s-Rho2UmKevV4oglsU,5147
 tests/select/test_select_time_slice.py,sha256=K1EJR5TwZa9dJf_YTEHxGtvs398iy1xS2lr1BgJZkoo,9603
+tests/torch_datasets/test_process_and_combine.py,sha256=SWmrI59JVfMnHK78N5yhKzQR8b5kJ8TeMZke9Mlnc-o,5717
 tests/torch_datasets/test_pvnet_uk_regional.py,sha256=eqy0nQOWoHnqltlJlGmRlgIiIzPEwOC6o5A6GARryKA,2118
 tests/torch_datasets/test_site.py,sha256=YuVjWTI14_kmEOx23XE5J_RZ8UalCKD2xRv6mqYizB8,2872
-ocf_data_sampler-0.0.37.dist-info/LICENSE,sha256=F-Q3UFCR-BECSocV55BFDpn4YKxve9PKrm-lTt6o_Tg,1073
-ocf_data_sampler-0.0.37.dist-info/METADATA,sha256=tKixIA37U0AA76QsYmCIfLzpzE2aSGRmquSx69jX4aY,10290
-ocf_data_sampler-0.0.37.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-ocf_data_sampler-0.0.37.dist-info/top_level.txt,sha256=Faob6N6cFdPc5eUpCTYcXgCaNhi4XLLteUL5W5ayYmg,31
-ocf_data_sampler-0.0.37.dist-info/RECORD,,
+ocf_data_sampler-0.0.38.dist-info/LICENSE,sha256=F-Q3UFCR-BECSocV55BFDpn4YKxve9PKrm-lTt6o_Tg,1073
+ocf_data_sampler-0.0.38.dist-info/METADATA,sha256=YbU2ymHq94ZLsyjlD1ZdKoYpVVDzUUmyWN7xRDBvQDM,10290
+ocf_data_sampler-0.0.38.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ocf_data_sampler-0.0.38.dist-info/top_level.txt,sha256=Faob6N6cFdPc5eUpCTYcXgCaNhi4XLLteUL5W5ayYmg,31
+ocf_data_sampler-0.0.38.dist-info/RECORD,,

tests/torch_datasets/test_process_and_combine.py ADDED Viewed

@@ -0,0 +1,165 @@
+import pytest
+import tempfile
+import numpy as np
+import pandas as pd
+import xarray as xr
+import dask.array as da
+from ocf_data_sampler.config import load_yaml_configuration, save_yaml_configuration
+from ocf_data_sampler.config import Configuration
+from ocf_data_sampler.select.location import Location
+from ocf_data_sampler.numpy_batch import NWPBatchKey, GSPBatchKey, SatelliteBatchKey
+from ocf_data_sampler.torch_datasets import PVNetUKRegionalDataset
+from ocf_data_sampler.torch_datasets.process_and_combine import (
+    process_and_combine_datasets,
+    process_and_combine_site_sample_dict,
+    merge_dicts,
+    fill_nans_in_arrays,
+    compute,
+)
+def test_process_and_combine_datasets(pvnet_config_filename):
+    # Load in config for function and define location
+    config = load_yaml_configuration(pvnet_config_filename)
+    t0 = pd.Timestamp("2024-01-01 00:00")
+    location = Location(coordinate_system="osgb", x=1234, y=5678, id=1)
+    nwp_data = xr.DataArray(
+        np.random.rand(4, 2, 2, 2),
+        dims=["time_utc", "channel", "y", "x"],
+        coords={
+            "time_utc": pd.date_range("2024-01-01 00:00", periods=4, freq="h"),
+            "channel": ["t2m", "dswrf"],
+            "step": ("time_utc", pd.timedelta_range(start='0h', periods=4, freq='h')),
+            "init_time_utc": pd.Timestamp("2024-01-01 00:00")
+        }
+    )
+    sat_data = xr.DataArray(
+        np.random.rand(7, 1, 2, 2),
+        dims=["time_utc", "channel", "y", "x"],
+        coords={
+            "time_utc": pd.date_range("2024-01-01 00:00", periods=7, freq="5min"),
+            "channel": ["HRV"],
+            "x_geostationary": (["y", "x"], np.array([[1, 2], [1, 2]])),
+            "y_geostationary": (["y", "x"], np.array([[1, 1], [2, 2]]))
+        }
+    )
+    # Combine as dict
+    dataset_dict = {
+        "nwp": {"ukv": nwp_data},
+        "sat": sat_data
+    }
+    # Call relevant function
+    result = process_and_combine_datasets(dataset_dict, config, t0, location)
+    # Assert result is dict - check and validate
+    assert isinstance(result, dict)
+    assert NWPBatchKey.nwp in result
+    assert result[SatelliteBatchKey.satellite_actual].shape == (7, 1, 2, 2)
+    assert result[NWPBatchKey.nwp]["ukv"][NWPBatchKey.nwp].shape == (4, 1, 2, 2)
+def test_merge_dicts():
+    """Test merge_dicts function"""
+    dict1 = {"a": 1, "b": 2}
+    dict2 = {"c": 3, "d": 4}
+    dict3 = {"e": 5}
+    result = merge_dicts([dict1, dict2, dict3])
+    assert result == {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
+    # Test key overwriting
+    dict4 = {"a": 10, "f": 6}
+    result = merge_dicts([dict1, dict4])
+    assert result["a"] == 10
+def test_fill_nans_in_arrays():
+    """Test the fill_nans_in_arrays function"""
+    array_with_nans = np.array([1.0, np.nan, 3.0, np.nan])
+    nested_dict = {
+        "array1": array_with_nans,
+        "nested": {
+            "array2": np.array([np.nan, 2.0, np.nan, 4.0])
+        },
+        "string_key": "not_an_array"
+    }
+    result = fill_nans_in_arrays(nested_dict)
+    assert not np.isnan(result["array1"]).any()
+    assert np.array_equal(result["array1"], np.array([1.0, 0.0, 3.0, 0.0]))
+    assert not np.isnan(result["nested"]["array2"]).any()
+    assert np.array_equal(result["nested"]["array2"], np.array([0.0, 2.0, 0.0, 4.0]))
+    assert result["string_key"] == "not_an_array"
+def test_compute():
+    """Test compute function with dask array"""
+    da_dask = xr.DataArray(da.random.random((5, 5)))
+    # Create a nested dictionary with dask array
+    nested_dict = {
+        "array1": da_dask,
+        "nested": {
+            "array2": da_dask
+        }
+    }
+    # Ensure initial data is lazy - i.e. not yet computed
+    assert not isinstance(nested_dict["array1"].data, np.ndarray)
+    assert not isinstance(nested_dict["nested"]["array2"].data, np.ndarray)
+    # Call the compute function
+    result = compute(nested_dict)
+    # Assert that the result is an xarray DataArray and no longer lazy
+    assert isinstance(result["array1"], xr.DataArray)
+    assert isinstance(result["nested"]["array2"], xr.DataArray)
+    assert isinstance(result["array1"].data, np.ndarray)
+    assert isinstance(result["nested"]["array2"].data, np.ndarray)
+    # Ensure there no NaN values in computed data
+    assert not np.isnan(result["array1"].data).any()
+    assert not np.isnan(result["nested"]["array2"].data).any()
+def test_process_and_combine_site_sample_dict(pvnet_config_filename):
+    # Load config
+    config = load_yaml_configuration(pvnet_config_filename)
+    # Specify minimal structure for testing
+    raw_nwp_values = np.random.rand(4, 1, 2, 2)  # Single channel
+    site_dict = {
+        "nwp": {
+            "ukv": xr.DataArray(
+                raw_nwp_values,
+                dims=["time_utc", "channel", "y", "x"],
+                coords={
+                    "time_utc": pd.date_range("2024-01-01 00:00", periods=4, freq="h"),
+                    "channel": ["dswrf"],  # Single channel
+                },
+            )
+        }
+    }
+    print(f"Input site_dict: {site_dict}")
+    # Call function
+    result = process_and_combine_site_sample_dict(site_dict, config)
+    # Assert to validate output structure
+    assert isinstance(result, xr.Dataset), "Result should be an xarray.Dataset"
+    assert len(result.data_vars) > 0, "Dataset should contain data variables"
+    # Validate variable via assertion and shape of such
+    expected_variable = "nwp-ukv"
+    assert expected_variable in result.data_vars, f"Expected variable '{expected_variable}' not found"
+    nwp_result = result[expected_variable]
+    assert nwp_result.shape == (4, 1, 2, 2), f"Unexpected shape for '{expected_variable}': {nwp_result.shape}"

{ocf_data_sampler-0.0.37.dist-info → ocf_data_sampler-0.0.38.dist-info}/LICENSE RENAMED Viewed

File without changes

{ocf_data_sampler-0.0.37.dist-info → ocf_data_sampler-0.0.38.dist-info}/WHEEL RENAMED Viewed

File without changes

{ocf_data_sampler-0.0.37.dist-info → ocf_data_sampler-0.0.38.dist-info}/top_level.txt RENAMED Viewed

File without changes

ocf-data-sampler 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl

Potentially problematic release.

ocf-data-sampler 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl