ocf-data-sampler 0.5.15__py3-none-any.whl → 0.5.16__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

ocf_data_sampler/numpy_sample/nwp.py
@@ -28,7 +28,7 @@ def convert_nwp_to_numpy_sample(da: xr.DataArray, t0_idx: int | None = None) ->
         NWPSampleKey.channel_names: da.channel.values,
         NWPSampleKey.init_time_utc: da.init_time_utc.values.astype(float),
         NWPSampleKey.step: (da.step.values / 3600).astype(int),
-        NWPSampleKey.target_time_utc: da.target_time_utc.values.astype(float),
+        NWPSampleKey.target_time_utc: (da.init_time_utc.values + da.step.values).astype(float),
     }
 
     if t0_idx is not None:
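This change removes the dependence on a target_time_utc coordinate: after the rewrite of select_time_slice_nwp below, the slice keeps its native init_time_utc and step coordinates, and the target times are recovered as their elementwise sum. A minimal sketch of that datetime arithmetic, using made-up times:

    import numpy as np

    # Made-up values: one init time per selected step, as produced by the
    # vectorised selection in select_time_slice.py below.
    init_times = np.array(["2024-06-01T00:00", "2024-06-01T06:00"], dtype="datetime64[s]")
    steps = np.array([3600, 7200], dtype="timedelta64[s]")

    # Elementwise addition gives one target time per step
    target_times = init_times + steps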
ocf_data_sampler/select/diff_channels.py (new file)
@@ -0,0 +1,25 @@
+"""Takes the diff along the step axis for a given set of channels."""
+
+import numpy as np
+import xarray as xr
+
+
+def diff_channels(da: xr.DataArray, accum_channels: list[str]) -> xr.DataArray:
+    """Perform in-place diff of the given channels of the DataArray in the steps dimension.
+
+    Args:
+        da: The DataArray to slice from
+        accum_channels: Channels which are accumulated and need to be differenced
+    """
+    if da.dims[:2] != ("step", "channel"):
+        raise ValueError("This function assumes the first two dimensions are step then channel")
+
+    all_channels = da.channel.values
+    accum_channel_inds = [i for i, c in enumerate(all_channels) if c in accum_channels]
+
+    # Make a copy of the values to avoid changing the underlying numpy array
+    vals = da.values.copy()
+    vals[:-1, accum_channel_inds] = np.diff(vals[:, accum_channel_inds], axis=0)
+    da.values = vals
+
+    return da.isel(step=slice(0, -1))
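diff_channels converts channels that accumulate over the forecast (for example, precipitation or radiation totals) into per-step increments, dropping the final step. A short usage sketch; the channel names here are hypothetical:

    import numpy as np
    import pandas as pd
    import xarray as xr

    from ocf_data_sampler.select.diff_channels import diff_channels

    # "dswrf" accumulates over steps; "t2m" is instantaneous (hypothetical names)
    da = xr.DataArray(
        np.array([[0.0, 10.0], [3.0, 11.0], [7.0, 12.0]]),
        dims=("step", "channel"),
        coords={"step": pd.to_timedelta([0, 1, 2], unit="h"), "channel": ["dswrf", "t2m"]},
    )

    da_diff = diff_channels(da, accum_channels=["dswrf"])
    # "dswrf" is now the per-step increase [3.0, 4.0]; "t2m" keeps [10.0, 11.0];
    # the final step has been dropped, so only two steps remain.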
ocf_data_sampler/select/select_time_slice.py
@@ -38,7 +38,6 @@ def select_time_slice_nwp(
     time_resolution: pd.Timedelta,
     dropout_timedeltas: list[pd.Timedelta] | None = None,
     dropout_frac: float | None = 0,
-    accum_channels: list[str] | None = None,
 ) -> xr.DataArray:
     """Select a time slice from an NWP DataArray.
 
@@ -50,11 +49,8 @@ def select_time_slice_nwp(
         time_resolution: Distance between neighbouring timestamps
         dropout_timedeltas: List of possible timedeltas before t0 where data availability may start
         dropout_frac: Probability to apply dropout
-        accum_channels: Channels which are accumulated and need to be differenced
     """
-    if accum_channels is None:
-        accum_channels = []
-
+    # Input checking
     if dropout_timedeltas is None:
         dropout_timedeltas = []
 
@@ -69,75 +65,43 @@ def select_time_slice_nwp(
 
     consider_dropout = len(dropout_timedeltas) > 0 and dropout_frac > 0
 
-    # The accumatated and non-accumulated channels
-    accum_channels = np.intersect1d(da.channel.values, accum_channels)
-    non_accum_channels = np.setdiff1d(da.channel.values, accum_channels)
-
     start_dt = (t0 + interval_start).ceil(time_resolution)
     end_dt = (t0 + interval_end).ceil(time_resolution)
     target_times = pd.date_range(start_dt, end_dt, freq=time_resolution)
 
     # Potentially apply NWP dropout
     if consider_dropout and (np.random.uniform() < dropout_frac):
-        dt = np.random.choice(dropout_timedeltas)
-        t0_available = t0 + dt
+        t0_available = t0 + np.random.choice(dropout_timedeltas)
     else:
         t0_available = t0
 
-    # Forecasts made up to and including t0
-    available_init_times = da.init_time_utc.sel(init_time_utc=slice(None, t0_available))
+    # Get the available and relevant init-times
+    t_min = target_times[0] - da.step.values[-1]
+    init_times = da.init_time_utc.values
+    available_init_times = init_times[(t_min<=init_times) & (init_times<=t0_available)]
 
-    # Find the most recent available init times for all target times
-    selected_init_times = available_init_times.sel(
-        init_time_utc=target_times,
-        method="ffill",  # forward fill from init times to target times
-    ).values
+    # Find the most recent available init-times for all target-times
+    selected_init_times = np.array(
+        [available_init_times[available_init_times<=t][-1] for t in target_times],
+    )
 
-    # Find the required steps for all target times
+    # Find the required steps for all target-times
     steps = target_times - selected_init_times
 
-    # We want one timestep for each target_time_hourly (obviously!) If we simply do
-    # nwp.sel(init_time=init_times, step=steps) then we'll get the *product* of
-    # init_times and steps, which is not what we want! Instead, we use xarray's
-    # vectorised-indexing mode via using a DataArray indexer. See the last example here:
-    # https://docs.xarray.dev/en/latest/user-guide/indexing.html#more-advanced-indexing
-
-    coords = {"target_time_utc": target_times}
-    init_time_indexer = xr.DataArray(selected_init_times, coords=coords)
-    step_indexer = xr.DataArray(steps, coords=coords)
+    # If we are only selecting from one init-time we can construct the slice so it's faster
+    if len(np.unique(selected_init_times))==1:
+        da_sel = da.sel(init_time_utc=selected_init_times[0], step=slice(steps[0], steps[-1]))
 
-    if len(accum_channels) == 0:
-        da_sel = da.sel(step=step_indexer, init_time_utc=init_time_indexer)
+    # If we are selecting from multiple init-times this is more complex and slower
     else:
-        # First minimise the size of the dataset we are diffing
-        # - find the init times we are slicing from
-        unique_init_times = np.unique(selected_init_times)
-        # - find the min and max steps we slice over. Max is extended due to diff
-        min_step = min(steps)
-        max_step = max(steps) + time_resolution
-
-        da_min = da.sel(init_time_utc=unique_init_times, step=slice(min_step, max_step))
-
-        # Slice out the data which does not need to be diffed
-        da_non_accum = da_min.sel(channel=non_accum_channels)
-        da_sel_non_accum = da_non_accum.sel(step=step_indexer, init_time_utc=init_time_indexer)
-
-        # Slice out the channels which need to be diffed
-        da_accum = da_min.sel(channel=accum_channels)
-
-        # Take the diff and slice requested data
-        da_accum = da_accum.diff(dim="step", label="lower")
-        da_sel_accum = da_accum.sel(step=step_indexer, init_time_utc=init_time_indexer)
-
-        # Join diffed and non-diffed variables
-        da_sel = xr.concat([da_sel_non_accum, da_sel_accum], dim="channel")
-
-        # Reorder the variable back to the original order
-        da_sel = da_sel.sel(channel=da.channel.values)
-
-        # Rename the diffed channels
-        da_sel["channel"] = [
-            f"diff_{v}" if v in accum_channels else v for v in da_sel.channel.values
-        ]
+        # We want one timestep for each target_time_hourly (obviously!) If we simply do
+        # nwp.sel(init_time=init_times, step=steps) then we'll get the *product* of
+        # init_times and steps, which is not what we want! Instead, we use xarray's
+        # vectorised-indexing mode via using a DataArray indexer. See the last example here:
+        # https://docs.xarray.dev/en/latest/user-guide/indexing.html#more-advanced-indexing
+        coords = {"step": steps}
+        init_time_indexer = xr.DataArray(selected_init_times, coords=coords)
+        step_indexer = xr.DataArray(steps, coords=coords)
+        da_sel = da.sel(init_time_utc=init_time_indexer, step=step_indexer)
 
     return da_sel
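The else-branch relies on xarray's pointwise ("vectorised") indexing: DataArray indexers that share a dimension select one value per (init_time, step) pair, rather than the product of the two label lists. A self-contained sketch with invented coordinates:

    import numpy as np
    import pandas as pd
    import xarray as xr

    # Invented toy forecast grid: 4 steps x 3 init-times
    da = xr.DataArray(
        np.arange(12).reshape(4, 3),
        dims=("step", "init_time_utc"),
        coords={
            "step": pd.to_timedelta([0, 1, 2, 3], unit="h"),
            "init_time_utc": pd.to_datetime(
                ["2024-06-01 00:00", "2024-06-01 06:00", "2024-06-01 12:00"]
            ),
        },
    )

    # One (init_time, step) pair per target time
    init_times = pd.to_datetime(["2024-06-01 06:00", "2024-06-01 12:00"])
    steps = pd.to_timedelta([2, 1], unit="h")

    coords = {"step": steps}
    init_time_indexer = xr.DataArray(init_times, coords=coords)
    step_indexer = xr.DataArray(steps, coords=coords)

    da_pointwise = da.sel(init_time_utc=init_time_indexer, step=step_indexer)
    da_product = da.sel(init_time_utc=init_times, step=steps)

    assert da_pointwise.shape == (2,)   # one value per pair
    assert da_product.shape == (2, 2)   # the unwanted product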
ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py
@@ -22,6 +22,7 @@ from ocf_data_sampler.select import Location, fill_time_periods
 from ocf_data_sampler.torch_datasets.utils import (
     add_alterate_coordinate_projections,
     config_normalization_values_to_dicts,
+    diff_nwp_data,
     fill_nans_in_arrays,
     find_valid_time_periods,
     merge_dicts,
@@ -259,7 +260,7 @@ class PVNetUKRegionalDataset(AbstractPVNetUKDataset):
         sample_dict = slice_datasets_by_space(self.datasets_dict, location, self.config)
         sample_dict = slice_datasets_by_time(sample_dict, t0, self.config)
         sample_dict = tensorstore_compute(sample_dict)
-
+        sample_dict = diff_nwp_data(sample_dict, self.config)
         return self.process_and_combine_datasets(sample_dict, t0, location)
 
     @override
@@ -318,6 +319,7 @@ class PVNetUKConcurrentDataset(AbstractPVNetUKDataset):
         # Slice by time then load to avoid loading the data multiple times from disk
         sample_dict = slice_datasets_by_time(self.datasets_dict, t0, self.config)
         sample_dict = tensorstore_compute(sample_dict)
+        sample_dict = diff_nwp_data(sample_dict, self.config)
 
         gsp_samples = []
 
ocf_data_sampler/torch_datasets/datasets/site.py
@@ -27,6 +27,7 @@ from ocf_data_sampler.select import (
 from ocf_data_sampler.torch_datasets.utils import (
     add_alterate_coordinate_projections,
     config_normalization_values_to_dicts,
+    diff_nwp_data,
     fill_nans_in_arrays,
     find_valid_time_periods,
     merge_dicts,
@@ -57,6 +58,7 @@ def get_locations(site_xr: xr.Dataset) -> list[Location]:
 
     return locations
 
+
 def process_and_combine_datasets(
     dataset_dict: dict,
     config: Configuration,
@@ -80,8 +82,6 @@ def process_and_combine_datasets(
 
     for nwp_key, da_nwp in dataset_dict["nwp"].items():
 
-        # Standardise and convert to NumpyBatch
-
         channel_means = means_dict["nwp"][nwp_key]
         channel_stds = stds_dict["nwp"][nwp_key]
 
@@ -276,8 +276,8 @@ class SitesDataset(Dataset):
         """
         sample_dict = slice_datasets_by_space(self.datasets_dict, location, self.config)
         sample_dict = slice_datasets_by_time(sample_dict, t0, self.config)
-
         sample_dict = tensorstore_compute(sample_dict)
+        sample_dict = diff_nwp_data(sample_dict, self.config)
 
         return process_and_combine_datasets(
             sample_dict,
@@ -414,6 +414,7 @@ class SitesDatasetConcurrent(Dataset):
         # slice by time first as we want to keep all site id info
         sample_dict = slice_datasets_by_time(self.datasets_dict, t0, self.config)
         sample_dict = tensorstore_compute(sample_dict)
+        sample_dict = diff_nwp_data(sample_dict, self.config)
 
         site_samples = []
 
ocf_data_sampler/torch_datasets/utils/__init__.py
@@ -3,4 +3,5 @@ from .merge_and_fill_utils import fill_nans_in_arrays, merge_dicts
 from .valid_time_periods import find_valid_time_periods
 from .spatial_slice_for_dataset import slice_datasets_by_space
 from .time_slice_for_dataset import slice_datasets_by_time
-from .add_alterate_coordinate_projections import add_alterate_coordinate_projections
+from .add_alterate_coordinate_projections import add_alterate_coordinate_projections
+from .diff_nwp_data import diff_nwp_data
ocf_data_sampler/torch_datasets/utils/diff_nwp_data.py (new file)
@@ -0,0 +1,20 @@
+"""Take the in-place diff of some channels of the NWP data."""
+
+from ocf_data_sampler.config import Configuration
+from ocf_data_sampler.select.diff_channels import diff_channels
+
+
+def diff_nwp_data(dataset_dict: dict, config: Configuration) -> dict:
+    """Take the in-place diff of some channels of the NWP data.
+
+    Args:
+        dataset_dict: Dictionary of xarray datasets
+        config: Configuration object
+    """
+    if "nwp" in dataset_dict:
+        for nwp_key, da_nwp in dataset_dict["nwp"].items():
+            accum_channels = config.input_data.nwp[nwp_key].accum_channels
+            if len(accum_channels)>0:
+                # diff_channels() is an in-place operation and modifies the input
+                dataset_dict["nwp"][nwp_key] = diff_channels(da_nwp, accum_channels)
+    return dataset_dict
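diff_nwp_data is the hook the datasets call right after tensorstore_compute (see the pvnet_uk.py and site.py changes above), so the differencing runs on arrays already loaded into memory. Because diff_channels mutates the values of its input, the returned array must replace the original, which is what the loop body does. A small runnable sketch with an assumed NWP key and channel names:

    import numpy as np
    import pandas as pd
    import xarray as xr

    from ocf_data_sampler.select.diff_channels import diff_channels

    # Assumed toy data: one NWP source keyed "ukv" with an accumulated
    # "dswrf" channel (both names are hypothetical).
    da_nwp = xr.DataArray(
        np.random.rand(3, 2),
        dims=("step", "channel"),
        coords={"step": pd.to_timedelta([0, 1, 2], unit="h"), "channel": ["dswrf", "t2m"]},
    )
    dataset_dict = {"nwp": {"ukv": da_nwp}}

    # As in diff_nwp_data above: re-assign the result, because diff_channels
    # mutates the values of the array passed in.
    dataset_dict["nwp"]["ukv"] = diff_channels(da_nwp, ["dswrf"])
    assert dataset_dict["nwp"]["ukv"].sizes["step"] == 2  # final step dropped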
ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py
@@ -28,15 +28,23 @@ def slice_datasets_by_time(
     for nwp_key, da_nwp in datasets_dict["nwp"].items():
         nwp_config = config.input_data.nwp[nwp_key]
 
+        # Add a buffer if we need to diff some of the channels in time
+        if len(nwp_config.accum_channels)>0:
+            interval_end_mins = (
+                nwp_config.interval_end_minutes
+                + nwp_config.time_resolution_minutes
+            )
+        else:
+            interval_end_mins = nwp_config.interval_end_minutes
+
         sliced_datasets_dict["nwp"][nwp_key] = select_time_slice_nwp(
             da_nwp,
             t0,
             time_resolution=minutes(nwp_config.time_resolution_minutes),
             interval_start=minutes(nwp_config.interval_start_minutes),
-            interval_end=minutes(nwp_config.interval_end_minutes),
+            interval_end=minutes(interval_end_mins),
             dropout_timedeltas=minutes(nwp_config.dropout_timedeltas_minutes),
             dropout_frac=nwp_config.dropout_fraction,
-            accum_channels=nwp_config.accum_channels,
         )
 
     if "sat" in datasets_dict:
ocf_data_sampler-0.5.16.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.15
+Version: 0.5.16
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
ocf_data_sampler-0.5.16.dist-info/RECORD
@@ -27,36 +27,38 @@ ocf_data_sampler/numpy_sample/collate.py,sha256=hoxIc5SoHoIs3Nx37aRZzWChpswjy9lH
 ocf_data_sampler/numpy_sample/common_types.py,sha256=9CjYHkUTx0ObduWh43fhsybZCTXvexql7qC2ptMDoek,377
 ocf_data_sampler/numpy_sample/datetime_features.py,sha256=ObHM42VnZB7_daQ5a42GeftoDWYtVMT-wDP8kRtY_84,857
 ocf_data_sampler/numpy_sample/gsp.py,sha256=sOWX1ubeQSrK6_0vdy_RKVUvqzohOc5pBu7W4Co7iN8,983
-ocf_data_sampler/numpy_sample/nwp.py,sha256=lXqE2Il0xX5hzz76HHkiYmfDsXWWhmaA_6bSnmwbAXU,1078
+ocf_data_sampler/numpy_sample/nwp.py,sha256=AabiasD6OZDdfkPtYWpehV9XpaRHOiEr5g1nSdZdDv8,1095
 ocf_data_sampler/numpy_sample/satellite.py,sha256=RaYzYIcB1AmDrKeiqSpn4QVfBH-QMe26F1P5t1az2Jg,1111
 ocf_data_sampler/numpy_sample/site.py,sha256=4S19bzCN5lswVUrmWRfwpVsBPUE7bi0OIdxsD9wgvhU,982
 ocf_data_sampler/numpy_sample/sun_position.py,sha256=5tt-zNm6aRuZMsxZPaAxyg7HeikswfZCeHWXTHuO2K0,1555
 ocf_data_sampler/select/__init__.py,sha256=mK7Wu_-j9IXGTYrOuDf5yDDuU5a306b0iGKTAooNg_s,210
+ocf_data_sampler/select/diff_channels.py,sha256=W66JcI2pSM-7DnB76_Ag6kUv3f7FqMS-vNkb2467WAk,938
 ocf_data_sampler/select/dropout.py,sha256=i5NDP6oQnZBkQRJW-aXVrPXawktVKQz5VMexe5Ww51g,2021
 ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
 ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=etkr6LuB7zxkfzWJ6SgHiULdRuFzFlq5bOUNd257Qx4,11545
 ocf_data_sampler/select/geospatial.py,sha256=rvMy_e--3tm-KAy9pU6b9-UMBQqH2sXykr3N_4SHYy4,6528
 ocf_data_sampler/select/location.py,sha256=Qp0di-Pgq8WLjN9IBcTVTaRM3lckhr4ZVzaDRcgVXHw,2352
 ocf_data_sampler/select/select_spatial_slice.py,sha256=Ym_YJjZqeMPC5Bw_xMi7Re2-uCbUagm2KXhnAnstTHo,7200
-ocf_data_sampler/select/select_time_slice.py,sha256=HeHbwZ0CP03x0-LaJtpbSdtpLufwVTR73p6wH6O_PS8,5513
+ocf_data_sampler/select/select_time_slice.py,sha256=cpkdovJMvcjxSGfq9G0OJK5aDAeCXg7exWYrJnR4N2w,4116
 ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=o0SsEXXZ6k9iL__5_RN1Sf60lw_eqK91P3UFEHAD2k0,102
-ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=wVx4QKHqak2FbxtryAxsVe6wpYM2n_YKgIKpiVs6gpE,12098
-ocf_data_sampler/torch_datasets/datasets/site.py,sha256=ivdSB_YpAqL8-Q1m_uKTGU2YlNQ1bZXdwialT_UpGuo,15590
+ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=wUsIZ0Fhq5bbE8v02C0UPcFWIhWI7kfSka9UrWP0_m4,12240
+ocf_data_sampler/torch_datasets/datasets/site.py,sha256=OXrYSRrWUdQbEjsEPPJjam10zJKU6S3r5kA07RbpzFU,15680
 ocf_data_sampler/torch_datasets/sample/__init__.py,sha256=GL84vdZl_SjHDGVyh9Uekx2XhPYuZ0dnO3l6f6KXnHI,100
 ocf_data_sampler/torch_datasets/sample/base.py,sha256=cQ1oIyhdmlotejZK8B3Cw6MNvpdnBPD8G_o2h7Ye4Vc,2206
 ocf_data_sampler/torch_datasets/sample/site.py,sha256=40NwNTqjL1WVhPdwe02zDHHfDLG2u_bvCfRCtGAtFc0,1466
 ocf_data_sampler/torch_datasets/sample/uk_regional.py,sha256=Xx5cBYUyaM6PGUWQ76MHT9hwj6IJ7WAOxbpmYFbJGhc,10483
-ocf_data_sampler/torch_datasets/utils/__init__.py,sha256=TNSYuSSmFgjsvvJxtoDrH645Z64CHsNUUQ0iayTccP4,416
+ocf_data_sampler/torch_datasets/utils/__init__.py,sha256=4l1VcEmxHInU9G66zrimNMa8WcyKUASQST_iF9QfxUw,457
 ocf_data_sampler/torch_datasets/utils/add_alterate_coordinate_projections.py,sha256=w6Q4TyxNyl7PKAbhqiXvqOpnqIjwmOUcGREIvPNGYlQ,2666
 ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py,sha256=SGt1H2nXcaj44ND14-gHzvA7dkLfgjTacCq7rOkRGwg,1991
+ocf_data_sampler/torch_datasets/utils/diff_nwp_data.py,sha256=o7NpKWxKHhwMbol3xBAF087-tDgDUZeP0j8vG08E7Nc,816
 ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
 ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
-ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=Q_-kCTtUieyEDpSElY1xwJct7Vsw0LAn5MbYSg2O6vg,3621
+ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=1r1J2KNSo1_imN9gpVf5AupJaZ7VSnSevS1o_wck440,3925
 ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=xcy75cVxl0WrglnX5YUAFjXXlO2GwEBHWyqo8TDuiOA,4714
 ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul3l0EP73Ik002fStr_bhsZh9mQqEU,4735
 scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
 scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
-ocf_data_sampler-0.5.15.dist-info/METADATA,sha256=AcLJpUOG6smk3WDSZkj3K8cjhvSg9z0lPoEKM16B6q8,12817
-ocf_data_sampler-0.5.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ocf_data_sampler-0.5.15.dist-info/top_level.txt,sha256=deUxqmsONNAGZDNbsntbXH7BRA1MqWaUeAJrCo6q_xA,25
-ocf_data_sampler-0.5.15.dist-info/RECORD,,
+ocf_data_sampler-0.5.16.dist-info/METADATA,sha256=82UiAraNLrkhOMwZcLeK7Ckg3zgArx5BuzvfBOhy9m8,12817
+ocf_data_sampler-0.5.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ocf_data_sampler-0.5.16.dist-info/top_level.txt,sha256=deUxqmsONNAGZDNbsntbXH7BRA1MqWaUeAJrCo6q_xA,25
+ocf_data_sampler-0.5.16.dist-info/RECORD,,