ocf-data-sampler 0.2.20__tar.gz → 0.2.22__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (67)
  1. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/PKG-INFO +1 -1
  2. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/gsp.py +1 -1
  3. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/dropout.py +10 -20
  4. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +11 -13
  5. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +19 -23
  6. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler.egg-info/PKG-INFO +1 -1
  7. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/LICENSE +0 -0
  8. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/README.md +0 -0
  9. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/__init__.py +0 -0
  10. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/config/__init__.py +0 -0
  11. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/config/load.py +0 -0
  12. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/config/model.py +0 -0
  13. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/config/save.py +0 -0
  14. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/data/uk_gsp_locations_20220314.csv +0 -0
  15. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/data/uk_gsp_locations_20250109.csv +0 -0
  16. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/__init__.py +0 -0
  17. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/load_dataset.py +0 -0
  18. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/__init__.py +0 -0
  19. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/nwp.py +0 -0
  20. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
  21. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/cloudcasting.py +0 -0
  22. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
  23. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/gfs.py +0 -0
  24. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/icon.py +0 -0
  25. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
  26. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/nwp/providers/utils.py +0 -0
  27. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/satellite.py +0 -0
  28. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/site.py +0 -0
  29. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/load/utils.py +0 -0
  30. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/__init__.py +0 -0
  31. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/collate.py +0 -0
  32. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/common_types.py +0 -0
  33. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/datetime_features.py +0 -0
  34. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/gsp.py +0 -0
  35. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/nwp.py +0 -0
  36. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/satellite.py +0 -0
  37. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/site.py +0 -0
  38. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/numpy_sample/sun_position.py +0 -0
  39. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/__init__.py +0 -0
  40. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/fill_time_periods.py +0 -0
  41. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/find_contiguous_time_periods.py +0 -0
  42. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/geospatial.py +0 -0
  43. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/location.py +0 -0
  44. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
  45. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/select/select_time_slice.py +0 -0
  46. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/datasets/__init__.py +0 -0
  47. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/datasets/site.py +0 -0
  48. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/sample/__init__.py +0 -0
  49. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/sample/base.py +0 -0
  50. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/sample/site.py +0 -0
  51. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/sample/uk_regional.py +0 -0
  52. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/__init__.py +0 -0
  53. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py +0 -0
  54. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +0 -0
  55. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py +0 -0
  56. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +0 -0
  57. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/torch_datasets/utils/validation_utils.py +0 -0
  58. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler/utils.py +0 -0
  59. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler.egg-info/SOURCES.txt +0 -0
  60. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
  61. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler.egg-info/requires.txt +0 -0
  62. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/ocf_data_sampler.egg-info/top_level.txt +0 -0
  63. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/pyproject.toml +0 -0
  64. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/scripts/download_gsp_location_data.py +0 -0
  65. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/scripts/refactor_site.py +0 -0
  66. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/setup.cfg +0 -0
  67. {ocf_data_sampler-0.2.20 → ocf_data_sampler-0.2.22}/utils/compute_icon_mean_stddev.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocf-data-sampler
3
- Version: 0.2.20
3
+ Version: 0.2.22
4
4
  Author: James Fulton, Peter Dudfield
5
5
  Author-email: Open Climate Fix team <info@openclimatefix.org>
6
6
  License: MIT License
@@ -48,7 +48,7 @@ def open_gsp(zarr_path: str, boundaries_version: str = "20220314") -> xr.DataArr
48
48
 
49
49
  if not (ds.gsp_id.isin(df_gsp_loc.index)).all():
50
50
  raise ValueError(
51
- "Some GSP IDs in the GSP generation data are available in the locations file.",
51
+ "Some GSP IDs in the GSP generation data are not available in the locations file.",
52
52
  )
53
53
 
54
54
  # Select the locations by the GSP IDs in the generation data
@@ -9,19 +9,22 @@ import pandas as pd
9
9
  import xarray as xr
10
10
 
11
11
 
12
- def draw_dropout_time(
12
+ def apply_sampled_dropout_time(
13
13
  t0: pd.Timestamp,
14
14
  dropout_timedeltas: list[pd.Timedelta],
15
15
  dropout_frac: float,
16
- ) -> pd.Timestamp:
17
- """Randomly pick a dropout time from a list of timedeltas.
16
+ da: xr.DataArray,
17
+ ) -> xr.DataArray:
18
+ """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
18
19
 
19
20
  Args:
20
21
  t0: The forecast init-time
21
22
  dropout_timedeltas: List of timedeltas relative to t0 to pick from
22
23
  dropout_frac: Probability that dropout will be applied.
23
24
  This should be between 0 and 1 inclusive
25
+ da: Xarray DataArray with 'time_utc' coordinate
24
26
  """
27
+ # sample dropout time
25
28
  if dropout_frac > 0 and len(dropout_timedeltas) == 0:
26
29
  raise ValueError("To apply dropout, dropout_timedeltas must be provided")
27
30
 
@@ -37,21 +40,8 @@ def draw_dropout_time(
37
40
  else:
38
41
  dropout_time = t0 + np.random.choice(dropout_timedeltas)
39
42
 
40
- return dropout_time
41
-
42
-
43
- def apply_dropout_time(
44
- ds: xr.DataArray,
45
- dropout_time: pd.Timestamp | None,
46
- ) -> xr.DataArray:
47
- """Apply dropout time to the data.
48
-
49
- Args:
50
- ds: Xarray DataArray with 'time_utc' coordinate
51
- dropout_time: Time after which data is set to NaN
52
- """
43
+ # apply dropout time
53
44
  if dropout_time is None:
54
- return ds
55
- else:
56
- # This replaces the times after the dropout with NaNs
57
- return ds.where(ds.time_utc <= dropout_time)
45
+ return da
46
+ # This replaces the times after the dropout with NaNs
47
+ return da.where(da.time_utc <= dropout_time)
@@ -1,6 +1,5 @@
1
1
  """Torch dataset for UK PVNet."""
2
2
 
3
- import numpy as np
4
3
  import pandas as pd
5
4
  import xarray as xr
6
5
  from torch.utils.data import Dataset
@@ -257,22 +256,12 @@ class PVNetUKRegionalDataset(AbstractPVNetUKDataset):
257
256
  # Construct a lookup for locations - useful for users to construct sample by GSP ID
258
257
  location_lookup = {loc.id: loc for loc in self.locations}
259
258
 
260
- # Construct indices for sampling
261
- t_index, loc_index = np.meshgrid(
262
- np.arange(len(self.valid_t0_times)),
263
- np.arange(len(self.locations)),
264
- )
265
-
266
- # Make array of all possible (t0, location) coordinates. Each row is a single coordinate
267
- index_pairs = np.stack((t_index.ravel(), loc_index.ravel())).T
268
-
269
259
  # Assign coords and indices to self
270
260
  self.location_lookup = location_lookup
271
- self.index_pairs = index_pairs
272
261
 
273
262
  @override
274
263
  def __len__(self) -> int:
275
- return len(self.index_pairs)
264
+ return len(self.locations)*len(self.valid_t0_times)
276
265
 
277
266
  def _get_sample(self, t0: pd.Timestamp, location: Location) -> NumpySample:
278
267
  """Generate the PVNet sample for given coordinates.
@@ -290,7 +279,16 @@ class PVNetUKRegionalDataset(AbstractPVNetUKDataset):
290
279
  @override
291
280
  def __getitem__(self, idx: int) -> NumpySample:
292
281
  # Get the coordinates of the sample
293
- t_index, loc_index = self.index_pairs[idx]
282
+
283
+ if idx >= len(self):
284
+ raise ValueError(f"Index {idx} out of range for dataset of length {len(self)}")
285
+
286
+ # t_index will be between 0 and len(self.valid_t0_times)-1
287
+ t_index = idx % len(self.valid_t0_times)
288
+
289
+ # For each location, there are len(self.valid_t0_times) possible samples
290
+ loc_index = idx // len(self.valid_t0_times)
291
+
294
292
  location = self.locations[loc_index]
295
293
  t0 = self.valid_t0_times[t_index]
296
294
 
@@ -4,7 +4,7 @@ import pandas as pd
4
4
  import xarray as xr
5
5
 
6
6
  from ocf_data_sampler.config import Configuration
7
- from ocf_data_sampler.select.dropout import apply_dropout_time, draw_dropout_time
7
+ from ocf_data_sampler.select.dropout import apply_sampled_dropout_time
8
8
  from ocf_data_sampler.select.select_time_slice import select_time_slice, select_time_slice_nwp
9
9
  from ocf_data_sampler.utils import minutes
10
10
 
@@ -51,17 +51,12 @@ def slice_datasets_by_time(
51
51
  interval_end=minutes(sat_config.interval_end_minutes),
52
52
  )
53
53
 
54
- # Randomly sample dropout
55
- sat_dropout_time = draw_dropout_time(
54
+ # Apply the randomly sampled dropout
55
+ sliced_datasets_dict["sat"] = apply_sampled_dropout_time(
56
56
  t0,
57
57
  dropout_timedeltas=minutes(sat_config.dropout_timedeltas_minutes),
58
58
  dropout_frac=sat_config.dropout_fraction,
59
- )
60
-
61
- # Apply the dropout
62
- sliced_datasets_dict["sat"] = apply_dropout_time(
63
- sliced_datasets_dict["sat"],
64
- sat_dropout_time,
59
+ da=sliced_datasets_dict["sat"],
65
60
  )
66
61
 
67
62
  if "gsp" in datasets_dict:
@@ -76,15 +71,11 @@ def slice_datasets_by_time(
76
71
  )
77
72
 
78
73
  # Dropout on the past GSP, but not the future GSP
79
- gsp_dropout_time = draw_dropout_time(
74
+ da_gsp_past = apply_sampled_dropout_time(
80
75
  t0,
81
76
  dropout_timedeltas=minutes(gsp_config.dropout_timedeltas_minutes),
82
77
  dropout_frac=gsp_config.dropout_fraction,
83
- )
84
-
85
- da_gsp_past = apply_dropout_time(
86
- da_gsp_past,
87
- gsp_dropout_time,
78
+ da=da_gsp_past,
88
79
  )
89
80
 
90
81
  da_gsp_future = select_time_slice(
@@ -100,25 +91,30 @@ def slice_datasets_by_time(
100
91
  if "site" in datasets_dict:
101
92
  site_config = config.input_data.site
102
93
 
103
- sliced_datasets_dict["site"] = select_time_slice(
94
+ da_site_past = select_time_slice(
104
95
  datasets_dict["site"],
105
96
  t0,
106
97
  time_resolution=minutes(site_config.time_resolution_minutes),
107
98
  interval_start=minutes(site_config.interval_start_minutes),
108
- interval_end=minutes(site_config.interval_end_minutes),
99
+ interval_end=minutes(0),
109
100
  )
110
101
 
111
- # Randomly sample dropout
112
- site_dropout_time = draw_dropout_time(
102
+ # Apply the randomly sampled dropout on the past site not the future
103
+ da_site_past = apply_sampled_dropout_time(
113
104
  t0,
114
105
  dropout_timedeltas=minutes(site_config.dropout_timedeltas_minutes),
115
106
  dropout_frac=site_config.dropout_fraction,
107
+ da=da_site_past,
116
108
  )
117
109
 
118
- # Apply the dropout
119
- sliced_datasets_dict["site"] = apply_dropout_time(
120
- sliced_datasets_dict["site"],
121
- site_dropout_time,
110
+ da_site_future = select_time_slice(
111
+ datasets_dict["site"],
112
+ t0,
113
+ time_resolution=minutes(site_config.time_resolution_minutes),
114
+ interval_start=minutes(site_config.time_resolution_minutes),
115
+ interval_end=minutes(site_config.interval_end_minutes),
122
116
  )
123
117
 
118
+ sliced_datasets_dict["site"] = xr.concat([da_site_past, da_site_future], dim="time_utc")
119
+
124
120
  return sliced_datasets_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocf-data-sampler
3
- Version: 0.2.20
3
+ Version: 0.2.22
4
4
  Author: James Fulton, Peter Dudfield
5
5
  Author-email: Open Climate Fix team <info@openclimatefix.org>
6
6
  License: MIT License