ocf-data-sampler 0.5.14__tar.gz → 0.5.15__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (70)
  1. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/PKG-INFO +1 -1
  2. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/config/model.py +8 -17
  3. ocf_data_sampler-0.5.15/ocf_data_sampler/select/dropout.py +59 -0
  4. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +12 -28
  5. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/PKG-INFO +1 -1
  6. ocf_data_sampler-0.5.14/ocf_data_sampler/select/dropout.py +0 -61
  7. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/LICENSE +0 -0
  8. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/README.md +0 -0
  9. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/__init__.py +0 -0
  10. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/config/__init__.py +0 -0
  11. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/config/load.py +0 -0
  12. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/config/save.py +0 -0
  13. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/data/uk_gsp_locations_20220314.csv +0 -0
  14. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/data/uk_gsp_locations_20250109.csv +0 -0
  15. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/__init__.py +0 -0
  16. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/gsp.py +0 -0
  17. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/load_dataset.py +0 -0
  18. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/__init__.py +0 -0
  19. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/nwp.py +0 -0
  20. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
  21. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/cloudcasting.py +0 -0
  22. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
  23. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/gfs.py +0 -0
  24. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/icon.py +0 -0
  25. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
  26. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/nwp/providers/utils.py +0 -0
  27. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/open_xarray_tensorstore.py +0 -0
  28. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/satellite.py +0 -0
  29. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/site.py +0 -0
  30. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/load/utils.py +0 -0
  31. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/__init__.py +0 -0
  32. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/collate.py +0 -0
  33. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/common_types.py +0 -0
  34. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/datetime_features.py +0 -0
  35. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/gsp.py +0 -0
  36. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/nwp.py +0 -0
  37. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/satellite.py +0 -0
  38. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/site.py +0 -0
  39. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/numpy_sample/sun_position.py +0 -0
  40. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/__init__.py +0 -0
  41. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/fill_time_periods.py +0 -0
  42. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/find_contiguous_time_periods.py +0 -0
  43. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/geospatial.py +0 -0
  44. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/location.py +0 -0
  45. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
  46. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/select/select_time_slice.py +0 -0
  47. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/datasets/__init__.py +0 -0
  48. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +0 -0
  49. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/datasets/site.py +0 -0
  50. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/sample/__init__.py +0 -0
  51. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/sample/base.py +0 -0
  52. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/sample/site.py +0 -0
  53. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/sample/uk_regional.py +0 -0
  54. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/__init__.py +0 -0
  55. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/add_alterate_coordinate_projections.py +0 -0
  56. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py +0 -0
  57. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +0 -0
  58. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py +0 -0
  59. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +0 -0
  60. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/torch_datasets/utils/validation_utils.py +0 -0
  61. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler/utils.py +0 -0
  62. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/SOURCES.txt +0 -0
  63. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
  64. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/requires.txt +0 -0
  65. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/ocf_data_sampler.egg-info/top_level.txt +0 -0
  66. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/pyproject.toml +0 -0
  67. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/scripts/download_gsp_location_data.py +0 -0
  68. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/scripts/refactor_site.py +0 -0
  69. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/setup.cfg +0 -0
  70. {ocf_data_sampler-0.5.14 → ocf_data_sampler-0.5.15}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocf-data-sampler
3
- Version: 0.5.14
3
+ Version: 0.5.15
4
4
  Author: James Fulton, Peter Dudfield
5
5
  Author-email: Open Climate Fix team <info@openclimatefix.org>
6
6
  License: MIT License
@@ -90,7 +90,7 @@ class DropoutMixin(Base):
90
90
  "negative or zero.",
91
91
  )
92
92
 
93
- dropout_fraction: float|list[float] = Field(
93
+ dropout_fraction: float | list[float] = Field(
94
94
  default=0,
95
95
  description="Either a float(Chance of dropout being applied to each sample) or a list of "
96
96
  "floats (probability that dropout of the corresponding timedelta is applied)",
@@ -106,31 +106,22 @@ class DropoutMixin(Base):
106
106
 
107
107
 
108
108
  @field_validator("dropout_fraction")
109
- def dropout_fractions(cls, dropout_frac: float|list[float]) -> float|list[float]:
109
+ def dropout_fractions(cls, dropout_frac: float | list[float]) -> float | list[float]:
110
110
  """Validate 'dropout_frac'."""
111
- from math import isclose
112
- if isinstance(dropout_frac, float):
113
- if not (dropout_frac <= 1):
114
- raise ValueError("Input should be less than or equal to 1")
115
- elif not (dropout_frac >= 0):
116
- raise ValueError("Input should be greater than or equal to 0")
111
+ if isinstance(dropout_frac, float | int):
112
+ if not (0<= dropout_frac <= 1):
113
+ raise ValueError("Dropout fractions must be in range [0, 1]")
117
114
 
118
115
  elif isinstance(dropout_frac, list):
119
116
  if not dropout_frac:
120
117
  raise ValueError("List cannot be empty")
121
118
 
122
- if not all(isinstance(i, float) for i in dropout_frac):
123
- raise ValueError("All elements in the list must be floats")
124
-
125
119
  if not all(0 <= i <= 1 for i in dropout_frac):
126
- raise ValueError("Each float in the list must be between 0 and 1")
127
-
128
- if not isclose(sum(dropout_frac), 1.0, rel_tol=1e-9):
129
- raise ValueError("Sum of all floats in the list must be 1.0")
120
+ raise ValueError("All dropout fractions must be in range [0, 1]")
130
121
 
122
+ if not (0 <= sum(dropout_frac) <= 1):
123
+ raise ValueError("The sum of dropout fractions must be in range [0, 1]")
131
124
 
132
- else:
133
- raise TypeError("Must be either a float or a list of floats")
134
125
  return dropout_frac
135
126
 
136
127
 
@@ -0,0 +1,59 @@
1
+ """Functions for simulating dropout in time series data.
2
+
3
+ This is used for the following types of data: GSP, Satellite and Site
4
+ This is not used for NWP
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import xarray as xr
10
+
11
+
12
+ def apply_history_dropout(
13
+ t0: pd.Timestamp,
14
+ dropout_timedeltas: list[pd.Timedelta],
15
+ dropout_frac: float | list[float],
16
+ da: xr.DataArray,
17
+ ) -> xr.DataArray:
18
+ """Apply randomly sampled dropout to the historical part of some sequence data.
19
+
20
+ Dropped out data is replaced with NaNs
21
+
22
+ Args:
23
+ t0: The forecast init-time.
24
+ dropout_timedeltas: List of timedeltas relative to t0 to pick from
25
+ dropout_frac: The probabilit(ies) that each dropout timedelta will be applied. This should
26
+ be between 0 and 1 inclusive.
27
+ da: Xarray DataArray with 'time_utc' coordinate
28
+ """
29
+ if len(dropout_timedeltas)==0:
30
+ return da
31
+
32
+ if isinstance(dropout_frac, float | int):
33
+
34
+ if not (0<=dropout_frac<=1):
35
+ raise ValueError("`dropout_frac` must be in range [0, 1]")
36
+
37
+ # Create list with equal chance for all dropout timedeltas
38
+ n = len(dropout_timedeltas)
39
+ dropout_frac = [dropout_frac/n for _ in range(n)]
40
+ else:
41
+ if not 0<=sum(dropout_frac)<=1:
42
+ raise ValueError("The sum of `dropout_frac` must be in range [0, 1]")
43
+ if len(dropout_timedeltas)!=len(dropout_frac):
44
+ raise ValueError("`dropout_timedeltas` and `dropout_frac` must have the same length")
45
+
46
+ dropout_frac = [*dropout_frac] # Make copy of the list so we can append to it
47
+
48
+ dropout_timedeltas = [*dropout_timedeltas] # Make copy of the list so we can append to it
49
+
50
+ # Add chance of no dropout
51
+ dropout_frac.append(1-sum(dropout_frac))
52
+ dropout_timedeltas.append(None)
53
+
54
+ timedelta_choice = np.random.choice(dropout_timedeltas, p=dropout_frac)
55
+
56
+ if timedelta_choice is None:
57
+ return da
58
+ else:
59
+ return da.where((da.time_utc <= timedelta_choice + t0) | (da.time_utc> t0))
@@ -1,10 +1,9 @@
1
1
  """Slice datasets by time."""
2
2
 
3
3
  import pandas as pd
4
- import xarray as xr
5
4
 
6
5
  from ocf_data_sampler.config import Configuration
7
- from ocf_data_sampler.select.dropout import apply_sampled_dropout_time
6
+ from ocf_data_sampler.select.dropout import apply_history_dropout
8
7
  from ocf_data_sampler.select.select_time_slice import select_time_slice, select_time_slice_nwp
9
8
  from ocf_data_sampler.utils import minutes
10
9
 
@@ -52,7 +51,7 @@ def slice_datasets_by_time(
52
51
  )
53
52
 
54
53
  # Apply the randomly sampled dropout
55
- sliced_datasets_dict["sat"] = apply_sampled_dropout_time(
54
+ sliced_datasets_dict["sat"] = apply_history_dropout(
56
55
  t0,
57
56
  dropout_timedeltas=minutes(sat_config.dropout_timedeltas_minutes),
58
57
  dropout_frac=sat_config.dropout_fraction,
@@ -62,59 +61,44 @@ def slice_datasets_by_time(
62
61
  if "gsp" in datasets_dict:
63
62
  gsp_config = config.input_data.gsp
64
63
 
65
- da_gsp_past = select_time_slice(
64
+ da_gsp = select_time_slice(
66
65
  datasets_dict["gsp"],
67
66
  t0,
68
67
  time_resolution=minutes(gsp_config.time_resolution_minutes),
69
68
  interval_start=minutes(gsp_config.interval_start_minutes),
70
- interval_end=minutes(0),
69
+ interval_end=minutes(gsp_config.interval_end_minutes),
71
70
  )
72
71
 
73
72
  # Dropout on the past GSP, but not the future GSP
74
- da_gsp_past = apply_sampled_dropout_time(
73
+ da_gsp = apply_history_dropout(
75
74
  t0,
76
75
  dropout_timedeltas=minutes(gsp_config.dropout_timedeltas_minutes),
77
76
  dropout_frac=gsp_config.dropout_fraction,
78
- da=da_gsp_past,
79
- )
80
-
81
- da_gsp_future = select_time_slice(
82
- datasets_dict["gsp"],
83
- t0,
84
- time_resolution=minutes(gsp_config.time_resolution_minutes),
85
- interval_start=minutes(gsp_config.time_resolution_minutes),
86
- interval_end=minutes(gsp_config.interval_end_minutes),
77
+ da=da_gsp,
87
78
  )
88
79
 
89
- sliced_datasets_dict["gsp"] = xr.concat([da_gsp_past, da_gsp_future], dim="time_utc")
80
+ sliced_datasets_dict["gsp"] = da_gsp
90
81
 
91
82
  if "site" in datasets_dict:
92
83
  site_config = config.input_data.site
93
84
 
94
- da_site_past = select_time_slice(
85
+ da_site = select_time_slice(
95
86
  datasets_dict["site"],
96
87
  t0,
97
88
  time_resolution=minutes(site_config.time_resolution_minutes),
98
89
  interval_start=minutes(site_config.interval_start_minutes),
99
- interval_end=minutes(0),
90
+ interval_end=minutes(site_config.interval_end_minutes),
100
91
  )
101
92
 
102
93
  # Apply the randomly sampled dropout on the past site not the future
103
- da_site_past = apply_sampled_dropout_time(
94
+ da_site = apply_history_dropout(
104
95
  t0,
105
96
  dropout_timedeltas=minutes(site_config.dropout_timedeltas_minutes),
106
97
  dropout_frac=site_config.dropout_fraction,
107
- da=da_site_past,
98
+ da=da_site,
108
99
  )
109
100
 
110
- da_site_future = select_time_slice(
111
- datasets_dict["site"],
112
- t0,
113
- time_resolution=minutes(site_config.time_resolution_minutes),
114
- interval_start=minutes(site_config.time_resolution_minutes),
115
- interval_end=minutes(site_config.interval_end_minutes),
116
- )
101
+ sliced_datasets_dict["site"] = da_site
117
102
 
118
- sliced_datasets_dict["site"] = xr.concat([da_site_past, da_site_future], dim="time_utc")
119
103
 
120
104
  return sliced_datasets_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocf-data-sampler
3
- Version: 0.5.14
3
+ Version: 0.5.15
4
4
  Author: James Fulton, Peter Dudfield
5
5
  Author-email: Open Climate Fix team <info@openclimatefix.org>
6
6
  License: MIT License
@@ -1,61 +0,0 @@
1
- """Functions for simulating dropout in time series data.
2
-
3
- This is used for the following types of data: GSP, Satellite and Site
4
- This is not used for NWP
5
- """
6
-
7
- import numpy as np
8
- import pandas as pd
9
- import xarray as xr
10
-
11
-
12
- def apply_sampled_dropout_time(
13
- t0: pd.Timestamp,
14
- dropout_timedeltas: list[pd.Timedelta],
15
- dropout_frac: float|list[float],
16
- da: xr.DataArray,
17
- ) -> xr.DataArray:
18
- """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
19
-
20
- Args:
21
- t0: The forecast init-time
22
- dropout_timedeltas: List of timedeltas relative to t0 to pick from
23
- dropout_frac: Either a probability that dropout will be applied.
24
- This should be between 0 and 1 inclusive.
25
- Or a list of probabilities for each of the corresponding timedeltas
26
- da: Xarray DataArray with 'time_utc' coordinate
27
- """
28
- if isinstance(dropout_frac, list):
29
- # checking if len match
30
- if len(dropout_frac) != len(dropout_timedeltas):
31
- raise ValueError("Lengths of dropout_frac and dropout_timedeltas should match")
32
-
33
-
34
-
35
-
36
- dropout_time = t0 + np.random.choice(dropout_timedeltas,p=dropout_frac)
37
-
38
- return da.where(da.time_utc <= dropout_time)
39
-
40
-
41
-
42
- # old logic
43
- else:
44
- # sample dropout time
45
- if dropout_frac > 0 and len(dropout_timedeltas) == 0:
46
- raise ValueError("To apply dropout, dropout_timedeltas must be provided")
47
-
48
-
49
- if not (0 <= dropout_frac <= 1):
50
- raise ValueError("dropout_frac must be between 0 and 1 inclusive")
51
-
52
- if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
53
- dropout_time = None
54
- else:
55
- dropout_time = t0 + np.random.choice(dropout_timedeltas)
56
-
57
- # apply dropout time
58
- if dropout_time is None:
59
- return da
60
- # This replaces the times after the dropout with NaNs
61
- return da.where(da.time_utc <= dropout_time)