ocf-data-sampler 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ocf-data-sampler has been flagged as possibly problematic.

@@ -0,0 +1,7 @@
+"""Conversion from Xarray to NumpyBatch"""
+
+from .gsp import convert_gsp_to_numpy_batch
+from .nwp import convert_nwp_to_numpy_batch
+from .satellite import convert_satellite_to_numpy_batch
+from .sun_position import make_sun_position_numpy_batch
+
@@ -0,0 +1,23 @@
+"""Convert GSP to Numpy Batch"""
+
+import xarray as xr
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def convert_gsp_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NumpyBatch:
+    """Convert from Xarray to NumpyBatch"""
+
+    example: NumpyBatch = {
+        BatchKey.gsp: da.values,
+        BatchKey.gsp_id: da.gsp_id.values,
+        BatchKey.gsp_nominal_capacity_mwp: da.isel(time_utc=0)["nominal_capacity_mwp"].values,
+        BatchKey.gsp_effective_capacity_mwp: da.isel(time_utc=0)["effective_capacity_mwp"].values,
+        BatchKey.gsp_time_utc: da["time_utc"].values.astype(float),
+        BatchKey.gsp_x_osgb: da.x_osgb.item(),
+        BatchKey.gsp_y_osgb: da.y_osgb.item(),
+    }
+
+    if t0_idx is not None:
+        example[BatchKey.gsp_t0_idx] = t0_idx
+
+    return example
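
Below is a minimal usage sketch for the converter in this hunk, using a toy DataArray whose coordinate names mirror what convert_gsp_to_numpy_batch reads. The import path is an assumption, since the diff does not show file names.

    import numpy as np
    import pandas as pd
    import xarray as xr

    # Assumed import path: the diff does not include file names
    from ocf_data_sampler.numpy_batch import convert_gsp_to_numpy_batch

    times = pd.date_range("2024-01-01 00:00", periods=4, freq="30min")
    da = xr.DataArray(
        np.random.rand(4),
        dims=["time_utc"],
        coords={
            "time_utc": times,
            "gsp_id": 1,
            "x_osgb": 123456.0,
            "y_osgb": 654321.0,
            "nominal_capacity_mwp": ("time_utc", np.full(4, 50.0)),
            "effective_capacity_mwp": ("time_utc", np.full(4, 45.0)),
        },
    )

    # Returns a dict keyed by BatchKey entries; t0_idx marks the "present" timestep
    batch = convert_gsp_to_numpy_batch(da, t0_idx=1)
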
@@ -0,0 +1,33 @@
+"""Convert NWP to NumpyBatch"""
+
+import pandas as pd
+import xarray as xr
+
+from ocf_datapipes.batch import NWPBatchKey, NWPNumpyBatch
+
+
+def convert_nwp_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NWPNumpyBatch:
+    """Convert from Xarray to NWP NumpyBatch"""
+
+    example: NWPNumpyBatch = {
+        NWPBatchKey.nwp: da.values,
+        NWPBatchKey.nwp_channel_names: da.channel.values,
+        NWPBatchKey.nwp_init_time_utc: da.init_time_utc.values.astype(float),
+        NWPBatchKey.nwp_step: (da.step.values / pd.Timedelta("1H")).astype(int),
+    }
+
+    if "target_time_utc" in da.coords:
+        example[NWPBatchKey.nwp_target_time_utc] = da.target_time_utc.values.astype(float)
+
+    # TODO: Do we need this at all? Especially since it is only present in UKV data
+    for batch_key, dataset_key in (
+        (NWPBatchKey.nwp_y_osgb, "y_osgb"),
+        (NWPBatchKey.nwp_x_osgb, "x_osgb"),
+    ):
+        if dataset_key in da.coords:
+            example[batch_key] = da[dataset_key].values
+
+    if t0_idx is not None:
+        example[NWPBatchKey.nwp_t0_idx] = t0_idx
+
+    return example
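
One detail worth noting in this hunk is how NWPBatchKey.nwp_step is built: dividing the timedelta64 step values by a one-hour Timedelta yields float hours, which are then cast to int. A self-contained sketch of just that conversion:

    import pandas as pd

    # Forecast steps as a timedelta64 array, e.g. 0 to 3 hours ahead
    steps = pd.to_timedelta(["0h", "1h", "2h", "3h"]).values

    # Division by a Timedelta gives floats in hours; casting gives array([0, 1, 2, 3])
    step_hours = (steps / pd.Timedelta("1H")).astype(int)
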
@@ -0,0 +1,23 @@
+"""Convert Satellite to NumpyBatch"""
+import xarray as xr
+
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NumpyBatch:
+    """Convert from Xarray to NumpyBatch"""
+    example: NumpyBatch = {
+        BatchKey.satellite_actual: da.values,
+        BatchKey.satellite_time_utc: da.time_utc.values.astype(float),
+    }
+
+    for batch_key, dataset_key in (
+        (BatchKey.satellite_x_geostationary, "x_geostationary"),
+        (BatchKey.satellite_y_geostationary, "y_geostationary"),
+    ):
+        example[batch_key] = da[dataset_key].values
+
+    if t0_idx is not None:
+        example[BatchKey.satellite_t0_idx] = t0_idx
+
+    return example
@@ -0,0 +1,66 @@
+
+import pvlib
+import numpy as np
+import pandas as pd
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def calculate_azimuth_and_elevation(
+    datetimes: pd.DatetimeIndex,
+    lon: float,
+    lat: float
+) -> tuple[np.ndarray, np.ndarray]:
+    """Calculate the solar coordinates for multiple datetimes at a single location
+
+    Args:
+        datetimes: The datetimes to calculate for
+        lon: The longitude
+        lat: The latitude
+
+    Returns:
+        np.ndarray: The azimuth of the datetimes in degrees
+        np.ndarray: The elevation of the datetimes in degrees
+    """
+
+    solpos = pvlib.solarposition.get_solarposition(
+        time=datetimes,
+        longitude=lon,
+        latitude=lat,
+        method='nrel_numpy'
+    )
+    azimuth = solpos["azimuth"].values
+    elevation = solpos["elevation"].values
+    return azimuth, elevation
+
+
+def make_sun_position_numpy_batch(
+    datetimes: pd.DatetimeIndex,
+    lon: float,
+    lat: float,
+    key_preffix: str = "gsp"
+) -> NumpyBatch:
+    """Creates NumpyBatch with standardized solar coordinates
+
+    Args:
+        datetimes: The datetimes to calculate solar angles for
+        lon: The longitude
+        lat: The latitude
+    """
+
+    azimuth, elevation = calculate_azimuth_and_elevation(datetimes, lon, lat)
+
+    # Normalise
+
+    # Azimuth is in range [0, 360] degrees
+    azimuth = azimuth / 360
+
+    # Elevation is in range [-90, 90] degrees
+    elevation = elevation / 180 + 0.5
+
+    # Make NumpyBatch
+    sun_numpy_batch: NumpyBatch = {
+        BatchKey[key_preffix + "_solar_azimuth"]: azimuth,
+        BatchKey[key_preffix + "_solar_elevation"]: elevation,
+    }
+
+    return sun_numpy_batch
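
A minimal sketch of calling the helper above: the returned dict is keyed by BatchKey.gsp_solar_azimuth and BatchKey.gsp_solar_elevation (for the default prefix), with values normalised to [0, 1]. The import path is an assumption.

    import pandas as pd
    from ocf_datapipes.batch import BatchKey

    # Assumed import path: the diff does not include file names
    from ocf_data_sampler.numpy_batch import make_sun_position_numpy_batch

    datetimes = pd.date_range("2024-06-21 06:00", periods=5, freq="30min")
    batch = make_sun_position_numpy_batch(datetimes, lon=-1.5, lat=52.0, key_preffix="gsp")

    # Both arrays have length 5 and lie in [0, 1]
    azimuth = batch[BatchKey.gsp_solar_azimuth]
    elevation = batch[BatchKey.gsp_solar_elevation]
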
@@ -0,0 +1 @@
+
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+def draw_dropout_time(
+    t0: pd.Timestamp,
+    dropout_timedeltas: list[pd.Timedelta] | None,
+    dropout_frac: float = 0,
+):
+
+    if dropout_timedeltas is not None:
+        assert len(dropout_timedeltas) >= 1, "Must include list of relative dropout timedeltas"
+        assert all(
+            [t < pd.Timedelta("0min") for t in dropout_timedeltas]
+        ), "dropout timedeltas must be negative"
+        assert 0 <= dropout_frac <= 1
+
+    if (dropout_timedeltas is None) or (np.random.uniform() >= dropout_frac):
+        dropout_time = None
+    else:
+        t0_datetime_utc = pd.Timestamp(t0)
+        dt = np.random.choice(dropout_timedeltas)
+        dropout_time = t0_datetime_utc + dt
+
+    return dropout_time
+
+
+def apply_dropout_time(
+    ds: xr.Dataset,
+    dropout_time: pd.Timestamp | None,
+):
+
+    if dropout_time is None:
+        return ds
+    else:
+        # This replaces the times after the dropout with NaNs
+        return ds.where(ds.time_utc <= dropout_time)
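
A sketch of how the two helpers in this hunk compose, with hypothetical dropout settings and an assumed import path:

    import numpy as np
    import pandas as pd
    import xarray as xr

    # Assumed import path: the diff does not include file names
    from ocf_data_sampler.select.dropout import apply_dropout_time, draw_dropout_time

    t0 = pd.Timestamp("2024-01-01 12:00")

    # Roughly half of samples get dropout, 30 or 60 minutes before t0
    dropout_time = draw_dropout_time(
        t0,
        dropout_timedeltas=[pd.Timedelta("-30min"), pd.Timedelta("-60min")],
        dropout_frac=0.5,
    )

    ds = xr.Dataset(
        {"value": ("time_utc", np.arange(6.0))},
        coords={"time_utc": pd.date_range("2024-01-01 10:00", periods=6, freq="30min")},
    )

    # Timestamps after dropout_time become NaN; a no-op if dropout_time is None
    ds_dropped = apply_dropout_time(ds, dropout_time)
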
@@ -0,0 +1,11 @@
+"""fill time periods"""
+
+import pandas as pd
+import numpy as np
+
+
+def fill_time_periods(time_periods: pd.DataFrame, freq: pd.Timedelta):
+    start_dts = pd.to_datetime(time_periods["start_dt"].values).ceil(freq)
+    end_dts = pd.to_datetime(time_periods["end_dt"].values)
+    date_ranges = [pd.date_range(start_dt, end_dt, freq=freq) for start_dt, end_dt in zip(start_dts, end_dts)]
+    return pd.DatetimeIndex(np.concatenate(date_ranges))
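
A sketch of what fill_time_periods produces: each start_dt/end_dt row is expanded into timestamps at the requested frequency (with starts ceiled to that frequency) and the results are concatenated into a single DatetimeIndex. The import path is an assumption.

    import pandas as pd

    # Assumed import path: the diff does not include file names
    from ocf_data_sampler.select.fill_time_periods import fill_time_periods

    time_periods = pd.DataFrame(
        {
            "start_dt": [pd.Timestamp("2024-01-01 00:07"), pd.Timestamp("2024-01-01 03:00")],
            "end_dt": [pd.Timestamp("2024-01-01 01:00"), pd.Timestamp("2024-01-01 04:00")],
        }
    )

    # The first range starts at 00:30 because 00:07 is ceiled to the 30-minute grid
    datetimes = fill_time_periods(time_periods, freq=pd.Timedelta("30min"))
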
@@ -0,0 +1,301 @@
+"""Get contiguous time periods for training"""
+
+import numpy as np
+import pandas as pd
+
+
+
+def find_contiguous_time_periods(
+    datetimes: pd.DatetimeIndex,
+    min_seq_length: int,
+    max_gap_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
+
+    Args:
+        datetimes: pd.DatetimeIndex. Must be sorted.
+        min_seq_length: Sequences of min_seq_length or shorter will be discarded. Typically, this
+            would be set to the `total_seq_length` of each machine learning example.
+        max_gap_duration: If any pair of consecutive `datetimes` is more than `max_gap_duration`
+            apart, then this pair of `datetimes` will be considered a "gap" between two contiguous
+            sequences. Typically, `max_gap_duration` would be set to the sample period of
+            the timeseries.
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    # Sanity checks.
+    assert len(datetimes) > 0
+    assert min_seq_length > 1
+    assert datetimes.is_monotonic_increasing
+    assert datetimes.is_unique
+
+    # Find indices of gaps larger than max_gap:
+    gap_mask = pd.TimedeltaIndex(np.diff(datetimes)) > max_gap_duration
+    gap_indices = np.argwhere(gap_mask)[:, 0]
+
+    # gap_indicies are the indices into dt_index for the timestep immediately before the gap.
+    # e.g. if the datetimes at 12:00, 12:05, 18:00, 18:05 then gap_indicies will be [1].
+    # So we add 1 to gap_indices to get segment_boundaries, an index into dt_index
+    # which identifies the _start_ of each segment.
+    segment_boundaries = gap_indices + 1
+
+    # Capture the last segment of dt_index.
+    segment_boundaries = np.concatenate((segment_boundaries, [len(datetimes)]))
+
+    periods: list[dict[str, pd.Timestamp]] = []
+    start_i = 0
+    for next_start_i in segment_boundaries:
+        n_timesteps = next_start_i - start_i
+        if n_timesteps > min_seq_length:
+            end_i = next_start_i - 1
+            period = {"start_dt": datetimes[start_i], "end_dt": datetimes[end_i]}
+            periods.append(period)
+        start_i = next_start_i
+
+    assert len(periods) > 0, (
+        f"Did not find an periods from {datetimes}. " f"{min_seq_length=} {max_gap_duration=}"
+    )
+
+    return pd.DataFrame(periods)
+
+
+def trim_contiguous_time_periods(
+    contiguous_time_periods: pd.DataFrame,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Trim the contiguous time periods to allow for history and forecast durations.
+
+    Args:
+        contiguous_time_periods: DataFrame where each row represents a single time period. The
+            DataFrame must have `start_dt` and `end_dt` columns.
+        history_duration: Length of the historical slice used for a sample
+        forecast_duration: Length of the forecast slice used for a sample
+
+
+    Returns:
+        The contiguous_time_periods DataFrame with the `start_dt` and `end_dt` columns updated.
+    """
+    contiguous_time_periods = contiguous_time_periods.copy()
+
+    contiguous_time_periods["start_dt"] += history_duration
+    contiguous_time_periods["end_dt"] -= forecast_duration
+
+    valid_mask = contiguous_time_periods["start_dt"] <= contiguous_time_periods["end_dt"]
+    contiguous_time_periods = contiguous_time_periods.loc[valid_mask]
+
+    return contiguous_time_periods
+
+
+
+def find_contiguous_t0_periods(
+    datetimes: pd.DatetimeIndex,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+    sample_period_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
+
+    Args:
+        datetimes: pd.DatetimeIndex. Must be sorted.
+        history_duration: Length of the historical slice used for each sample
+        forecast_duration: Length of the forecast slice used for each sample
+        sample_period_duration: The sample frequency of the timeseries
+
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    total_duration = history_duration + forecast_duration
+
+    contiguous_time_periods = find_contiguous_time_periods(
+        datetimes=datetimes,
+        min_seq_length=int(total_duration / sample_period_duration) + 1,
+        max_gap_duration=sample_period_duration,
+    )
+
+    contiguous_t0_periods = trim_contiguous_time_periods(
+        contiguous_time_periods=contiguous_time_periods,
+        history_duration=history_duration,
+        forecast_duration=forecast_duration,
+    )
+
+    assert len(contiguous_t0_periods) > 0
+
+    return contiguous_t0_periods
+
+
+def _find_contiguous_t0_periods_nwp(
+    ds,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+    max_staleness: pd.Timedelta | None = None,
+    max_dropout: pd.Timedelta = pd.Timedelta(0),
+    time_dim: str = "init_time_utc",
+    end_buffer: pd.Timedelta = pd.Timedelta(0),
+):
+
+    assert "step" in ds.coords
+    # It is possible to use up to this amount of max staleness for the dataset and slice
+    # required
+    possible_max_staleness = (
+        pd.Timedelta(ds["step"].max().item())
+        - forecast_duration
+        - end_buffer
+    )
+
+    # If max_staleness is set to None we set it based on the max step ahead of the input
+    # forecast data
+    if max_staleness is None:
+        max_staleness = possible_max_staleness
+    else:
+        # Make sure the max acceptable staleness isn't longer than the max possible
+        assert max_staleness <= possible_max_staleness
+        max_staleness = max_staleness
+
+    contiguous_time_periods = find_contiguous_t0_periods_nwp(
+        datetimes=pd.DatetimeIndex(ds[time_dim]),
+        history_duration=history_duration,
+        max_staleness=max_staleness,
+        max_dropout=max_dropout,
+    )
+    return contiguous_time_periods
+
+
+
+def find_contiguous_t0_periods_nwp(
+    datetimes: pd.DatetimeIndex,
+    history_duration: pd.Timedelta,
+    max_staleness: pd.Timedelta,
+    max_dropout: pd.Timedelta = pd.Timedelta(0),
+) -> pd.DataFrame:
+    """Get all time periods from the NWP init times which are valid as t0 datetimes.
+
+    Args:
+        datetimes: Sorted pd.DatetimeIndex
+        history_duration: Length of the historical slice used for a sample
+        max_staleness: Up to how long after an NWP forecast init_time are we willing to use the
+            forecast. Each init time will only be used up to this t0 time regardless of the forecast
+            valid time.
+        max_dropout: What is the maximum amount of dropout that will be used. This must be <=
+            max_staleness.
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    # Sanity checks.
+    assert len(datetimes) > 0
+    assert datetimes.is_monotonic_increasing
+    assert datetimes.is_unique
+    assert history_duration >= pd.Timedelta(0)
+    assert max_staleness >= pd.Timedelta(0)
+    assert max_dropout <= max_staleness
+
+    hist_drop_buffer = max(history_duration, max_dropout)
+
+    # Store contiguous periods
+    contiguous_periods = []
+
+    # Start first period allowing for history slice and max dropout
+    start_this_period = datetimes[0] + hist_drop_buffer
+
+    # The first forecast is valid up to the max staleness
+    end_this_period = datetimes[0] + max_staleness
+
+    for dt_init in datetimes[1:]:
+        # If the previous init time becomes stale before the next init becomes valid whilst also
+        # considering dropout - then the contiguous period breaks, and new starts with considering
+        # dropout and history duration
+        if end_this_period < dt_init + max_dropout:
+            contiguous_periods.append([start_this_period, end_this_period])
+
+            # And start a new period
+            start_this_period = dt_init + hist_drop_buffer
+        end_this_period = dt_init + max_staleness
+
+    contiguous_periods.append([start_this_period, end_this_period])
+
+    return pd.DataFrame(contiguous_periods, columns=["start_dt", "end_dt"])
+
+
+def intersection_of_multiple_dataframes_of_periods(
+    time_periods: list[pd.DataFrame],
+) -> pd.DataFrame:
+    """Find the intersection of a list of time periods.
+
+    See the docstring of intersection_of_2_dataframes_of_periods() for more details.
+    """
+    assert len(time_periods) > 0
+    intersection = time_periods[0]
+    for time_period in time_periods[1:]:
+        intersection = intersection_of_2_dataframes_of_periods(intersection, time_period)
+    return intersection
+
+
+def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
+    """Find the intersection of two pd.DataFrames of time periods.
+
+    Each row of each pd.DataFrame represents a single time period. Each pd.DataFrame has
+    two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+
+    A typical use-case is that each pd.DataFrame represents all the time periods where
+    a `DataSource` has contiguous, valid data.
+
+    Here's a graphical example of two pd.DataFrames of time periods and their intersection:
+
+    ----------------------> TIME ->---------------------
+    a: |-----| |----| |----------| |-----------|
+    b: |--------| |----| |---|
+    intersection: |--| |-| |--| |---|
+
+    Args:
+        a: pd.DataFrame where each row represents a time period. The pd.DataFrame has
+            two columns: start_dt and end_dt.
+        b: pd.DataFrame where each row represents a time period. The pd.DataFrame has
+            two columns: start_dt and end_dt.
+
+    Returns:
+        Sorted list of intersecting time periods represented as a pd.DataFrame with two columns:
+        start_dt and end_dt.
+    """
+    if a.empty or b.empty:
+        return pd.DataFrame(columns=["start_dt", "end_dt"])
+
+    all_intersecting_periods = []
+    for a_period in a.itertuples():
+        # Five ways in which two periods may overlap:
+        # a: |----| or |---| or |---| or |--| or |-|
+        # b: |--| |---| |---| |------| |-|
+        # In all five, `a` must always start before `b` ends,
+        # and `a` must always end after `b` starts:
+
+        # TODO: <= and >= because we should allow overlap time periods of length 1. e.g.
+        # a: |----| or |---|
+        # b: |--| |---|
+        # These aren't allowed if we use < and >.
+
+        overlapping_periods = b[(a_period.start_dt < b.end_dt) & (a_period.end_dt > b.start_dt)]
+
+        # There are two ways in which two periods may *not* overlap:
+        # a: |---| or |---|
+        # b: |---| |---|
+        # `overlapping` will not include periods which do *not* overlap.
+
+        # Now find the intersection of each period in `overlapping_periods` with
+        # the period from `a` that starts at `a_start_dt` and ends at `a_end_dt`.
+        # We do this by clipping each row of `overlapping_periods`
+        # to start no earlier than `a_start_dt`, and end no later than `a_end_dt`.
+
+        # First, make a copy, so we don't clip the underlying data in `b`.
+        intersection = overlapping_periods.copy()
+        intersection["start_dt"] = intersection.start_dt.clip(lower=a_period.start_dt)
+        intersection["end_dt"] = intersection.end_dt.clip(upper=a_period.end_dt)
+
+        all_intersecting_periods.append(intersection)
+
+    all_intersecting_periods = pd.concat(all_intersecting_periods)
+    return all_intersecting_periods.sort_values(by="start_dt").reset_index(drop=True)
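
Finally, a sketch of how these helpers fit together the way a training pipeline might use them: find valid t0 periods for a regularly sampled series and for NWP init times, then intersect the two. The module path is an assumption, since the diff does not show file names.

    import pandas as pd

    # Assumed import path: the diff does not include file names
    from ocf_data_sampler.select.find_contiguous_time_periods import (
        find_contiguous_t0_periods,
        find_contiguous_t0_periods_nwp,
        intersection_of_multiple_dataframes_of_periods,
    )

    obs_times = pd.date_range("2024-01-01 00:00", "2024-01-02 00:00", freq="30min")
    nwp_init_times = pd.date_range("2024-01-01 00:00", "2024-01-02 00:00", freq="3h")

    # Periods of t0 values with enough history and forecast either side
    obs_periods = find_contiguous_t0_periods(
        datetimes=obs_times,
        history_duration=pd.Timedelta("1h"),
        forecast_duration=pd.Timedelta("2h"),
        sample_period_duration=pd.Timedelta("30min"),
    )

    # Periods where an NWP forecast is still fresh enough to be used
    nwp_periods = find_contiguous_t0_periods_nwp(
        datetimes=nwp_init_times,
        history_duration=pd.Timedelta("1h"),
        max_staleness=pd.Timedelta("6h"),
    )

    # t0 values that are valid for both inputs
    valid_t0_periods = intersection_of_multiple_dataframes_of_periods([obs_periods, nwp_periods])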