ocf-data-sampler 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ocf-data-sampler might be problematic.
- ocf_data_sampler/__init__.py +1 -0
- ocf_data_sampler/data/uk_gsp_locations.csv +319 -0
- ocf_data_sampler/numpy_batch/__init__.py +7 -0
- ocf_data_sampler/numpy_batch/gsp.py +23 -0
- ocf_data_sampler/numpy_batch/nwp.py +33 -0
- ocf_data_sampler/numpy_batch/satellite.py +23 -0
- ocf_data_sampler/numpy_batch/sun_position.py +66 -0
- ocf_data_sampler/select/__init__.py +1 -0
- ocf_data_sampler/select/dropout.py +38 -0
- ocf_data_sampler/select/fill_time_periods.py +11 -0
- ocf_data_sampler/select/find_contiguous_time_periods.py +301 -0
- ocf_data_sampler/select/select_spatial_slice.py +358 -0
- ocf_data_sampler/select/select_time_slice.py +184 -0
- ocf_data_sampler/torch_datasets/__init__.py +1 -0
- ocf_data_sampler/torch_datasets/pvnet_uk_regional.py +538 -0
- {ocf_data_sampler-0.0.7.dist-info → ocf_data_sampler-0.0.8.dist-info}/METADATA +1 -1
- ocf_data_sampler-0.0.8.dist-info/RECORD +22 -0
- ocf_data_sampler-0.0.8.dist-info/top_level.txt +2 -0
- ocf_data_sampler-0.0.7.dist-info/RECORD +0 -7
- ocf_data_sampler-0.0.7.dist-info/top_level.txt +0 -1
- {ocf_data_sampler-0.0.7.dist-info → ocf_data_sampler-0.0.8.dist-info}/LICENSE +0 -0
- {ocf_data_sampler-0.0.7.dist-info → ocf_data_sampler-0.0.8.dist-info}/WHEEL +0 -0
ocf_data_sampler/numpy_batch/gsp.py
@@ -0,0 +1,23 @@
+"""Convert GSP to Numpy Batch"""
+
+import xarray as xr
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def convert_gsp_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NumpyBatch:
+    """Convert from Xarray to NumpyBatch"""
+
+    example: NumpyBatch = {
+        BatchKey.gsp: da.values,
+        BatchKey.gsp_id: da.gsp_id.values,
+        BatchKey.gsp_nominal_capacity_mwp: da.isel(time_utc=0)["nominal_capacity_mwp"].values,
+        BatchKey.gsp_effective_capacity_mwp: da.isel(time_utc=0)["effective_capacity_mwp"].values,
+        BatchKey.gsp_time_utc: da["time_utc"].values.astype(float),
+        BatchKey.gsp_x_osgb: da.x_osgb.item(),
+        BatchKey.gsp_y_osgb: da.y_osgb.item(),
+    }
+
+    if t0_idx is not None:
+        example[BatchKey.gsp_t0_idx] = t0_idx
+
+    return example
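As a rough illustration of what this converter expects, here is a minimal usage sketch (not taken from the package's tests). It assumes ocf-data-sampler 0.0.8 and ocf_datapipes are installed; the GSP id, capacities and OSGB coordinates are invented toy values rather than real data.

import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.numpy_batch.gsp import convert_gsp_to_numpy_batch

# Toy GSP power DataArray carrying the coordinates the converter reads
times = pd.date_range("2024-01-01 00:00", periods=4, freq="30min")
da = xr.DataArray(
    np.random.rand(4),
    dims=["time_utc"],
    coords={
        "time_utc": times,
        "gsp_id": 1,  # invented GSP id
        "x_osgb": 530_000.0,  # invented OSGB easting
        "y_osgb": 180_000.0,  # invented OSGB northing
        "nominal_capacity_mwp": ("time_utc", np.full(4, 50.0)),
        "effective_capacity_mwp": ("time_utc", np.full(4, 45.0)),
    },
)

# Returns a dict keyed by ocf_datapipes BatchKey entries (gsp, gsp_id, gsp_time_utc, ...)
batch = convert_gsp_to_numpy_batch(da, t0_idx=1)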
ocf_data_sampler/numpy_batch/nwp.py
@@ -0,0 +1,33 @@
+"""Convert NWP to NumpyBatch"""
+
+import pandas as pd
+import xarray as xr
+
+from ocf_datapipes.batch import NWPBatchKey, NWPNumpyBatch
+
+
+def convert_nwp_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NWPNumpyBatch:
+    """Convert from Xarray to NWP NumpyBatch"""
+
+    example: NWPNumpyBatch = {
+        NWPBatchKey.nwp: da.values,
+        NWPBatchKey.nwp_channel_names: da.channel.values,
+        NWPBatchKey.nwp_init_time_utc: da.init_time_utc.values.astype(float),
+        NWPBatchKey.nwp_step: (da.step.values / pd.Timedelta("1H")).astype(int),
+    }
+
+    if "target_time_utc" in da.coords:
+        example[NWPBatchKey.nwp_target_time_utc] = da.target_time_utc.values.astype(float)
+
+    # TODO: Do we need this at all? Especially since it is only present in UKV data
+    for batch_key, dataset_key in (
+        (NWPBatchKey.nwp_y_osgb, "y_osgb"),
+        (NWPBatchKey.nwp_x_osgb, "x_osgb"),
+    ):
+        if dataset_key in da.coords:
+            example[batch_key] = da[dataset_key].values
+
+    if t0_idx is not None:
+        example[NWPBatchKey.nwp_t0_idx] = t0_idx
+
+    return example
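A similarly hedged sketch for the NWP converter: the channel names, init time and the (step, channel) dimension layout below are invented for illustration; the real pipeline passes spatially sliced NWP data with more dimensions.

import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.numpy_batch.nwp import convert_nwp_to_numpy_batch

steps = pd.timedelta_range("0h", "3h", freq="1h")
channels = ["t", "dswrf"]  # invented channel names
da = xr.DataArray(
    np.random.rand(len(steps), len(channels)),
    dims=["step", "channel"],
    coords={
        "step": steps,
        "channel": channels,
        "init_time_utc": pd.Timestamp("2024-01-01 00:00"),  # scalar init-time coord
    },
)

# nwp_step comes out as whole hours; target_time_utc and OSGB coords are only
# copied across when present on the DataArray
nwp_batch = convert_nwp_to_numpy_batch(da, t0_idx=0)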
ocf_data_sampler/numpy_batch/satellite.py
@@ -0,0 +1,23 @@
+"""Convert Satellite to NumpyBatch"""
+import xarray as xr
+
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> NumpyBatch:
+    """Convert from Xarray to NumpyBatch"""
+    example: NumpyBatch = {
+        BatchKey.satellite_actual: da.values,
+        BatchKey.satellite_time_utc: da.time_utc.values.astype(float),
+    }
+
+    for batch_key, dataset_key in (
+        (BatchKey.satellite_x_geostationary, "x_geostationary"),
+        (BatchKey.satellite_y_geostationary, "y_geostationary"),
+    ):
+        example[batch_key] = da[dataset_key].values
+
+    if t0_idx is not None:
+        example[BatchKey.satellite_t0_idx] = t0_idx
+
+    return example
ocf_data_sampler/numpy_batch/sun_position.py
@@ -0,0 +1,66 @@
+
+import pvlib
+import numpy as np
+import pandas as pd
+from ocf_datapipes.batch import BatchKey, NumpyBatch
+
+
+def calculate_azimuth_and_elevation(
+    datetimes: pd.DatetimeIndex,
+    lon: float,
+    lat: float
+) -> tuple[np.ndarray, np.ndarray]:
+    """Calculate the solar coordinates for multiple datetimes at a single location
+
+    Args:
+        datetimes: The datetimes to calculate for
+        lon: The longitude
+        lat: The latitude
+
+    Returns:
+        np.ndarray: The azimuth of the datetimes in degrees
+        np.ndarray: The elevation of the datetimes in degrees
+    """
+
+    solpos = pvlib.solarposition.get_solarposition(
+        time=datetimes,
+        longitude=lon,
+        latitude=lat,
+        method='nrel_numpy'
+    )
+    azimuth = solpos["azimuth"].values
+    elevation = solpos["elevation"].values
+    return azimuth, elevation
+
+
+def make_sun_position_numpy_batch(
+    datetimes: pd.DatetimeIndex,
+    lon: float,
+    lat: float,
+    key_preffix: str = "gsp"
+) -> NumpyBatch:
+    """Creates NumpyBatch with standardized solar coordinates
+
+    Args:
+        datetimes: The datetimes to calculate solar angles for
+        lon: The longitude
+        lat: The latitude
+    """
+
+    azimuth, elevation = calculate_azimuth_and_elevation(datetimes, lon, lat)
+
+    # Normalise
+
+    # Azimuth is in range [0, 360] degrees
+    azimuth = azimuth / 360
+
+    # Elevation is in range [-90, 90] degrees
+    elevation = elevation / 180 + 0.5
+
+    # Make NumpyBatch
+    sun_numpy_batch: NumpyBatch = {
+        BatchKey[key_preffix + "_solar_azimuth"]: azimuth,
+        BatchKey[key_preffix + "_solar_elevation"]: elevation,
+    }
+
+    return sun_numpy_batch
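The solar-position helpers can be exercised on their own; a minimal sketch with an invented location (roughly central London) and invented datetimes:

import pandas as pd

from ocf_data_sampler.numpy_batch.sun_position import (
    calculate_azimuth_and_elevation,
    make_sun_position_numpy_batch,
)

datetimes = pd.date_range("2024-06-21 04:00", "2024-06-21 20:00", freq="30min")

# Azimuth/elevation in degrees, via pvlib's NREL SPA implementation
azimuth, elevation = calculate_azimuth_and_elevation(datetimes, lon=-0.12, lat=51.5)

# Same values normalised to roughly [0, 1] and keyed as
# BatchKey.gsp_solar_azimuth / BatchKey.gsp_solar_elevation (default "gsp" prefix)
sun_batch = make_sun_position_numpy_batch(datetimes, lon=-0.12, lat=51.5)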
ocf_data_sampler/select/__init__.py
@@ -0,0 +1 @@
+
ocf_data_sampler/select/dropout.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+def draw_dropout_time(
+    t0: pd.Timestamp,
+    dropout_timedeltas: list[pd.Timedelta] | None,
+    dropout_frac: float = 0,
+):
+
+    if dropout_timedeltas is not None:
+        assert len(dropout_timedeltas) >= 1, "Must include list of relative dropout timedeltas"
+        assert all(
+            [t < pd.Timedelta("0min") for t in dropout_timedeltas]
+        ), "dropout timedeltas must be negative"
+    assert 0 <= dropout_frac <= 1
+
+    if (dropout_timedeltas is None) or (np.random.uniform() >= dropout_frac):
+        dropout_time = None
+    else:
+        t0_datetime_utc = pd.Timestamp(t0)
+        dt = np.random.choice(dropout_timedeltas)
+        dropout_time = t0_datetime_utc + dt
+
+    return dropout_time
+
+
+def apply_dropout_time(
+    ds: xr.Dataset,
+    dropout_time: pd.Timestamp | None,
+):
+
+    if dropout_time is None:
+        return ds
+    else:
+        # This replaces the times after the dropout with NaNs
+        return ds.where(ds.time_utc <= dropout_time)
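A minimal sketch of the dropout utilities, using an invented t0, invented dropout timedeltas and a toy dataset with a time_utc coordinate:

import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.select.dropout import apply_dropout_time, draw_dropout_time

t0 = pd.Timestamp("2024-01-01 12:00")

# With dropout_frac=0.5 roughly half of the draws return None (no dropout);
# otherwise one of the negative timedeltas is added to t0
dropout_time = draw_dropout_time(
    t0,
    dropout_timedeltas=[pd.Timedelta("-30min"), pd.Timedelta("-60min")],
    dropout_frac=0.5,
)

times = pd.date_range("2024-01-01 10:00", "2024-01-01 12:00", freq="5min")
ds = xr.Dataset(
    {"value": ("time_utc", np.random.rand(len(times)))},
    coords={"time_utc": times},
)

# Timesteps after the drawn dropout time are replaced with NaN
ds_dropped = apply_dropout_time(ds, dropout_time)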
ocf_data_sampler/select/fill_time_periods.py
@@ -0,0 +1,11 @@
+"""fill time periods"""
+
+import pandas as pd
+import numpy as np
+
+
+def fill_time_periods(time_periods: pd.DataFrame, freq: pd.Timedelta):
+    start_dts = pd.to_datetime(time_periods["start_dt"].values).ceil(freq)
+    end_dts = pd.to_datetime(time_periods["end_dt"].values)
+    date_ranges = [pd.date_range(start_dt, end_dt, freq=freq) for start_dt, end_dt in zip(start_dts, end_dts)]
+    return pd.DatetimeIndex(np.concatenate(date_ranges))
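For example (a sketch with invented start/end times), fill_time_periods expands each start/end pair into a regular DatetimeIndex, ceiling the start times to the requested frequency:

import pandas as pd

from ocf_data_sampler.select.fill_time_periods import fill_time_periods

time_periods = pd.DataFrame(
    {
        "start_dt": pd.to_datetime(["2024-01-01 00:07", "2024-01-01 06:00"]),
        "end_dt": pd.to_datetime(["2024-01-01 02:00", "2024-01-01 07:00"]),
    }
)

# First period starts at 00:30 (00:07 ceiled to 30 minutes), giving
# 00:30, 01:00, 01:30, 02:00, then 06:00, 06:30, 07:00 from the second period
timestamps = fill_time_periods(time_periods, freq=pd.Timedelta("30min"))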
ocf_data_sampler/select/find_contiguous_time_periods.py
@@ -0,0 +1,301 @@
+"""Get contiguous time periods for training"""
+
+import numpy as np
+import pandas as pd
+
+
+
+def find_contiguous_time_periods(
+    datetimes: pd.DatetimeIndex,
+    min_seq_length: int,
+    max_gap_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
+
+    Args:
+        datetimes: pd.DatetimeIndex. Must be sorted.
+        min_seq_length: Sequences of min_seq_length or shorter will be discarded. Typically, this
+            would be set to the `total_seq_length` of each machine learning example.
+        max_gap_duration: If any pair of consecutive `datetimes` is more than `max_gap_duration`
+            apart, then this pair of `datetimes` will be considered a "gap" between two contiguous
+            sequences. Typically, `max_gap_duration` would be set to the sample period of
+            the timeseries.
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    # Sanity checks.
+    assert len(datetimes) > 0
+    assert min_seq_length > 1
+    assert datetimes.is_monotonic_increasing
+    assert datetimes.is_unique
+
+    # Find indices of gaps larger than max_gap:
+    gap_mask = pd.TimedeltaIndex(np.diff(datetimes)) > max_gap_duration
+    gap_indices = np.argwhere(gap_mask)[:, 0]
+
+    # gap_indicies are the indices into dt_index for the timestep immediately before the gap.
+    # e.g. if the datetimes at 12:00, 12:05, 18:00, 18:05 then gap_indicies will be [1].
+    # So we add 1 to gap_indices to get segment_boundaries, an index into dt_index
+    # which identifies the _start_ of each segment.
+    segment_boundaries = gap_indices + 1
+
+    # Capture the last segment of dt_index.
+    segment_boundaries = np.concatenate((segment_boundaries, [len(datetimes)]))
+
+    periods: list[dict[str, pd.Timestamp]] = []
+    start_i = 0
+    for next_start_i in segment_boundaries:
+        n_timesteps = next_start_i - start_i
+        if n_timesteps > min_seq_length:
+            end_i = next_start_i - 1
+            period = {"start_dt": datetimes[start_i], "end_dt": datetimes[end_i]}
+            periods.append(period)
+        start_i = next_start_i
+
+    assert len(periods) > 0, (
+        f"Did not find an periods from {datetimes}. " f"{min_seq_length=} {max_gap_duration=}"
+    )
+
+    return pd.DataFrame(periods)
+
+
+def trim_contiguous_time_periods(
+    contiguous_time_periods: pd.DataFrame,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Trim the contiguous time periods to allow for history and forecast durations.
+
+    Args:
+        contiguous_time_periods: DataFrame where each row represents a single time period. The
+            DataFrame must have `start_dt` and `end_dt` columns.
+        history_duration: Length of the historical slice used for a sample
+        forecast_duration: Length of the forecast slice used for a sample
+
+
+    Returns:
+        The contiguous_time_periods DataFrame with the `start_dt` and `end_dt` columns updated.
+    """
+    contiguous_time_periods = contiguous_time_periods.copy()
+
+    contiguous_time_periods["start_dt"] += history_duration
+    contiguous_time_periods["end_dt"] -= forecast_duration
+
+    valid_mask = contiguous_time_periods["start_dt"] <= contiguous_time_periods["end_dt"]
+    contiguous_time_periods = contiguous_time_periods.loc[valid_mask]
+
+    return contiguous_time_periods
+
+
+
+def find_contiguous_t0_periods(
+    datetimes: pd.DatetimeIndex,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+    sample_period_duration: pd.Timedelta,
+) -> pd.DataFrame:
+    """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
+
+    Args:
+        datetimes: pd.DatetimeIndex. Must be sorted.
+        history_duration: Length of the historical slice used for each sample
+        forecast_duration: Length of the forecast slice used for each sample
+        sample_period_duration: The sample frequency of the timeseries
+
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    total_duration = history_duration + forecast_duration
+
+    contiguous_time_periods = find_contiguous_time_periods(
+        datetimes=datetimes,
+        min_seq_length=int(total_duration / sample_period_duration) + 1,
+        max_gap_duration=sample_period_duration,
+    )
+
+    contiguous_t0_periods = trim_contiguous_time_periods(
+        contiguous_time_periods=contiguous_time_periods,
+        history_duration=history_duration,
+        forecast_duration=forecast_duration,
+    )
+
+    assert len(contiguous_t0_periods) > 0
+
+    return contiguous_t0_periods
+
+
+def _find_contiguous_t0_periods_nwp(
+    ds,
+    history_duration: pd.Timedelta,
+    forecast_duration: pd.Timedelta,
+    max_staleness: pd.Timedelta | None = None,
+    max_dropout: pd.Timedelta = pd.Timedelta(0),
+    time_dim: str = "init_time_utc",
+    end_buffer: pd.Timedelta = pd.Timedelta(0),
+):
+
+    assert "step" in ds.coords
+    # It is possible to use up to this amount of max staleness for the dataset and slice
+    # required
+    possible_max_staleness = (
+        pd.Timedelta(ds["step"].max().item())
+        - forecast_duration
+        - end_buffer
+    )
+
+    # If max_staleness is set to None we set it based on the max step ahead of the input
+    # forecast data
+    if max_staleness is None:
+        max_staleness = possible_max_staleness
+    else:
+        # Make sure the max acceptable staleness isn't longer than the max possible
+        assert max_staleness <= possible_max_staleness
+        max_staleness = max_staleness
+
+    contiguous_time_periods = find_contiguous_t0_periods_nwp(
+        datetimes=pd.DatetimeIndex(ds[time_dim]),
+        history_duration=history_duration,
+        max_staleness=max_staleness,
+        max_dropout=max_dropout,
+    )
+    return contiguous_time_periods
+
+
+
+def find_contiguous_t0_periods_nwp(
+    datetimes: pd.DatetimeIndex,
+    history_duration: pd.Timedelta,
+    max_staleness: pd.Timedelta,
+    max_dropout: pd.Timedelta = pd.Timedelta(0),
+) -> pd.DataFrame:
+    """Get all time periods from the NWP init times which are valid as t0 datetimes.
+
+    Args:
+        datetimes: Sorted pd.DatetimeIndex
+        history_duration: Length of the historical slice used for a sample
+        max_staleness: Up to how long after an NWP forecast init_time are we willing to use the
+            forecast. Each init time will only be used up to this t0 time regardless of the forecast
+            valid time.
+        max_dropout: What is the maximum amount of dropout that will be used. This must be <=
+            max_staleness.
+
+    Returns:
+        pd.DataFrame where each row represents a single time period. The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    # Sanity checks.
+    assert len(datetimes) > 0
+    assert datetimes.is_monotonic_increasing
+    assert datetimes.is_unique
+    assert history_duration >= pd.Timedelta(0)
+    assert max_staleness >= pd.Timedelta(0)
+    assert max_dropout <= max_staleness
+
+    hist_drop_buffer = max(history_duration, max_dropout)
+
+    # Store contiguous periods
+    contiguous_periods = []
+
+    # Start first period allowing for history slice and max dropout
+    start_this_period = datetimes[0] + hist_drop_buffer
+
+    # The first forecast is valid up to the max staleness
+    end_this_period = datetimes[0] + max_staleness
+
+    for dt_init in datetimes[1:]:
+        # If the previous init time becomes stale before the next init becomes valid whilst also
+        # considering dropout - then the contiguous period breaks, and new starts with considering
+        # dropout and history duration
+        if end_this_period < dt_init + max_dropout:
+            contiguous_periods.append([start_this_period, end_this_period])
+
+            # And start a new period
+            start_this_period = dt_init + hist_drop_buffer
+        end_this_period = dt_init + max_staleness
+
+    contiguous_periods.append([start_this_period, end_this_period])
+
+    return pd.DataFrame(contiguous_periods, columns=["start_dt", "end_dt"])
+
+
+def intersection_of_multiple_dataframes_of_periods(
+    time_periods: list[pd.DataFrame],
+) -> pd.DataFrame:
+    """Find the intersection of a list of time periods.
+
+    See the docstring of intersection_of_2_dataframes_of_periods() for more details.
+    """
+    assert len(time_periods) > 0
+    intersection = time_periods[0]
+    for time_period in time_periods[1:]:
+        intersection = intersection_of_2_dataframes_of_periods(intersection, time_period)
+    return intersection
+
+
+def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
+    """Find the intersection of two pd.DataFrames of time periods.
+
+    Each row of each pd.DataFrame represents a single time period. Each pd.DataFrame has
+    two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+
+    A typical use-case is that each pd.DataFrame represents all the time periods where
+    a `DataSource` has contiguous, valid data.
+
+    Here's a graphical example of two pd.DataFrames of time periods and their intersection:
+
+                 ----------------------> TIME ->---------------------
+               a: |-----|   |----|     |----------|      |-----------|
+               b:    |--------|          |----|              |---|
+     intersection:   |--|   |-|          |--|                |---|
+
+    Args:
+        a: pd.DataFrame where each row represents a time period. The pd.DataFrame has
+            two columns: start_dt and end_dt.
+        b: pd.DataFrame where each row represents a time period. The pd.DataFrame has
+            two columns: start_dt and end_dt.
+
+    Returns:
+        Sorted list of intersecting time periods represented as a pd.DataFrame with two columns:
+        start_dt and end_dt.
+    """
+    if a.empty or b.empty:
+        return pd.DataFrame(columns=["start_dt", "end_dt"])
+
+    all_intersecting_periods = []
+    for a_period in a.itertuples():
+        # Five ways in which two periods may overlap:
+        # a: |----| or |---|   or  |---| or   |--|   or |-|
+        # b:  |--|       |---|    |---|     |------|    |-|
+        # In all five, `a` must always start before `b` ends,
+        # and `a` must always end after `b` starts:
+
+        # TODO: <= and >= because we should allow overlap time periods of length 1. e.g.
+        # a: |----|      or   |---|
+        # b:       |--|             |---|
+        # These aren't allowed if we use < and >.
+
+        overlapping_periods = b[(a_period.start_dt < b.end_dt) & (a_period.end_dt > b.start_dt)]
+
+        # There are two ways in which two periods may *not* overlap:
+        # a: |---|        or        |---|
+        # b:        |---|    |---|
+        # `overlapping` will not include periods which do *not* overlap.
+
+        # Now find the intersection of each period in `overlapping_periods` with
+        # the period from `a` that starts at `a_start_dt` and ends at `a_end_dt`.
+        # We do this by clipping each row of `overlapping_periods`
+        # to start no earlier than `a_start_dt`, and end no later than `a_end_dt`.
+
+        # First, make a copy, so we don't clip the underlying data in `b`.
+        intersection = overlapping_periods.copy()
+        intersection["start_dt"] = intersection.start_dt.clip(lower=a_period.start_dt)
+        intersection["end_dt"] = intersection.end_dt.clip(upper=a_period.end_dt)
+
+        all_intersecting_periods.append(intersection)
+
+    all_intersecting_periods = pd.concat(all_intersecting_periods)
+    return all_intersecting_periods.sort_values(by="start_dt").reset_index(drop=True)
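To show how these pieces fit together, a minimal sketch with an invented 5-minutely index containing one gap: find_contiguous_t0_periods derives the valid t0 ranges, and the intersection helper combines them with the valid periods of a second (here hand-written) data source.

import pandas as pd

from ocf_data_sampler.select.find_contiguous_time_periods import (
    find_contiguous_t0_periods,
    intersection_of_multiple_dataframes_of_periods,
)

# 5-minutely timestamps with a gap between 06:00 and 09:00
datetimes = pd.date_range("2024-01-01 00:00", "2024-01-01 06:00", freq="5min").union(
    pd.date_range("2024-01-01 09:00", "2024-01-01 15:00", freq="5min")
)

# Valid t0 ranges, leaving room for 1h of history and 2h of forecast in each sample
t0_periods = find_contiguous_t0_periods(
    datetimes=datetimes,
    history_duration=pd.Timedelta("1h"),
    forecast_duration=pd.Timedelta("2h"),
    sample_period_duration=pd.Timedelta("5min"),
)

# Hand-written valid period for a second data source
other_periods = pd.DataFrame(
    {
        "start_dt": [pd.Timestamp("2024-01-01 02:00")],
        "end_dt": [pd.Timestamp("2024-01-01 12:00")],
    }
)

# t0 ranges that are valid for both sources
valid_t0_periods = intersection_of_multiple_dataframes_of_periods([t0_periods, other_periods])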