ocf-data-sampler 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocf-data-sampler might be problematic. Click here for more details.
- ocf_data_sampler/config/model.py +8 -17
- ocf_data_sampler/select/dropout.py +30 -32
- ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +12 -28
- {ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/METADATA +1 -1
- {ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/RECORD +7 -7
- {ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/WHEEL +0 -0
- {ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/top_level.txt +0 -0
ocf_data_sampler/config/model.py
CHANGED
|
@@ -90,7 +90,7 @@ class DropoutMixin(Base):
|
|
|
90
90
|
"negative or zero.",
|
|
91
91
|
)
|
|
92
92
|
|
|
93
|
-
dropout_fraction: float|list[float] = Field(
|
|
93
|
+
dropout_fraction: float | list[float] = Field(
|
|
94
94
|
default=0,
|
|
95
95
|
description="Either a float(Chance of dropout being applied to each sample) or a list of "
|
|
96
96
|
"floats (probability that dropout of the corresponding timedelta is applied)",
|
|
@@ -106,31 +106,22 @@ class DropoutMixin(Base):
|
|
|
106
106
|
|
|
107
107
|
|
|
108
108
|
@field_validator("dropout_fraction")
|
|
109
|
-
def dropout_fractions(cls, dropout_frac: float|list[float]) -> float|list[float]:
|
|
109
|
+
def dropout_fractions(cls, dropout_frac: float | list[float]) -> float | list[float]:
|
|
110
110
|
"""Validate 'dropout_frac'."""
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
raise ValueError("Input should be less than or equal to 1")
|
|
115
|
-
elif not (dropout_frac >= 0):
|
|
116
|
-
raise ValueError("Input should be greater than or equal to 0")
|
|
111
|
+
if isinstance(dropout_frac, float | int):
|
|
112
|
+
if not (0<= dropout_frac <= 1):
|
|
113
|
+
raise ValueError("Dropout fractions must be in range [0, 1]")
|
|
117
114
|
|
|
118
115
|
elif isinstance(dropout_frac, list):
|
|
119
116
|
if not dropout_frac:
|
|
120
117
|
raise ValueError("List cannot be empty")
|
|
121
118
|
|
|
122
|
-
if not all(isinstance(i, float) for i in dropout_frac):
|
|
123
|
-
raise ValueError("All elements in the list must be floats")
|
|
124
|
-
|
|
125
119
|
if not all(0 <= i <= 1 for i in dropout_frac):
|
|
126
|
-
raise ValueError("
|
|
127
|
-
|
|
128
|
-
if not isclose(sum(dropout_frac), 1.0, rel_tol=1e-9):
|
|
129
|
-
raise ValueError("Sum of all floats in the list must be 1.0")
|
|
120
|
+
raise ValueError("All dropout fractions must be in range [0, 1]")
|
|
130
121
|
|
|
122
|
+
if not (0 <= sum(dropout_frac) <= 1):
|
|
123
|
+
raise ValueError("The sum of dropout fractions must be in range [0, 1]")
|
|
131
124
|
|
|
132
|
-
else:
|
|
133
|
-
raise TypeError("Must be either a float or a list of floats")
|
|
134
125
|
return dropout_frac
|
|
135
126
|
|
|
136
127
|
|
|
@@ -9,53 +9,51 @@ import pandas as pd
|
|
|
9
9
|
import xarray as xr
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
def
|
|
12
|
+
def apply_history_dropout(
|
|
13
13
|
t0: pd.Timestamp,
|
|
14
14
|
dropout_timedeltas: list[pd.Timedelta],
|
|
15
|
-
dropout_frac: float|list[float],
|
|
15
|
+
dropout_frac: float | list[float],
|
|
16
16
|
da: xr.DataArray,
|
|
17
17
|
) -> xr.DataArray:
|
|
18
|
-
"""
|
|
18
|
+
"""Apply randomly sampled dropout to the historical part of some sequence data.
|
|
19
|
+
|
|
20
|
+
Dropped out data is replaced with NaNs
|
|
19
21
|
|
|
20
22
|
Args:
|
|
21
|
-
t0: The forecast init-time
|
|
23
|
+
t0: The forecast init-time.
|
|
22
24
|
dropout_timedeltas: List of timedeltas relative to t0 to pick from
|
|
23
|
-
dropout_frac:
|
|
24
|
-
|
|
25
|
-
Or a list of probabilities for each of the corresponding timedeltas
|
|
25
|
+
dropout_frac: The probabilit(ies) that each dropout timedelta will be applied. This should
|
|
26
|
+
be between 0 and 1 inclusive.
|
|
26
27
|
da: Xarray DataArray with 'time_utc' coordinate
|
|
27
28
|
"""
|
|
28
|
-
if
|
|
29
|
-
|
|
30
|
-
if len(dropout_frac) != len(dropout_timedeltas):
|
|
31
|
-
raise ValueError("Lengths of dropout_frac and dropout_timedeltas should match")
|
|
29
|
+
if len(dropout_timedeltas)==0:
|
|
30
|
+
return da
|
|
32
31
|
|
|
32
|
+
if isinstance(dropout_frac, float | int):
|
|
33
33
|
|
|
34
|
+
if not (0<=dropout_frac<=1):
|
|
35
|
+
raise ValueError("`dropout_frac` must be in range [0, 1]")
|
|
34
36
|
|
|
37
|
+
# Create list with equal chance for all dropout timedeltas
|
|
38
|
+
n = len(dropout_timedeltas)
|
|
39
|
+
dropout_frac = [dropout_frac/n for _ in range(n)]
|
|
40
|
+
else:
|
|
41
|
+
if not 0<=sum(dropout_frac)<=1:
|
|
42
|
+
raise ValueError("The sum of `dropout_frac` must be in range [0, 1]")
|
|
43
|
+
if len(dropout_timedeltas)!=len(dropout_frac):
|
|
44
|
+
raise ValueError("`dropout_timedeltas` and `dropout_frac` must have the same length")
|
|
35
45
|
|
|
36
|
-
|
|
46
|
+
dropout_frac = [*dropout_frac] # Make copy of the list so we can append to it
|
|
37
47
|
|
|
38
|
-
|
|
48
|
+
dropout_timedeltas = [*dropout_timedeltas] # Make copy of the list so we can append to it
|
|
39
49
|
|
|
50
|
+
# Add chance of no dropout
|
|
51
|
+
dropout_frac.append(1-sum(dropout_frac))
|
|
52
|
+
dropout_timedeltas.append(None)
|
|
40
53
|
|
|
54
|
+
timedelta_choice = np.random.choice(dropout_timedeltas, p=dropout_frac)
|
|
41
55
|
|
|
42
|
-
|
|
56
|
+
if timedelta_choice is None:
|
|
57
|
+
return da
|
|
43
58
|
else:
|
|
44
|
-
|
|
45
|
-
if dropout_frac > 0 and len(dropout_timedeltas) == 0:
|
|
46
|
-
raise ValueError("To apply dropout, dropout_timedeltas must be provided")
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
if not (0 <= dropout_frac <= 1):
|
|
50
|
-
raise ValueError("dropout_frac must be between 0 and 1 inclusive")
|
|
51
|
-
|
|
52
|
-
if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
|
|
53
|
-
dropout_time = None
|
|
54
|
-
else:
|
|
55
|
-
dropout_time = t0 + np.random.choice(dropout_timedeltas)
|
|
56
|
-
|
|
57
|
-
# apply dropout time
|
|
58
|
-
if dropout_time is None:
|
|
59
|
-
return da
|
|
60
|
-
# This replaces the times after the dropout with NaNs
|
|
61
|
-
return da.where(da.time_utc <= dropout_time)
|
|
59
|
+
return da.where((da.time_utc <= timedelta_choice + t0) | (da.time_utc> t0))
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
"""Slice datasets by time."""
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
import xarray as xr
|
|
5
4
|
|
|
6
5
|
from ocf_data_sampler.config import Configuration
|
|
7
|
-
from ocf_data_sampler.select.dropout import
|
|
6
|
+
from ocf_data_sampler.select.dropout import apply_history_dropout
|
|
8
7
|
from ocf_data_sampler.select.select_time_slice import select_time_slice, select_time_slice_nwp
|
|
9
8
|
from ocf_data_sampler.utils import minutes
|
|
10
9
|
|
|
@@ -52,7 +51,7 @@ def slice_datasets_by_time(
|
|
|
52
51
|
)
|
|
53
52
|
|
|
54
53
|
# Apply the randomly sampled dropout
|
|
55
|
-
sliced_datasets_dict["sat"] =
|
|
54
|
+
sliced_datasets_dict["sat"] = apply_history_dropout(
|
|
56
55
|
t0,
|
|
57
56
|
dropout_timedeltas=minutes(sat_config.dropout_timedeltas_minutes),
|
|
58
57
|
dropout_frac=sat_config.dropout_fraction,
|
|
@@ -62,59 +61,44 @@ def slice_datasets_by_time(
|
|
|
62
61
|
if "gsp" in datasets_dict:
|
|
63
62
|
gsp_config = config.input_data.gsp
|
|
64
63
|
|
|
65
|
-
|
|
64
|
+
da_gsp = select_time_slice(
|
|
66
65
|
datasets_dict["gsp"],
|
|
67
66
|
t0,
|
|
68
67
|
time_resolution=minutes(gsp_config.time_resolution_minutes),
|
|
69
68
|
interval_start=minutes(gsp_config.interval_start_minutes),
|
|
70
|
-
interval_end=minutes(
|
|
69
|
+
interval_end=minutes(gsp_config.interval_end_minutes),
|
|
71
70
|
)
|
|
72
71
|
|
|
73
72
|
# Dropout on the past GSP, but not the future GSP
|
|
74
|
-
|
|
73
|
+
da_gsp = apply_history_dropout(
|
|
75
74
|
t0,
|
|
76
75
|
dropout_timedeltas=minutes(gsp_config.dropout_timedeltas_minutes),
|
|
77
76
|
dropout_frac=gsp_config.dropout_fraction,
|
|
78
|
-
da=
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
da_gsp_future = select_time_slice(
|
|
82
|
-
datasets_dict["gsp"],
|
|
83
|
-
t0,
|
|
84
|
-
time_resolution=minutes(gsp_config.time_resolution_minutes),
|
|
85
|
-
interval_start=minutes(gsp_config.time_resolution_minutes),
|
|
86
|
-
interval_end=minutes(gsp_config.interval_end_minutes),
|
|
77
|
+
da=da_gsp,
|
|
87
78
|
)
|
|
88
79
|
|
|
89
|
-
sliced_datasets_dict["gsp"] =
|
|
80
|
+
sliced_datasets_dict["gsp"] = da_gsp
|
|
90
81
|
|
|
91
82
|
if "site" in datasets_dict:
|
|
92
83
|
site_config = config.input_data.site
|
|
93
84
|
|
|
94
|
-
|
|
85
|
+
da_site = select_time_slice(
|
|
95
86
|
datasets_dict["site"],
|
|
96
87
|
t0,
|
|
97
88
|
time_resolution=minutes(site_config.time_resolution_minutes),
|
|
98
89
|
interval_start=minutes(site_config.interval_start_minutes),
|
|
99
|
-
interval_end=minutes(
|
|
90
|
+
interval_end=minutes(site_config.interval_end_minutes),
|
|
100
91
|
)
|
|
101
92
|
|
|
102
93
|
# Apply the randomly sampled dropout on the past site not the future
|
|
103
|
-
|
|
94
|
+
da_site = apply_history_dropout(
|
|
104
95
|
t0,
|
|
105
96
|
dropout_timedeltas=minutes(site_config.dropout_timedeltas_minutes),
|
|
106
97
|
dropout_frac=site_config.dropout_fraction,
|
|
107
|
-
da=
|
|
98
|
+
da=da_site,
|
|
108
99
|
)
|
|
109
100
|
|
|
110
|
-
|
|
111
|
-
datasets_dict["site"],
|
|
112
|
-
t0,
|
|
113
|
-
time_resolution=minutes(site_config.time_resolution_minutes),
|
|
114
|
-
interval_start=minutes(site_config.time_resolution_minutes),
|
|
115
|
-
interval_end=minutes(site_config.interval_end_minutes),
|
|
116
|
-
)
|
|
101
|
+
sliced_datasets_dict["site"] = da_site
|
|
117
102
|
|
|
118
|
-
sliced_datasets_dict["site"] = xr.concat([da_site_past, da_site_future], dim="time_utc")
|
|
119
103
|
|
|
120
104
|
return sliced_datasets_dict
|
|
@@ -2,7 +2,7 @@ ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,
|
|
|
2
2
|
ocf_data_sampler/utils.py,sha256=CTJf9bjHjO8vOJebUtXiMpvgwUpF7gEOjjaoE77fhTk,1177
|
|
3
3
|
ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
|
|
4
4
|
ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
|
|
5
|
-
ocf_data_sampler/config/model.py,sha256=
|
|
5
|
+
ocf_data_sampler/config/model.py,sha256=5ou8BZgQ9h-xyJEqHdspPKZgZO9Vr6opjSphUys7yE8,11505
|
|
6
6
|
ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
|
|
7
7
|
ocf_data_sampler/data/uk_gsp_locations_20220314.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
|
|
8
8
|
ocf_data_sampler/data/uk_gsp_locations_20250109.csv,sha256=XZISFatnbpO9j8LwaxNKFzQSjs6hcHFsV8a9uDDpy2E,9055334
|
|
@@ -32,7 +32,7 @@ ocf_data_sampler/numpy_sample/satellite.py,sha256=RaYzYIcB1AmDrKeiqSpn4QVfBH-QMe
|
|
|
32
32
|
ocf_data_sampler/numpy_sample/site.py,sha256=4S19bzCN5lswVUrmWRfwpVsBPUE7bi0OIdxsD9wgvhU,982
|
|
33
33
|
ocf_data_sampler/numpy_sample/sun_position.py,sha256=5tt-zNm6aRuZMsxZPaAxyg7HeikswfZCeHWXTHuO2K0,1555
|
|
34
34
|
ocf_data_sampler/select/__init__.py,sha256=mK7Wu_-j9IXGTYrOuDf5yDDuU5a306b0iGKTAooNg_s,210
|
|
35
|
-
ocf_data_sampler/select/dropout.py,sha256=
|
|
35
|
+
ocf_data_sampler/select/dropout.py,sha256=i5NDP6oQnZBkQRJW-aXVrPXawktVKQz5VMexe5Ww51g,2021
|
|
36
36
|
ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
|
|
37
37
|
ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=etkr6LuB7zxkfzWJ6SgHiULdRuFzFlq5bOUNd257Qx4,11545
|
|
38
38
|
ocf_data_sampler/select/geospatial.py,sha256=rvMy_e--3tm-KAy9pU6b9-UMBQqH2sXykr3N_4SHYy4,6528
|
|
@@ -51,12 +51,12 @@ ocf_data_sampler/torch_datasets/utils/add_alterate_coordinate_projections.py,sha
|
|
|
51
51
|
ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py,sha256=SGt1H2nXcaj44ND14-gHzvA7dkLfgjTacCq7rOkRGwg,1991
|
|
52
52
|
ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
|
|
53
53
|
ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
|
|
54
|
-
ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=
|
|
54
|
+
ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=Q_-kCTtUieyEDpSElY1xwJct7Vsw0LAn5MbYSg2O6vg,3621
|
|
55
55
|
ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=xcy75cVxl0WrglnX5YUAFjXXlO2GwEBHWyqo8TDuiOA,4714
|
|
56
56
|
ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul3l0EP73Ik002fStr_bhsZh9mQqEU,4735
|
|
57
57
|
scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
|
|
58
58
|
scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
|
|
59
|
-
ocf_data_sampler-0.5.
|
|
60
|
-
ocf_data_sampler-0.5.
|
|
61
|
-
ocf_data_sampler-0.5.
|
|
62
|
-
ocf_data_sampler-0.5.
|
|
59
|
+
ocf_data_sampler-0.5.15.dist-info/METADATA,sha256=AcLJpUOG6smk3WDSZkj3K8cjhvSg9z0lPoEKM16B6q8,12817
|
|
60
|
+
ocf_data_sampler-0.5.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
61
|
+
ocf_data_sampler-0.5.15.dist-info/top_level.txt,sha256=deUxqmsONNAGZDNbsntbXH7BRA1MqWaUeAJrCo6q_xA,25
|
|
62
|
+
ocf_data_sampler-0.5.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|