ocf-data-sampler 0.0.25__tar.gz → 0.0.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ocf-data-sampler has been flagged as possibly problematic.
- {ocf_data_sampler-0.0.25/ocf_data_sampler.egg-info → ocf_data_sampler-0.0.27}/PKG-INFO +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/model.py +85 -122
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/load_dataset.py +6 -6
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/find_contiguous_time_periods.py +40 -75
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/select_time_slice.py +24 -33
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/spatial_slice_for_dataset.py +4 -4
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/time_slice_for_dataset.py +18 -17
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/process_and_combine.py +13 -14
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/pvnet_uk_regional.py +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/site.py +10 -10
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/valid_time_periods.py +20 -12
- ocf_data_sampler-0.0.25/ocf_data_sampler/time_functions.py → ocf_data_sampler-0.0.27/ocf_data_sampler/utils.py +1 -2
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27/ocf_data_sampler.egg-info}/PKG-INFO +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/SOURCES.txt +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/pyproject.toml +1 -1
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/config/test_config.py +23 -14
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/conftest.py +7 -5
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_find_contiguous_time_periods.py +8 -8
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_select_time_slice.py +31 -43
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/torch_datasets/test_pvnet_uk_regional.py +4 -4
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/torch_datasets/test_site.py +2 -2
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/LICENSE +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/MANIFEST.in +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/README.md +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/load.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/config/save.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/constants.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/data/uk_gsp_locations.csv +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/nwp/providers/utils.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/load/utils.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/numpy_batch/sun_position.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/dropout.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/fill_time_periods.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/geospatial.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/location.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler/torch_datasets/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/requires.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/ocf_data_sampler.egg-info/top_level.txt +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/scripts/refactor_site.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/setup.cfg +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/__init__.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/load/test_load_sites.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_gsp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_nwp.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_satellite.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/numpy_batch/test_sun_position.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_dropout.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_fill_time_periods.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_location.py +0 -0
- {ocf_data_sampler-0.0.25 → ocf_data_sampler-0.0.27}/tests/select/test_select_spatial_slice.py +0 -0
ocf_data_sampler/config/model.py:

```diff
@@ -15,6 +15,7 @@ from typing import Dict, List, Optional
 from typing_extensions import Self
 
 from pydantic import BaseModel, Field, RootModel, field_validator, ValidationInfo, model_validator
+
 from ocf_data_sampler.constants import NWP_PROVIDERS
 
 logger = logging.getLogger(__name__)
```
```diff
@@ -34,26 +35,50 @@ class Base(BaseModel):
 class General(Base):
     """General pydantic model"""
 
-    name: str = Field("example", description="The name of this configuration file.")
+    name: str = Field("example", description="The name of this configuration file")
     description: str = Field(
         "example configuration", description="Description of this configuration file"
     )
 
 
-class DataSourceMixin(Base):
-    """Mixin class, to add forecast and history minutes"""
+class TimeWindowMixin(Base):
+    """Mixin class, to add interval start, end and resolution minutes"""
 
-    forecast_minutes: int = Field(
+    time_resolution_minutes: int = Field(
         ...,
-        ge=0,
-        description="how many minutes to forecast in the future. ",
+        gt=0,
+        description="The temporal resolution of the data in minutes",
     )
-    history_minutes: int = Field(
+
+    interval_start_minutes: int = Field(
         ...,
-        ge=0,
-        description="how many historic minutes to use. ",
+        description="Data interval starts at `t0 + interval_start_minutes`",
     )
 
+    interval_end_minutes: int = Field(
+        ...,
+        description="Data interval ends at `t0 + interval_end_minutes`",
+    )
+
+    @model_validator(mode='after')
+    def check_interval_range(cls, values):
+        if values.interval_start_minutes > values.interval_end_minutes:
+            raise ValueError('interval_start_minutes must be <= interval_end_minutes')
+        return values
+
+    @field_validator("interval_start_minutes")
+    def interval_start_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
+        if v % info.data["time_resolution_minutes"] != 0:
+            raise ValueError("interval_start_minutes must be divisible by time_resolution_minutes")
+        return v
+
+    @field_validator("interval_end_minutes")
+    def interval_end_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
+        if v % info.data["time_resolution_minutes"] != 0:
+            raise ValueError("interval_end_minutes must be divisible by time_resolution_minutes")
+        return v
+
+
 
 # noinspection PyMethodParameters
 class DropoutMixin(Base):
```
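The interval semantics above ("data interval starts at `t0 + interval_start_minutes`") replace the old history/forecast pair, so history is now expressed as a negative start. A minimal sketch of the new validation behaviour, re-declaring the fields and validators from the hunk in a standalone pydantic v2 model (the surrounding harness is assumed, not part of the package):

```python
# Sketch: how the new TimeWindowMixin fields validate (pydantic v2 assumed).
import pydantic
from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator


class TimeWindowMixin(BaseModel):
    time_resolution_minutes: int = Field(..., gt=0)
    interval_start_minutes: int = Field(...)
    interval_end_minutes: int = Field(...)

    @model_validator(mode="after")
    def check_interval_range(self):
        # The interval must be non-empty and ordered
        if self.interval_start_minutes > self.interval_end_minutes:
            raise ValueError("interval_start_minutes must be <= interval_end_minutes")
        return self

    @field_validator("interval_start_minutes", "interval_end_minutes")
    def divisible_by_resolution(cls, v: int, info: ValidationInfo) -> int:
        # Both interval bounds must land on whole timesteps
        if v % info.data["time_resolution_minutes"] != 0:
            raise ValueError("intervals must be divisible by time_resolution_minutes")
        return v


# A 2-hour history window and 8-hour forecast window at 30-minute resolution
ok = TimeWindowMixin(
    time_resolution_minutes=30, interval_start_minutes=-120, interval_end_minutes=480
)

# -45 is not divisible by 30, so this raises a ValidationError
try:
    TimeWindowMixin(
        time_resolution_minutes=30, interval_start_minutes=-45, interval_end_minutes=480
    )
except pydantic.ValidationError as e:
    print(e)
```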
```diff
@@ -65,7 +90,12 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
 
-    dropout_fraction: float = Field(0, description="Chance of dropout being applied to each sample")
+    dropout_fraction: float = Field(
+        default=0,
+        description="Chance of dropout being applied to each sample",
+        ge=0,
+        le=1,
+    )
 
     @field_validator("dropout_timedeltas_minutes")
     def dropout_timedeltas_minutes_negative(cls, v: List[int]) -> List[int]:
```
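Moving the bounds into `Field(ge=0, le=1)` lets pydantic enforce them declaratively, which is why the assert-based `dropout_fraction_valid` validator disappears in the next hunk. A small standalone sketch (only the field declaration comes from the diff):

```python
# Sketch: declarative bounds on dropout_fraction via Field(ge=0, le=1).
import pydantic
from pydantic import BaseModel, Field


class DropoutFraction(BaseModel):
    dropout_fraction: float = Field(default=0, ge=0, le=1)


print(DropoutFraction().dropout_fraction)              # 0 — the default
print(DropoutFraction(dropout_fraction=0.3).dropout_fraction)

try:
    DropoutFraction(dropout_fraction=1.5)              # rejected by le=1
except pydantic.ValidationError as e:
    print(e.errors()[0]["type"])                       # 'less_than_equal'
```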
```diff
@@ -75,12 +105,6 @@ class DropoutMixin(Base):
             assert m <= 0, "Dropout timedeltas must be negative"
         return v
 
-    @field_validator("dropout_fraction")
-    def dropout_fraction_valid(cls, v: float) -> float:
-        """Validate 'dropout_fraction'"""
-        assert 0 <= v <= 1, "Dropout fraction must be between 0 and 1"
-        return v
-
     @model_validator(mode="after")
     def dropout_instructions_consistent(self) -> Self:
         if self.dropout_fraction == 0:
```
```diff
@@ -92,93 +116,51 @@ class DropoutMixin(Base):
         return self
 
 
-
-class TimeResolutionMixin(Base):
-    """Time resolution mix in"""
+class SpatialWindowMixin(Base):
+    """Mixin class, to add path and image size"""
 
-    time_resolution_minutes: int = Field(
+    image_size_pixels_height: int = Field(
         ...,
-        description="The temporal resolution of the data in minutes",
+        ge=0,
+        description="The number of pixels of the height of the region of interest",
     )
 
-
-class Site(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
-    """Site configuration model"""
-
-    file_path: str = Field(
+    image_size_pixels_width: int = Field(
         ...,
-        description="The NetCDF files holding the power timeseries.",
-    )
-    metadata_file_path: str = Field(
-        ...,
-        description="The CSV files describing power system",
+        ge=0,
+        description="The number of pixels of the width of the region of interest",
     )
 
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        """Check forecast length requested will give stable number of timesteps"""
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        """Check history length requested will give stable number of timesteps"""
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    # TODO validate the netcdf for sites
-    # TODO validate the csv for metadata
 
-class Satellite(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class Satellite(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
     """Satellite configuration model"""
-
-
-    satellite_zarr_path: str | tuple[str] | list[str] = Field(
-        ...,
-        description="The path or list of paths which hold the satellite zarr",
-    )
-    satellite_channels: list[str] = Field(
-        ..., description="the satellite channels that are used"
-    )
-    satellite_image_size_pixels_height: int = Field(
+
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The number of pixels of the height of the region of interest"
-        " for non-HRV satellite channels.",
+        description="The path or list of paths which hold the data zarr",
     )
 
-    satellite_image_size_pixels_width: int = Field(
-        ...,
-        description="The number of pixels of the width of the region "
-        "of interest for non-HRV satellite channels.",
-    )
-
-    live_delay_minutes: int = Field(
-        ..., description="The expected delay in minutes of the satellite data"
+    channels: list[str] = Field(
+        ..., description="the satellite channels that are used"
     )
 
 
 # noinspection PyMethodParameters
-class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class NWP(TimeWindowMixin, DropoutMixin, SpatialWindowMixin):
     """NWP configuration model"""
-
-    nwp_zarr_path: str | tuple[str] | list[str] = Field(
+
+    zarr_path: str | tuple[str] | list[str] = Field(
         ...,
-        description="The path which holds the NWP zarr",
+        description="The path or list of paths which hold the data zarr",
     )
-    nwp_channels: list[str] = Field(
+
+    channels: list[str] = Field(
         ..., description="the channels used in the nwp data"
     )
-    nwp_accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
-    nwp_image_size_pixels_height: int = Field(..., description="The size of NWP spacial crop in pixels")
-    nwp_image_size_pixels_width: int = Field(..., description="The size of NWP spacial crop in pixels")
 
-    nwp_provider: str = Field(..., description="The provider of the NWP data")
+    provider: str = Field(..., description="The provider of the NWP data")
+
+    accum_channels: list[str] = Field([], description="the nwp channels which need to be diffed")
 
     max_staleness_minutes: Optional[int] = Field(
         None,
```
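The source-specific prefixes (`satellite_*`, `nwp_*`) are gone, so configs written against 0.0.25 need their keys renamed. A hedged sketch of constructing the new `Satellite` model (the import path matches this package's layout; the values and channel names are illustrative only):

```python
# Sketch: the renamed Satellite fields (values illustrative, not validated here).
from ocf_data_sampler.config.model import Satellite

sat = Satellite(
    zarr_path="satellite.zarr",        # was: satellite_zarr_path
    channels=["IR_016", "VIS006"],     # was: satellite_channels
    image_size_pixels_height=24,       # was: satellite_image_size_pixels_height
    image_size_pixels_width=24,        # was: satellite_image_size_pixels_width
    time_resolution_minutes=5,
    interval_start_minutes=-60,        # replaces history_minutes=60
    interval_end_minutes=0,            # note: live_delay_minutes was dropped entirely
)
```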
```diff
@@ -188,32 +170,15 @@ class NWP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
     )
 
 
-    @field_validator("nwp_provider")
-    def validate_provider(cls, v: str) -> str:
-        """Validate 'nwp_provider'"""
+    @field_validator("provider")
+    def validate_provider(cls, v: str) -> str:
+        """Validate 'provider'"""
         if v.lower() not in NWP_PROVIDERS:
             message = f"NWP provider {v} is not in {NWP_PROVIDERS}"
             logger.warning(message)
             raise Exception(message)
         return v
 
-    # Todo: put into time mixin when moving intervals there
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
-
 
 class MultiNWP(RootModel):
     """Configuration for multiple NWPs"""
```
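A short sketch of the renamed validator in use. `NWP_PROVIDERS` comes from `ocf_data_sampler.constants`; judging by the providers shipped in this release (`ecmwf.py`, `ukv.py`), it contains at least those two names, though that is an assumption:

```python
# Sketch: provider validation against NWP_PROVIDERS (contents assumed; values illustrative).
from ocf_data_sampler.config.model import NWP

nwp = NWP(
    zarr_path="ukv.zarr",
    channels=["t", "dswrf"],
    provider="ukv",                    # checked (lower-cased) against NWP_PROVIDERS
    image_size_pixels_height=24,
    image_size_pixels_width=24,
    time_resolution_minutes=60,
    interval_start_minutes=-120,
    interval_end_minutes=480,
)

# provider="foo" would raise: Exception("NWP provider foo is not in ...")
```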
```diff
@@ -241,34 +206,32 @@ class MultiNWP(RootModel):
         return self.root.items()
 
 
-
-class GSP(DataSourceMixin, TimeResolutionMixin, DropoutMixin):
+class GSP(TimeWindowMixin, DropoutMixin):
     """GSP configuration model"""
 
-    gsp_zarr_path: str = Field(..., description="The path which holds the GSP zarr")
+    zarr_path: str = Field(..., description="The path which holds the GSP zarr")
 
-    @field_validator("forecast_minutes")
-    def forecast_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "Forecast duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
 
-    @field_validator("history_minutes")
-    def history_minutes_divide_by_time_resolution(cls, v: int, info: ValidationInfo) -> int:
-        if v % info.data["time_resolution_minutes"] != 0:
-            message = "History duration must be divisible by time resolution"
-            logger.error(message)
-            raise Exception(message)
-        return v
+class Site(TimeWindowMixin, DropoutMixin):
+    """Site configuration model"""
+
+    file_path: str = Field(
+        ...,
+        description="The NetCDF files holding the power timeseries.",
+    )
+    metadata_file_path: str = Field(
+        ...,
+        description="The CSV files describing power system",
+    )
+
+    # TODO validate the netcdf for sites
+    # TODO validate the csv for metadata
+
 
 
 # noinspection PyPep8Naming
 class InputData(Base):
-    """
-    Input data model.
-    """
+    """Input data model"""
 
     satellite: Optional[Satellite] = None
     nwp: Optional[MultiNWP] = None
```
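Putting the reshaped models together, a minimal sketch of a full `Configuration` under the new schema, assuming `InputData` also exposes `gsp` and `site` fields alongside `satellite` and `nwp` (as the loader code further down suggests); paths and values are illustrative:

```python
# Sketch: assembling the new-style Configuration (values illustrative).
from ocf_data_sampler.config.model import GSP, Configuration, InputData, Site

config = Configuration(
    input_data=InputData(
        gsp=GSP(
            zarr_path="gsp.zarr",
            time_resolution_minutes=30,
            interval_start_minutes=-120,
            interval_end_minutes=480,
        ),
        site=Site(
            file_path="sites.nc",
            metadata_file_path="site_metadata.csv",
            time_resolution_minutes=15,
            interval_start_minutes=-60,
            interval_end_minutes=480,
        ),
    )
)
print(config.general.name)  # "example" — General() defaults apply
```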
```diff
@@ -280,4 +243,4 @@ class Configuration(Base):
     """Configuration model for the dataset"""
 
     general: General = General()
-    input_data: InputData = InputData()
\ No newline at end of file
+    input_data: InputData = InputData()
```
ocf_data_sampler/load/load_dataset.py:

```diff
@@ -20,8 +20,8 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
     datasets_dict = {}
 
     # Load GSP data unless the path is None
-    if in_config.gsp and in_config.gsp.gsp_zarr_path:
-        da_gsp = open_gsp(zarr_path=in_config.gsp.gsp_zarr_path).compute()
+    if in_config.gsp and in_config.gsp.zarr_path:
+        da_gsp = open_gsp(zarr_path=in_config.gsp.zarr_path).compute()
 
         # Remove national GSP
         datasets_dict["gsp"] = da_gsp.sel(gsp_id=slice(1, None))
```
```diff
@@ -32,9 +32,9 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
         datasets_dict["nwp"] = {}
         for nwp_source, nwp_config in in_config.nwp.items():
 
-            da_nwp = open_nwp(nwp_config.nwp_zarr_path, provider=nwp_config.nwp_provider)
+            da_nwp = open_nwp(nwp_config.zarr_path, provider=nwp_config.provider)
 
-            da_nwp = da_nwp.sel(channel=list(nwp_config.nwp_channels))
+            da_nwp = da_nwp.sel(channel=list(nwp_config.channels))
 
             datasets_dict["nwp"][nwp_source] = da_nwp
 
```
```diff
@@ -42,9 +42,9 @@ def get_dataset_dict(config: Configuration) -> dict[str, dict[xr.DataArray]]:
     if in_config.satellite:
         sat_config = config.input_data.satellite
 
-        da_sat = open_sat_data(sat_config.satellite_zarr_path)
+        da_sat = open_sat_data(sat_config.zarr_path)
 
-        da_sat = da_sat.sel(channel=list(sat_config.satellite_channels))
+        da_sat = da_sat.sel(channel=list(sat_config.channels))
 
         datasets_dict["sat"] = da_sat
 
```
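The loader simply follows the renamed config attributes, so callers are unchanged. A sketch of how the pieces connect; `load_yaml_configuration` is an assumed helper in `ocf_data_sampler.config.load`, and the path is illustrative:

```python
# Sketch: loading a config and building the dataset dict (helper name assumed).
from ocf_data_sampler.config.load import load_yaml_configuration
from ocf_data_sampler.load.load_dataset import get_dataset_dict

config = load_yaml_configuration("config.yaml")  # path illustrative
datasets_dict = get_dataset_dict(config)
# Keys mirror the configured sources, e.g. "gsp", "nwp" (one entry per provider), "sat"
```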
ocf_data_sampler/select/find_contiguous_time_periods.py:

```diff
@@ -63,16 +63,16 @@ def find_contiguous_time_periods(
 
 def trim_contiguous_time_periods(
     contiguous_time_periods: pd.DataFrame,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
+    interval_start: pd.Timedelta,
+    interval_end: pd.Timedelta,
 ) -> pd.DataFrame:
     """Trim the contiguous time periods to allow for history and forecast durations.
 
     Args:
         contiguous_time_periods: DataFrame where each row represents a single time period. The
             DataFrame must have `start_dt` and `end_dt` columns.
-        history_duration: Length of the historical slice used for a sample
-        forecast_duration: Length of the forecast slice used for a sample
+        interval_start: The start of the interval with respect to t0
+        interval_end: The end of the interval with respect to t0
 
 
     Returns:
```
```diff
@@ -80,8 +80,8 @@ def trim_contiguous_time_periods(
     """
     contiguous_time_periods = contiguous_time_periods.copy()
 
-    contiguous_time_periods["start_dt"] += history_duration
-    contiguous_time_periods["end_dt"] -= forecast_duration
+    contiguous_time_periods["start_dt"] -= interval_start
+    contiguous_time_periods["end_dt"] -= interval_end
 
     valid_mask = contiguous_time_periods["start_dt"] <= contiguous_time_periods["end_dt"]
     contiguous_time_periods = contiguous_time_periods.loc[valid_mask]
```
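A worked sketch of the new trimming arithmetic. Note that `interval_start` is typically negative (history before t0), so subtracting it moves `start_dt` later, while subtracting a positive `interval_end` pulls `end_dt` earlier; values below are illustrative:

```python
# Sketch: trimming a contiguous period so every t0 inside it supports the full interval.
import pandas as pd

periods = pd.DataFrame(
    {
        "start_dt": [pd.Timestamp("2024-01-01 00:00")],
        "end_dt": [pd.Timestamp("2024-01-01 12:00")],
    }
)

interval_start = pd.Timedelta(minutes=-120)  # 2h of history before t0
interval_end = pd.Timedelta(minutes=480)     # 8h of forecast after t0

periods["start_dt"] -= interval_start        # 00:00 -> 02:00
periods["end_dt"] -= interval_end            # 12:00 -> 04:00
print(periods)                               # valid t0 range: 02:00 to 04:00
```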
```diff
@@ -92,16 +92,16 @@ def trim_contiguous_time_periods(
 
 def find_contiguous_t0_periods(
     datetimes: pd.DatetimeIndex,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
+    interval_start: pd.Timedelta,
+    interval_end: pd.Timedelta,
     sample_period_duration: pd.Timedelta,
 ) -> pd.DataFrame:
     """Return a pd.DataFrame where each row records the boundary of a contiguous time period.
 
     Args:
         datetimes: pd.DatetimeIndex. Must be sorted.
-        history_duration: Length of the historical slice used for a sample
-        forecast_duration: Length of the forecast slice used for a sample
+        interval_start: The start of the interval with respect to t0
+        interval_end: The end of the interval with respect to t0
         sample_period_duration: The sample frequency of the timeseries
 
 
```
```diff
@@ -109,7 +109,7 @@ def find_contiguous_t0_periods(
         pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
-    total_duration = history_duration + forecast_duration
+    total_duration = interval_end - interval_start
 
     contiguous_time_periods = find_contiguous_time_periods(
         datetimes=datetimes,
```
```diff
@@ -119,8 +119,8 @@ def find_contiguous_t0_periods(
 
     contiguous_t0_periods = trim_contiguous_time_periods(
         contiguous_time_periods=contiguous_time_periods,
-        history_duration=history_duration,
-        forecast_duration=forecast_duration,
+        interval_start=interval_start,
+        interval_end=interval_end,
     )
 
     assert len(contiguous_t0_periods) > 0
```
```diff
@@ -128,92 +128,57 @@ def find_contiguous_t0_periods(
     return contiguous_t0_periods
 
 
-def _find_contiguous_t0_periods_nwp(
-    ds,
-    history_duration: pd.Timedelta,
-    forecast_duration: pd.Timedelta,
-    max_staleness: pd.Timedelta | None = None,
-    max_dropout: pd.Timedelta = pd.Timedelta(0),
-    time_dim: str = "init_time_utc",
-    end_buffer: pd.Timedelta = pd.Timedelta(0),
-):
-
-    assert "step" in ds.coords
-    # It is possible to use up to this amount of max staleness for the dataset and slice
-    # required
-    possible_max_staleness = (
-        pd.Timedelta(ds["step"].max().item())
-        - forecast_duration
-        - end_buffer
-    )
-
-    # If max_staleness is set to None we set it based on the max step ahead of the input
-    # forecast data
-    if max_staleness is None:
-        max_staleness = possible_max_staleness
-    else:
-        # Make sure the max acceptable staleness isn't longer than the max possible
-        assert max_staleness <= possible_max_staleness
-        max_staleness = max_staleness
-
-    contiguous_time_periods = find_contiguous_t0_periods_nwp(
-        datetimes=pd.DatetimeIndex(ds[time_dim]),
-        history_duration=history_duration,
-        max_staleness=max_staleness,
-        max_dropout=max_dropout,
-    )
-    return contiguous_time_periods
-
-
-
 def find_contiguous_t0_periods_nwp(
-    datetimes: pd.DatetimeIndex,
-    history_duration: pd.Timedelta,
+    init_times: pd.DatetimeIndex,
+    interval_start: pd.Timedelta,
     max_staleness: pd.Timedelta,
     max_dropout: pd.Timedelta = pd.Timedelta(0),
+    first_forecast_step: pd.Timedelta = pd.Timedelta(0),
+
 ) -> pd.DataFrame:
     """Get all time periods from the NWP init times which are valid as t0 datetimes.
 
     Args:
-        datetimes: The init times of the forecast data
-        history_duration: Length of the historical slice used for a sample
-        max_staleness: Up to how long after an init time are we willing to use the forecast. Each
-            init time will only be used up to this t0 time regardless of the forecast
-            valid time.
+        init_times: The initialisation times of the available forecasts
+        interval_start: The start of the desired data interval with respect to t0
+        max_staleness: Up to how long after an init time are we willing to use the forecast. Each
+            init time will only be used up to this t0 time regardless of the forecast valid time.
         max_dropout: What is the maximum amount of dropout that will be used. This must be <=
             max_staleness.
+        first_forecast_step: The timedelta of the first step of the forecast. By default we assume
+            the first valid time of the forecast is the same as its init time.
 
     Returns:
         pd.DataFrame where each row represents a single time period. The pd.DataFrame
         has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
     """
     # Sanity checks.
-    assert len(datetimes) > 0
-    assert datetimes.is_monotonic_increasing
-    assert datetimes.is_unique
-    assert history_duration >= pd.Timedelta(0)
+    assert len(init_times) > 0
+    assert init_times.is_monotonic_increasing
+    assert init_times.is_unique
     assert max_staleness >= pd.Timedelta(0)
-    assert max_dropout <= max_staleness
+    assert pd.Timedelta(0) <= max_dropout <= max_staleness
 
-    hist_drop_buffer = max(history_duration, max_dropout)
+    hist_drop_buffer = max(first_forecast_step-interval_start, max_dropout)
 
     # Store contiguous periods
    contiguous_periods = []
 
-    #
-    start_this_period = datetimes[0] + hist_drop_buffer
+    # Begin the first period allowing for the time to the first_forecast_step, the length of the
+    # interval sampled from before t0, and the dropout
+    start_this_period = init_times[0] + hist_drop_buffer
 
     # The first forecast is valid up to the max staleness
-    end_this_period = datetimes[0] + max_staleness
-
-    for dt_init in datetimes[1:]:
-        # If the previous init time becomes stale before the next init becomes valid whilst also
-        # considering dropout
-        #
-        if end_this_period < dt_init + max_dropout:
+    end_this_period = init_times[0] + max_staleness
+
+    for dt_init in init_times[1:]:
+        # If the previous init time becomes stale before the next init becomes valid (whilst also
+        # considering dropout) then the contiguous period breaks
+        # Else if the previous init time becomes stale before the fist step of the next forecast
+        # then this also causes a break in the contiguous period
+        if (end_this_period < dt_init + max(max_dropout, first_forecast_step)):
             contiguous_periods.append([start_this_period, end_this_period])
 
-            # And start a new period
+            # The new period begins with the same conditions as the first period
             start_this_period = dt_init + hist_drop_buffer
         end_this_period = dt_init + max_staleness
 
```
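A worked sketch of the staleness logic above with three-hourly init times. Assumptions: 6h max staleness, 1h of history before t0, no dropout, and the first forecast step at the init time itself; all values are illustrative:

```python
# Sketch: contiguous t0 periods from 3-hourly init times with 6h max staleness.
import pandas as pd

init_times = pd.date_range("2024-01-01 00:00", "2024-01-01 09:00", freq="3h")
interval_start = pd.Timedelta(minutes=-60)       # 1h of history needed before t0
max_staleness = pd.Timedelta(hours=6)
max_dropout = pd.Timedelta(0)
first_forecast_step = pd.Timedelta(0)

# How far after an init time the earliest usable t0 sits
hist_drop_buffer = max(first_forecast_step - interval_start, max_dropout)

start = init_times[0] + hist_drop_buffer         # 01:00 — need 1h of data behind t0
end = init_times[0] + max_staleness              # 06:00 from the first init alone

for dt_init in init_times[1:]:
    if end < dt_init + max(max_dropout, first_forecast_step):
        break                                    # a gap would end the period here
    end = dt_init + max_staleness                # each new init extends the period

print(start, end)                                # 01:00 .. 15:00 — one contiguous period
```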