ocf-data-sampler 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ocf-data-sampler might be problematic. Click here for more details.
- ocf_data_sampler/config/load.py +3 -3
- ocf_data_sampler/config/model.py +146 -64
- ocf_data_sampler/config/save.py +5 -4
- ocf_data_sampler/load/gsp.py +6 -5
- ocf_data_sampler/load/load_dataset.py +5 -6
- ocf_data_sampler/load/nwp/nwp.py +17 -5
- ocf_data_sampler/load/nwp/providers/ecmwf.py +6 -7
- ocf_data_sampler/load/nwp/providers/gfs.py +36 -0
- ocf_data_sampler/load/nwp/providers/icon.py +46 -0
- ocf_data_sampler/load/nwp/providers/ukv.py +4 -5
- ocf_data_sampler/load/nwp/providers/utils.py +3 -1
- ocf_data_sampler/load/satellite.py +9 -10
- ocf_data_sampler/load/site.py +10 -6
- ocf_data_sampler/load/utils.py +21 -16
- ocf_data_sampler/numpy_sample/collate.py +10 -9
- ocf_data_sampler/numpy_sample/datetime_features.py +3 -5
- ocf_data_sampler/numpy_sample/gsp.py +12 -14
- ocf_data_sampler/numpy_sample/nwp.py +12 -12
- ocf_data_sampler/numpy_sample/satellite.py +9 -9
- ocf_data_sampler/numpy_sample/site.py +5 -8
- ocf_data_sampler/numpy_sample/sun_position.py +16 -21
- ocf_data_sampler/sample/base.py +15 -17
- ocf_data_sampler/sample/site.py +13 -20
- ocf_data_sampler/sample/uk_regional.py +29 -35
- ocf_data_sampler/select/dropout.py +16 -14
- ocf_data_sampler/select/fill_time_periods.py +15 -5
- ocf_data_sampler/select/find_contiguous_time_periods.py +88 -75
- ocf_data_sampler/select/geospatial.py +63 -54
- ocf_data_sampler/select/location.py +16 -51
- ocf_data_sampler/select/select_spatial_slice.py +105 -89
- ocf_data_sampler/select/select_time_slice.py +71 -58
- ocf_data_sampler/select/spatial_slice_for_dataset.py +7 -6
- ocf_data_sampler/select/time_slice_for_dataset.py +17 -16
- ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +140 -131
- ocf_data_sampler/torch_datasets/datasets/site.py +152 -112
- ocf_data_sampler/torch_datasets/utils/__init__.py +3 -0
- ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py +11 -0
- ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +6 -2
- ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +23 -22
- ocf_data_sampler/utils.py +3 -1
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/METADATA +7 -18
- ocf_data_sampler-0.1.17.dist-info/RECORD +56 -0
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/WHEEL +1 -1
- {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/top_level.txt +1 -1
- scripts/refactor_site.py +63 -33
- utils/compute_icon_mean_stddev.py +72 -0
- ocf_data_sampler/constants.py +0 -222
- ocf_data_sampler/torch_datasets/utils/validate_channels.py +0 -82
- ocf_data_sampler-0.1.11.dist-info/LICENSE +0 -21
- ocf_data_sampler-0.1.11.dist-info/RECORD +0 -82
- tests/__init__.py +0 -0
- tests/config/test_config.py +0 -113
- tests/config/test_load.py +0 -7
- tests/config/test_save.py +0 -28
- tests/conftest.py +0 -319
- tests/load/test_load_gsp.py +0 -15
- tests/load/test_load_nwp.py +0 -21
- tests/load/test_load_satellite.py +0 -17
- tests/load/test_load_sites.py +0 -14
- tests/numpy_sample/test_collate.py +0 -21
- tests/numpy_sample/test_datetime_features.py +0 -37
- tests/numpy_sample/test_gsp.py +0 -38
- tests/numpy_sample/test_nwp.py +0 -13
- tests/numpy_sample/test_satellite.py +0 -40
- tests/numpy_sample/test_sun_position.py +0 -81
- tests/select/test_dropout.py +0 -69
- tests/select/test_fill_time_periods.py +0 -28
- tests/select/test_find_contiguous_time_periods.py +0 -202
- tests/select/test_location.py +0 -67
- tests/select/test_select_spatial_slice.py +0 -154
- tests/select/test_select_time_slice.py +0 -275
- tests/test_sample/test_base.py +0 -164
- tests/test_sample/test_site_sample.py +0 -165
- tests/test_sample/test_uk_regional_sample.py +0 -136
- tests/torch_datasets/test_merge_and_fill_utils.py +0 -40
- tests/torch_datasets/test_pvnet_uk.py +0 -154
- tests/torch_datasets/test_site.py +0 -226
- tests/torch_datasets/test_validate_channels_utils.py +0 -78
|
@@ -1,34 +1,31 @@
|
|
|
1
|
+
"""Functions pertaining to finding valid time periods for the input data."""
|
|
2
|
+
|
|
1
3
|
import numpy as np
|
|
2
4
|
import pandas as pd
|
|
3
5
|
|
|
4
6
|
from ocf_data_sampler.config import Configuration
|
|
5
7
|
from ocf_data_sampler.select.find_contiguous_time_periods import (
|
|
8
|
+
find_contiguous_t0_periods,
|
|
6
9
|
find_contiguous_t0_periods_nwp,
|
|
7
|
-
find_contiguous_t0_periods,
|
|
8
10
|
intersection_of_multiple_dataframes_of_periods,
|
|
9
11
|
)
|
|
10
12
|
from ocf_data_sampler.utils import minutes
|
|
11
13
|
|
|
12
14
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
datasets_dict: dict,
|
|
16
|
-
config: Configuration,
|
|
17
|
-
):
|
|
18
|
-
"""Find the t0 times where all of the requested input data is available
|
|
15
|
+
def find_valid_time_periods(datasets_dict: dict, config: Configuration) -> pd.DataFrame:
|
|
16
|
+
"""Find the t0 times where all of the requested input data is available.
|
|
19
17
|
|
|
20
18
|
Args:
|
|
21
19
|
datasets_dict: A dictionary of input datasets
|
|
22
20
|
config: Configuration file
|
|
23
21
|
"""
|
|
22
|
+
if not set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"}):
|
|
23
|
+
raise ValueError(f"Invalid keys in datasets_dict: {datasets_dict.keys()}")
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
contiguous_time_periods: dict[str: pd.DataFrame] = {} # Used to store contiguous time periods from each data source
|
|
28
|
-
|
|
25
|
+
# Used to store contiguous time periods from each data source
|
|
26
|
+
contiguous_time_periods: dict[str : pd.DataFrame] = {}
|
|
29
27
|
if "nwp" in datasets_dict:
|
|
30
28
|
for nwp_key, nwp_config in config.input_data.nwp.items():
|
|
31
|
-
|
|
32
29
|
da = datasets_dict["nwp"][nwp_key]
|
|
33
30
|
|
|
34
31
|
if nwp_config.dropout_timedeltas_minutes is None:
|
|
@@ -59,8 +56,12 @@ def find_valid_time_periods(
|
|
|
59
56
|
max_staleness = max_possible_staleness
|
|
60
57
|
else:
|
|
61
58
|
# Make sure the max acceptable staleness isn't longer than the max possible
|
|
62
|
-
|
|
63
|
-
|
|
59
|
+
if max_staleness > max_possible_staleness:
|
|
60
|
+
raise ValueError(
|
|
61
|
+
f"max_staleness_minutes is too long for the input data, "
|
|
62
|
+
f"{max_staleness=}, {max_possible_staleness=}",
|
|
63
|
+
)
|
|
64
|
+
|
|
64
65
|
# Find the first forecast step
|
|
65
66
|
first_forecast_step = pd.Timedelta(da["step"].min().item())
|
|
66
67
|
|
|
@@ -69,34 +70,34 @@ def find_valid_time_periods(
|
|
|
69
70
|
interval_start=minutes(nwp_config.interval_start_minutes),
|
|
70
71
|
max_staleness=max_staleness,
|
|
71
72
|
max_dropout=max_dropout,
|
|
72
|
-
first_forecast_step
|
|
73
|
+
first_forecast_step=first_forecast_step,
|
|
73
74
|
)
|
|
74
75
|
|
|
75
|
-
contiguous_time_periods[f
|
|
76
|
+
contiguous_time_periods[f"nwp_{nwp_key}"] = time_periods
|
|
76
77
|
|
|
77
78
|
if "sat" in datasets_dict:
|
|
78
79
|
sat_config = config.input_data.satellite
|
|
79
80
|
|
|
80
81
|
time_periods = find_contiguous_t0_periods(
|
|
81
82
|
pd.DatetimeIndex(datasets_dict["sat"]["time_utc"]),
|
|
82
|
-
|
|
83
|
+
time_resolution=minutes(sat_config.time_resolution_minutes),
|
|
83
84
|
interval_start=minutes(sat_config.interval_start_minutes),
|
|
84
85
|
interval_end=minutes(sat_config.interval_end_minutes),
|
|
85
86
|
)
|
|
86
87
|
|
|
87
|
-
contiguous_time_periods[
|
|
88
|
+
contiguous_time_periods["sat"] = time_periods
|
|
88
89
|
|
|
89
90
|
if "gsp" in datasets_dict:
|
|
90
91
|
gsp_config = config.input_data.gsp
|
|
91
92
|
|
|
92
93
|
time_periods = find_contiguous_t0_periods(
|
|
93
94
|
pd.DatetimeIndex(datasets_dict["gsp"]["time_utc"]),
|
|
94
|
-
|
|
95
|
+
time_resolution=minutes(gsp_config.time_resolution_minutes),
|
|
95
96
|
interval_start=minutes(gsp_config.interval_start_minutes),
|
|
96
97
|
interval_end=minutes(gsp_config.interval_end_minutes),
|
|
97
98
|
)
|
|
98
99
|
|
|
99
|
-
contiguous_time_periods[
|
|
100
|
+
contiguous_time_periods["gsp"] = time_periods
|
|
100
101
|
|
|
101
102
|
# just get the values (not the keys)
|
|
102
103
|
contiguous_time_periods_values = list(contiguous_time_periods.values())
|
|
@@ -104,7 +105,7 @@ def find_valid_time_periods(
|
|
|
104
105
|
# Find joint overlapping contiguous time periods
|
|
105
106
|
if len(contiguous_time_periods_values) > 1:
|
|
106
107
|
valid_time_periods = intersection_of_multiple_dataframes_of_periods(
|
|
107
|
-
contiguous_time_periods_values
|
|
108
|
+
contiguous_time_periods_values,
|
|
108
109
|
)
|
|
109
110
|
else:
|
|
110
111
|
valid_time_periods = contiguous_time_periods_values[0]
|
|
@@ -113,4 +114,4 @@ def find_valid_time_periods(
|
|
|
113
114
|
if len(valid_time_periods) == 0:
|
|
114
115
|
raise ValueError(f"No valid time periods found, {contiguous_time_periods=}")
|
|
115
116
|
|
|
116
|
-
return valid_time_periods
|
|
117
|
+
return valid_time_periods
|
ocf_data_sampler/utils.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
|
-
Name:
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
|
|
5
|
-
Author:
|
|
6
|
-
Author-email: info@openclimatefix.org
|
|
7
|
-
Maintainer: Open Climate Fix Ltd
|
|
2
|
+
Name: ocf-data-sampler
|
|
3
|
+
Version: 0.1.17
|
|
4
|
+
Author: James Fulton, Peter Dudfield
|
|
5
|
+
Author-email: Open Climate Fix team <info@openclimatefix.org>
|
|
8
6
|
License: MIT License
|
|
9
7
|
|
|
10
8
|
Copyright (c) 2023 Open Climate Fix
|
|
@@ -27,21 +25,18 @@ License: MIT License
|
|
|
27
25
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
26
|
SOFTWARE.
|
|
29
27
|
|
|
30
|
-
Project-URL: homepage, https://github.com/openclimatefix
|
|
31
28
|
Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
|
|
32
|
-
|
|
29
|
+
Classifier: Programming Language :: Python :: 3
|
|
33
30
|
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
-
|
|
35
|
-
Classifier: Operating System :: POSIX :: Linux
|
|
36
|
-
Requires-Python: >=3.8
|
|
31
|
+
Requires-Python: >=3.10
|
|
37
32
|
Description-Content-Type: text/markdown
|
|
38
|
-
License-File: LICENSE
|
|
39
33
|
Requires-Dist: torch
|
|
40
34
|
Requires-Dist: numpy
|
|
41
35
|
Requires-Dist: pandas
|
|
42
36
|
Requires-Dist: xarray
|
|
43
37
|
Requires-Dist: zarr==2.18.3
|
|
44
38
|
Requires-Dist: dask
|
|
39
|
+
Requires-Dist: matplotlib
|
|
45
40
|
Requires-Dist: ocf_blosc2
|
|
46
41
|
Requires-Dist: pvlib
|
|
47
42
|
Requires-Dist: pydantic
|
|
@@ -50,11 +45,6 @@ Requires-Dist: pathy
|
|
|
50
45
|
Requires-Dist: pyaml_env
|
|
51
46
|
Requires-Dist: pyresample
|
|
52
47
|
Requires-Dist: h5netcdf
|
|
53
|
-
Provides-Extra: docs
|
|
54
|
-
Requires-Dist: mkdocs>=1.2; extra == "docs"
|
|
55
|
-
Requires-Dist: mkdocs-material>=8.0; extra == "docs"
|
|
56
|
-
Provides-Extra: plot
|
|
57
|
-
Requires-Dist: matplotlib; extra == "plot"
|
|
58
48
|
|
|
59
49
|
# ocf-data-sampler
|
|
60
50
|
|
|
@@ -77,7 +67,6 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
|
|
|
77
67
|
|
|
78
68
|
**ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
|
|
79
69
|
|
|
80
|
-
|
|
81
70
|
## FAQ
|
|
82
71
|
|
|
83
72
|
If you have any questions about this or any other of our repos, don't hesitate to hop to our [Discussions Page](https://github.com/orgs/openclimatefix/discussions)!
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
2
|
+
ocf_data_sampler/utils.py,sha256=DjuneGGisl08ENvPZV_lrcX4b2NCKJC1ZpXgIpxuQi4,290
|
|
3
|
+
ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
|
|
4
|
+
ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
|
|
5
|
+
ocf_data_sampler/config/model.py,sha256=y8maV_1z0LL_m0J607ka_yJ0KI-0ssYDn5Ghk8aNgR0,10189
|
|
6
|
+
ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
|
|
7
|
+
ocf_data_sampler/data/uk_gsp_locations.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
|
|
8
|
+
ocf_data_sampler/load/__init__.py,sha256=T5Zj1PGt0aiiNEN7Ra1Ac-cBsNKhphmmHy_8g7XU_w0,219
|
|
9
|
+
ocf_data_sampler/load/gsp.py,sha256=keB3Nv_CNK1P6pS9Kdfc8PoZXTI1_YFN-spsvEv_Ewc,899
|
|
10
|
+
ocf_data_sampler/load/load_dataset.py,sha256=0NyDxCDfgE_esKVW3s-rZEe16WB30FQ74ClWlrIo72M,1602
|
|
11
|
+
ocf_data_sampler/load/satellite.py,sha256=E7Ln7Y60Qr1RTV-_R71YoxXQM-Ca7Y1faIo3oKB2eFk,2292
|
|
12
|
+
ocf_data_sampler/load/site.py,sha256=zOzlWk6pYZBB5daqG8URGksmDXWKrkutUvN8uALAIh8,1468
|
|
13
|
+
ocf_data_sampler/load/utils.py,sha256=Jwbr1rpEa3cefjw-OTVRaxnIHyGixYB3TlTlta0BOdU,1727
|
|
14
|
+
ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
|
|
15
|
+
ocf_data_sampler/load/nwp/nwp.py,sha256=0AIHQTJLUtwP2Toz_PskOTYFJXfMvGhk8faAcNvI9jk,922
|
|
16
|
+
ocf_data_sampler/load/nwp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
+
ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=5AzktPJgertCx6oij6aePRosPuZHGFznMxTgtkk_mgc,994
|
|
18
|
+
ocf_data_sampler/load/nwp/providers/gfs.py,sha256=JSDeh4YG1wibV8--P3X-zTO8LP0dsJcpFvIyglBbhi0,979
|
|
19
|
+
ocf_data_sampler/load/nwp/providers/icon.py,sha256=yYUrs5HgjU0C5pMHBB6FGn3tLjswi990IY6QCXS1Zmw,1569
|
|
20
|
+
ocf_data_sampler/load/nwp/providers/ukv.py,sha256=-0v8JCLH8ypz8GMXZ6Rrx-I0LoHuHO8sXFupbC1RpM0,1013
|
|
21
|
+
ocf_data_sampler/load/nwp/providers/utils.py,sha256=cJZ9JA4W_ZeTcLQ5z71w46_DJaPcW_2JMmBdjP9r3qs,835
|
|
22
|
+
ocf_data_sampler/numpy_sample/__init__.py,sha256=nY5C6CcuxiWZ_jrXRzWtN7WyKXhJImSiVTIG6Rz4B_4,401
|
|
23
|
+
ocf_data_sampler/numpy_sample/collate.py,sha256=I9YPcbxOwHYaDGKbzxqdV-3DFEHkzqdhAwnW7_tZH2w,1966
|
|
24
|
+
ocf_data_sampler/numpy_sample/datetime_features.py,sha256=INudxHcoB_c-GvYXe08S4Up_8TU5zOJ39PWRrTKfLp8,1203
|
|
25
|
+
ocf_data_sampler/numpy_sample/gsp.py,sha256=EDaQdOVEDBJGrXsq54UNBfpXTzi0ky_WpgBbmlyxOXM,1074
|
|
26
|
+
ocf_data_sampler/numpy_sample/nwp.py,sha256=iBGOdLMn-F5yR3juX3l4G2oXDpvGNuUdcR6ZCZkCqZk,1037
|
|
27
|
+
ocf_data_sampler/numpy_sample/satellite.py,sha256=oBlyNpO-syoyK4SSghoHqIDNyhcBqyd1L6eXSSw0k3w,1036
|
|
28
|
+
ocf_data_sampler/numpy_sample/site.py,sha256=tpX7j6dTOz2YmOFIzVYqTfWvIduKlOnBcLITsuPMgxU,1250
|
|
29
|
+
ocf_data_sampler/numpy_sample/sun_position.py,sha256=nkfgN6NmiLGoLSuJZrDsM-6nsIzJN75tWfN20Z7n8xo,1480
|
|
30
|
+
ocf_data_sampler/sample/__init__.py,sha256=zdS73NTnxFX_j8uh9tT-IXiURB6635wbneM1koWYV1o,169
|
|
31
|
+
ocf_data_sampler/sample/base.py,sha256=lnr-MNRpAxjVFJHCEvCZL86NrYy9LWnNOsLWBGDL8kc,2359
|
|
32
|
+
ocf_data_sampler/sample/site.py,sha256=4aJys40CQ-2CRKo_dgvm3rINTdfyTGWQGEaXGbh58qQ,1236
|
|
33
|
+
ocf_data_sampler/sample/uk_regional.py,sha256=uMtLdqZCsKttjFmhIC6JITzu2JDZh-VQdYUfbpyhgFM,2409
|
|
34
|
+
ocf_data_sampler/select/__init__.py,sha256=E4AJulEbO2K-o0UlG1fgaEteuf_1ZFjHTvrotXSb4YU,332
|
|
35
|
+
ocf_data_sampler/select/dropout.py,sha256=_rzXl8_4VHTY_JMjbaoWopaFCJmLdaBpqfYF4vr24tk,1638
|
|
36
|
+
ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
|
|
37
|
+
ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=cEXrQDzk8pXknxB0q3v5DakosagHMoLDAj302B8Xpw0,11537
|
|
38
|
+
ocf_data_sampler/select/geospatial.py,sha256=CDExkl36eZOKmdJPzUr_K0Wn3axHqv5nYo-EkSiINcc,5032
|
|
39
|
+
ocf_data_sampler/select/location.py,sha256=AZvGR8y62opiW7zACGXjoOtBEWRfSLOZIA73O5Deu0c,1037
|
|
40
|
+
ocf_data_sampler/select/select_spatial_slice.py,sha256=qY2Ll00EPA80oBtzwMoR5nk0UIpoWZF9oXl22YwWr0Q,12341
|
|
41
|
+
ocf_data_sampler/select/select_time_slice.py,sha256=q5QdgHPIXQb49uT5NwbOguY1GhjWc_o3c-2cDb5kLAo,5455
|
|
42
|
+
ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
|
|
43
|
+
ocf_data_sampler/select/time_slice_for_dataset.py,sha256=1DN6VsWWdLvkpJxodZtBRDUgC4vJE2td_RP5J3ZqPNw,4268
|
|
44
|
+
ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=jfJSFcR0eO1AqeH7S3KnGjsBqVZT5w3oyi784PUR6Q0,146
|
|
45
|
+
ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=ZkXm0IQEIzZUi8O-qJJz2PbJr9T4ZvutL424yRQUJhc,12878
|
|
46
|
+
ocf_data_sampler/torch_datasets/datasets/site.py,sha256=j29cWPIcksRbge014MxR0_OgJqoskdki6KqvtoHtxpY,18023
|
|
47
|
+
ocf_data_sampler/torch_datasets/utils/__init__.py,sha256=7Yt4anQVU9y27nj4Wx1tRLqbAQLbzW0ED71UL65LvxA,187
|
|
48
|
+
ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py,sha256=MGylKhXxXLQC2fYv-8L_GVoYhov3LcEwC0Q21xItDSk,353
|
|
49
|
+
ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
|
|
50
|
+
ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=LdHgLPAYUVoCRMk2nnFdsMpygGS2kbps3h-7_bZnETw,4718
|
|
51
|
+
scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
|
|
52
|
+
utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
|
|
53
|
+
ocf_data_sampler-0.1.17.dist-info/METADATA,sha256=RI0JClDkwWGjw7gel_j-k2B-SmMKMFLwHdDqEVP5R0U,11713
|
|
54
|
+
ocf_data_sampler-0.1.17.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
|
55
|
+
ocf_data_sampler-0.1.17.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
|
|
56
|
+
ocf_data_sampler-0.1.17.dist-info/RECORD,,
|
scripts/refactor_site.py
CHANGED
|
@@ -1,50 +1,80 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Refactor legacy site data into a more structured format."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
2
4
|
import xarray as xr
|
|
3
5
|
|
|
4
|
-
def legacy_format(data_ds, metadata_df):
|
|
5
|
-
"""This formats old legacy data to the new format.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
def legacy_format(data_ds: xr.Dataset, metadata_df: pd.DataFrame) -> xr.Dataset:
    """Convert legacy site data into a dataset with a `site_id` dimension.

    Steps:
    1. Normalise the metadata: `system_id` is renamed to `site_id`, and
       `capacity_kwp` is derived from `capacity_megawatts` (x1000) when present.
    2. If `data_ds` contains the key "0" (legacy layout, one variable per site),
       gather the per-site variables into a single `generation_kw` array with
       dims (`time_utc`, `site_id`).
    3. Repeat each site's `capacity_kwp` over every timestamp so it has the same
       dims as `generation_kw`.

    The caller's `metadata_df` is not modified; all changes happen on a copy.

    Args:
        data_ds: The dataset containing legacy site data.
        metadata_df: A DataFrame with metadata about the sites.

    Returns:
        A dataset holding `generation_kw` and `capacity_kwp` when the legacy layout
        is detected; otherwise `data_ds` exactly as it was passed in.

    Raises:
        ValueError: If the metadata has no `site_id` or no `capacity_kwp` column
            after the renaming/conversion step.
        KeyError: Raised by the pandas label lookup when a site column in the data
            has no matching `site_id` row in the metadata.
    """
    # Work on a copy so the column assignment below never leaks into the
    # DataFrame owned by the caller.
    metadata_df = metadata_df.copy()

    # Step 1: Rename metadata columns to match the new expected format
    if "system_id" in metadata_df.columns:
        metadata_df = metadata_df.rename(columns={"system_id": "site_id"})

    # Convert capacity from megawatts to kilowatts if needed
    if "capacity_megawatts" in metadata_df.columns:
        metadata_df["capacity_kwp"] = metadata_df["capacity_megawatts"] * 1000

    # Quick sanity check to ensure we have what we need
    if "site_id" not in metadata_df.columns or "capacity_kwp" not in metadata_df.columns:
        raise ValueError("Metadata is missing required columns: 'site_id' and 'capacity_kwp'.")

    # Step 2: Transform the dataset
    # Only the legacy layout (a variable literally named "0") needs reshaping
    if "0" in data_ds:
        # Convert the dataset into a DataFrame so we can manipulate it more easily
        site_data_df = data_ds.to_dataframe()

        # The DataFrame columns define the site order of the data values, so the
        # same labels are used for the capacity lookup and for the coordinate.
        # Taking the coordinate from the metadata rows instead would mislabel
        # sites whenever the metadata is ordered differently from the data.
        site_ids = site_data_df.columns
        capacities = metadata_df.set_index("site_id").loc[site_ids, "capacity_kwp"]

        coords = {
            "time_utc": site_data_df.index.values,
            "site_id": site_ids.values,
        }
        dims = ["time_utc", "site_id"]

        # Create a DataArray for generation data
        generation_da = xr.DataArray(
            data=site_data_df.values,
            coords=coords,
            dims=dims,
            name="generation_kw",
        )

        # Step 3: Attach capacity information
        # Broadcast capacities across all timestamps
        capacity_df = pd.DataFrame(
            {site_id: [capacities[site_id]] * len(site_data_df) for site_id in site_ids},
            index=site_data_df.index,
        )

        # Create a DataArray for capacity data
        capacity_da = xr.DataArray(
            data=capacity_df.values,
            coords=coords,
            dims=dims,
            name="capacity_kwp",
        )

        # Finally, bundle everything into a single Dataset
        data_ds = xr.Dataset(
            {
                "generation_kw": generation_da,
                "capacity_kwp": capacity_da,
            },
        )

    return data_ds
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Script to compute normalisation constants from NWP data."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import glob
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import xarray as xr
|
|
9
|
+
|
|
10
|
+
from ocf_data_sampler.load.nwp.providers.icon import open_icon_eu
|
|
11
|
+
|
|
12
|
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface
parser = argparse.ArgumentParser(description="Compute normalization constants from NWP data")
parser.add_argument(
    "--data-path",
    type=str,
    required=True,
    help='Path pattern to zarr files (e.g., "/path/to/data/*.zarr.zip")',
)
parser.add_argument(
    "--n-samples",
    type=int,
    default=2000,
    help="Number of random samples to use (default: 2000)",
)

args = parser.parse_args()

# glob yields matches in arbitrary order; sorting keeps the file list stable
# from run to run.
zarr_files = sorted(glob.glob(args.data_path))
if not zarr_files:
    # Fail with a clear CLI error instead of an obscure failure while opening data
    parser.error(f"No files match --data-path pattern: {args.data_path!r}")

n_samples = args.n_samples
if n_samples < 1:
    # Zero samples would leave nothing to concatenate further down
    parser.error(f"--n-samples must be a positive integer, got {n_samples}")

ds = open_icon_eu(zarr_files)

# Size of every dimension that is sampled over
n_init_times = ds.sizes["init_time_utc"]
n_lats = ds.sizes["latitude"]
n_longs = ds.sizes["longitude"]
n_steps = ds.sizes["step"]

# Draw independent random positions (with replacement) along each dimension
random_init_times = np.random.choice(n_init_times, size=n_samples, replace=True)
random_lats = np.random.choice(n_lats, size=n_samples, replace=True)
random_longs = np.random.choice(n_longs, size=n_samples, replace=True)
random_steps = np.random.choice(n_steps, size=n_samples, replace=True)

# Each sample is a single (init time, latitude, longitude, step) point
samples = [
    ds.isel(
        init_time_utc=random_init_times[i],
        latitude=random_lats[i],
        longitude=random_longs[i],
        step=random_steps[i],
    )
    for i in range(n_samples)
]

samples_stack = xr.concat(samples, dim="samples")

available_channels = samples_stack.channel.values.tolist()
logger.info("Available channels: %s", available_channels)

ICON_EU_MEAN = {}
ICON_EU_STD = {}

# The loop runs over the channel coordinate itself, so every name is present
# by construction and no membership check is needed.
for var in available_channels:
    var_data = samples_stack.sel(channel=var)
    var_mean = float(var_data.mean().compute())
    var_std = float(var_data.std().compute())

    ICON_EU_MEAN[var] = var_mean
    ICON_EU_STD[var] = var_std

    logger.info("Processed %s: mean=%.4f, std=%.4f", var, var_mean, var_std)

logger.info("\nMean values:\n%s", ICON_EU_MEAN)
logger.info("\nStandard deviations:\n%s", ICON_EU_STD)
|
|
72
|
+
|
ocf_data_sampler/constants.py
DELETED
|
@@ -1,222 +0,0 @@
|
|
|
1
|
-
import xarray as xr
|
|
2
|
-
import numpy as np
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
NWP_PROVIDERS = [
|
|
6
|
-
"ukv",
|
|
7
|
-
"ecmwf",
|
|
8
|
-
"gfs"
|
|
9
|
-
]
|
|
10
|
-
# TODO add ICON
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def _to_data_array(d):
    """Turn a mapping into a float32 DataArray labelled by a `channel` coordinate.

    The keys of `d` become the `channel` coordinate values and the mapped values
    become the array data, in the mapping's iteration order.
    """
    channels = list(d)
    values = [d[channel] for channel in channels]
    stats = xr.DataArray(values, coords={"channel": channels})
    return stats.astype(np.float32)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class NWPStatDict(dict):
    """Dictionary of NWP normalization stats, looked up by provider name."""

    def __getitem__(self, key):
        """Return the stats stored for `key`.

        Raises:
            KeyError: If `key` is not listed in `NWP_PROVIDERS`, or if it is listed
                but no entry for it is stored in this dictionary.
        """
        # Names outside the provider list are rejected before any lookup
        if key not in NWP_PROVIDERS:
            raise KeyError(f"{key} is not a supported NWP provider - {NWP_PROVIDERS}")

        # A listed provider without a stored entry gets its own message
        if key not in self.keys():
            raise KeyError(
                f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
            )

        return super().__getitem__(key)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
# ------ UKV
|
|
35
|
-
# Means and std computed WITH version_7 and higher, MetOffice values
|
|
36
|
-
UKV_STD = {
|
|
37
|
-
"cdcb": 2126.99350113,
|
|
38
|
-
"lcc": 39.33210726,
|
|
39
|
-
"mcc": 41.91144559,
|
|
40
|
-
"hcc": 38.07184418,
|
|
41
|
-
"sde": 0.1029753,
|
|
42
|
-
"hcct": 18382.63958991,
|
|
43
|
-
"dswrf": 190.47216887,
|
|
44
|
-
"dlwrf": 39.45988077,
|
|
45
|
-
"h": 1075.77812282,
|
|
46
|
-
"t": 4.38818501,
|
|
47
|
-
"r": 11.45012499,
|
|
48
|
-
"dpt": 4.57250482,
|
|
49
|
-
"vis": 21578.97975625,
|
|
50
|
-
"si10": 3.94718813,
|
|
51
|
-
"wdir10": 94.08407495,
|
|
52
|
-
"prmsl": 1252.71790539,
|
|
53
|
-
"prate": 0.00021497,
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
UKV_MEAN = {
|
|
57
|
-
"cdcb": 1412.26599062,
|
|
58
|
-
"lcc": 50.08362643,
|
|
59
|
-
"mcc": 40.88984494,
|
|
60
|
-
"hcc": 29.11949682,
|
|
61
|
-
"sde": 0.00289545,
|
|
62
|
-
"hcct": -18345.97478167,
|
|
63
|
-
"dswrf": 111.28265039,
|
|
64
|
-
"dlwrf": 325.03130139,
|
|
65
|
-
"h": 2096.51991356,
|
|
66
|
-
"t": 283.64913206,
|
|
67
|
-
"r": 81.79229501,
|
|
68
|
-
"dpt": 280.54379901,
|
|
69
|
-
"vis": 32262.03285118,
|
|
70
|
-
"si10": 6.88348448,
|
|
71
|
-
"wdir10": 199.41891636,
|
|
72
|
-
"prmsl": 101321.61574029,
|
|
73
|
-
"prate": 3.45793433e-05,
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
UKV_STD = _to_data_array(UKV_STD)
|
|
77
|
-
UKV_MEAN = _to_data_array(UKV_MEAN)
|
|
78
|
-
|
|
79
|
-
# ------ ECMWF
|
|
80
|
-
# These were calculated from 100 random init times of UK data from 2020-2023
|
|
81
|
-
ECMWF_STD = {
|
|
82
|
-
"dlwrf": 15855867.0,
|
|
83
|
-
"dswrf": 13025427.0,
|
|
84
|
-
"duvrs": 1445635.25,
|
|
85
|
-
"hcc": 0.42244860529899597,
|
|
86
|
-
"lcc": 0.3791404366493225,
|
|
87
|
-
"mcc": 0.38039860129356384,
|
|
88
|
-
"prate": 9.81039775069803e-05,
|
|
89
|
-
"sd": 0.000913831521756947,
|
|
90
|
-
"sr": 16294988.0,
|
|
91
|
-
"t2m": 3.692270040512085,
|
|
92
|
-
"tcc": 0.37487083673477173,
|
|
93
|
-
"u10": 5.531515598297119,
|
|
94
|
-
"u100": 7.2320556640625,
|
|
95
|
-
"u200": 8.049470901489258,
|
|
96
|
-
"v10": 5.411230564117432,
|
|
97
|
-
"v100": 6.944501876831055,
|
|
98
|
-
"v200": 7.561611652374268,
|
|
99
|
-
"diff_dlwrf": 131942.03125,
|
|
100
|
-
"diff_dswrf": 715366.3125,
|
|
101
|
-
"diff_duvrs": 81605.25,
|
|
102
|
-
"diff_sr": 818950.6875,
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
ECMWF_MEAN = {
|
|
106
|
-
"dlwrf": 27187026.0,
|
|
107
|
-
"dswrf": 11458988.0,
|
|
108
|
-
"duvrs": 1305651.25,
|
|
109
|
-
"hcc": 0.3961029052734375,
|
|
110
|
-
"lcc": 0.44901806116104126,
|
|
111
|
-
"mcc": 0.3288780450820923,
|
|
112
|
-
"prate": 3.108070450252853e-05,
|
|
113
|
-
"sd": 8.107526082312688e-05,
|
|
114
|
-
"sr": 12905302.0,
|
|
115
|
-
"t2m": 283.48333740234375,
|
|
116
|
-
"tcc": 0.7049227356910706,
|
|
117
|
-
"u10": 1.7677178382873535,
|
|
118
|
-
"u100": 2.393547296524048,
|
|
119
|
-
"u200": 2.7963004112243652,
|
|
120
|
-
"v10": 0.985887885093689,
|
|
121
|
-
"v100": 1.4244288206100464,
|
|
122
|
-
"v200": 1.6010299921035767,
|
|
123
|
-
"diff_dlwrf": 1136464.0,
|
|
124
|
-
"diff_dswrf": 420584.6875,
|
|
125
|
-
"diff_duvrs": 48265.4765625,
|
|
126
|
-
"diff_sr": 469169.5,
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
ECMWF_STD = _to_data_array(ECMWF_STD)
|
|
130
|
-
ECMWF_MEAN = _to_data_array(ECMWF_MEAN)
|
|
131
|
-
|
|
132
|
-
# ------ GFS
|
|
133
|
-
GFS_STD = {
|
|
134
|
-
"dlwrf": 96.305916,
|
|
135
|
-
"dswrf": 246.18533,
|
|
136
|
-
"hcc": 42.525383,
|
|
137
|
-
"lcc": 44.3732,
|
|
138
|
-
"mcc": 43.150745,
|
|
139
|
-
"prate": 0.00010159573,
|
|
140
|
-
"r": 25.440672,
|
|
141
|
-
"sde": 0.43345627,
|
|
142
|
-
"t": 22.825893,
|
|
143
|
-
"tcc": 41.030598,
|
|
144
|
-
"u10": 5.470838,
|
|
145
|
-
"u100": 6.8899174,
|
|
146
|
-
"v10": 4.7401133,
|
|
147
|
-
"v100": 6.076132,
|
|
148
|
-
"vis": 8294.022,
|
|
149
|
-
"u": 10.614556,
|
|
150
|
-
"v": 7.176398,
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
GFS_MEAN = {
|
|
154
|
-
"dlwrf": 298.342,
|
|
155
|
-
"dswrf": 168.12321,
|
|
156
|
-
"hcc": 35.272,
|
|
157
|
-
"lcc": 43.578342,
|
|
158
|
-
"mcc": 33.738823,
|
|
159
|
-
"prate": 2.8190969e-05,
|
|
160
|
-
"r": 18.359747,
|
|
161
|
-
"sde": 0.36937004,
|
|
162
|
-
"t": 278.5223,
|
|
163
|
-
"tcc": 66.841606,
|
|
164
|
-
"u10": -0.0022310058,
|
|
165
|
-
"u100": 0.0823025,
|
|
166
|
-
"v10": 0.06219831,
|
|
167
|
-
"v100": 0.0797807,
|
|
168
|
-
"vis": 19628.32,
|
|
169
|
-
"u": 11.645444,
|
|
170
|
-
"v": 0.12330122,
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
GFS_STD = _to_data_array(GFS_STD)
|
|
174
|
-
GFS_MEAN = _to_data_array(GFS_MEAN)
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
NWP_STDS = NWPStatDict(
|
|
178
|
-
ukv=UKV_STD,
|
|
179
|
-
ecmwf=ECMWF_STD,
|
|
180
|
-
gfs=GFS_STD
|
|
181
|
-
)
|
|
182
|
-
NWP_MEANS = NWPStatDict(
|
|
183
|
-
ukv=UKV_MEAN,
|
|
184
|
-
ecmwf=ECMWF_MEAN,
|
|
185
|
-
gfs=GFS_MEAN
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
# ------ Satellite
|
|
189
|
-
# RSS Mean and std values from randomised 20% of 2020 imagery
|
|
190
|
-
|
|
191
|
-
RSS_STD = {
|
|
192
|
-
"HRV": 0.11405209,
|
|
193
|
-
"IR_016": 0.21462157,
|
|
194
|
-
"IR_039": 0.04618041,
|
|
195
|
-
"IR_087": 0.06687243,
|
|
196
|
-
"IR_097": 0.0468558,
|
|
197
|
-
"IR_108": 0.17482725,
|
|
198
|
-
"IR_120": 0.06115861,
|
|
199
|
-
"IR_134": 0.04492306,
|
|
200
|
-
"VIS006": 0.12184761,
|
|
201
|
-
"VIS008": 0.13090034,
|
|
202
|
-
"WV_062": 0.16111417,
|
|
203
|
-
"WV_073": 0.12924142,
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
RSS_MEAN = {
|
|
207
|
-
"HRV": 0.09298719,
|
|
208
|
-
"IR_016": 0.17594202,
|
|
209
|
-
"IR_039": 0.86167645,
|
|
210
|
-
"IR_087": 0.7719318,
|
|
211
|
-
"IR_097": 0.8014212,
|
|
212
|
-
"IR_108": 0.71254843,
|
|
213
|
-
"IR_120": 0.89058584,
|
|
214
|
-
"IR_134": 0.944365,
|
|
215
|
-
"VIS006": 0.09633306,
|
|
216
|
-
"VIS008": 0.11426069,
|
|
217
|
-
"WV_062": 0.7359355,
|
|
218
|
-
"WV_073": 0.62479186,
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
RSS_STD = _to_data_array(RSS_STD)
|
|
222
|
-
RSS_MEAN = _to_data_array(RSS_MEAN)
|