ocf-data-sampler 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (78)
  1. ocf_data_sampler/config/load.py +3 -3
  2. ocf_data_sampler/config/model.py +146 -64
  3. ocf_data_sampler/config/save.py +5 -4
  4. ocf_data_sampler/load/gsp.py +6 -5
  5. ocf_data_sampler/load/load_dataset.py +5 -6
  6. ocf_data_sampler/load/nwp/nwp.py +17 -5
  7. ocf_data_sampler/load/nwp/providers/ecmwf.py +6 -7
  8. ocf_data_sampler/load/nwp/providers/gfs.py +36 -0
  9. ocf_data_sampler/load/nwp/providers/icon.py +46 -0
  10. ocf_data_sampler/load/nwp/providers/ukv.py +4 -5
  11. ocf_data_sampler/load/nwp/providers/utils.py +3 -1
  12. ocf_data_sampler/load/satellite.py +9 -10
  13. ocf_data_sampler/load/site.py +10 -6
  14. ocf_data_sampler/load/utils.py +21 -16
  15. ocf_data_sampler/numpy_sample/collate.py +10 -9
  16. ocf_data_sampler/numpy_sample/datetime_features.py +3 -5
  17. ocf_data_sampler/numpy_sample/gsp.py +12 -14
  18. ocf_data_sampler/numpy_sample/nwp.py +12 -12
  19. ocf_data_sampler/numpy_sample/satellite.py +9 -9
  20. ocf_data_sampler/numpy_sample/site.py +5 -8
  21. ocf_data_sampler/numpy_sample/sun_position.py +16 -21
  22. ocf_data_sampler/sample/base.py +15 -17
  23. ocf_data_sampler/sample/site.py +13 -20
  24. ocf_data_sampler/sample/uk_regional.py +29 -35
  25. ocf_data_sampler/select/dropout.py +16 -14
  26. ocf_data_sampler/select/fill_time_periods.py +15 -5
  27. ocf_data_sampler/select/find_contiguous_time_periods.py +88 -75
  28. ocf_data_sampler/select/geospatial.py +63 -54
  29. ocf_data_sampler/select/location.py +16 -51
  30. ocf_data_sampler/select/select_spatial_slice.py +105 -89
  31. ocf_data_sampler/select/select_time_slice.py +71 -58
  32. ocf_data_sampler/select/spatial_slice_for_dataset.py +7 -6
  33. ocf_data_sampler/select/time_slice_for_dataset.py +17 -16
  34. ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +140 -131
  35. ocf_data_sampler/torch_datasets/datasets/site.py +152 -112
  36. ocf_data_sampler/torch_datasets/utils/__init__.py +3 -0
  37. ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py +11 -0
  38. ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +6 -2
  39. ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +23 -22
  40. ocf_data_sampler/utils.py +3 -1
  41. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/METADATA +7 -18
  42. ocf_data_sampler-0.1.17.dist-info/RECORD +56 -0
  43. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/WHEEL +1 -1
  44. {ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/top_level.txt +1 -1
  45. scripts/refactor_site.py +63 -33
  46. utils/compute_icon_mean_stddev.py +72 -0
  47. ocf_data_sampler/constants.py +0 -222
  48. ocf_data_sampler/torch_datasets/utils/validate_channels.py +0 -82
  49. ocf_data_sampler-0.1.11.dist-info/LICENSE +0 -21
  50. ocf_data_sampler-0.1.11.dist-info/RECORD +0 -82
  51. tests/__init__.py +0 -0
  52. tests/config/test_config.py +0 -113
  53. tests/config/test_load.py +0 -7
  54. tests/config/test_save.py +0 -28
  55. tests/conftest.py +0 -319
  56. tests/load/test_load_gsp.py +0 -15
  57. tests/load/test_load_nwp.py +0 -21
  58. tests/load/test_load_satellite.py +0 -17
  59. tests/load/test_load_sites.py +0 -14
  60. tests/numpy_sample/test_collate.py +0 -21
  61. tests/numpy_sample/test_datetime_features.py +0 -37
  62. tests/numpy_sample/test_gsp.py +0 -38
  63. tests/numpy_sample/test_nwp.py +0 -13
  64. tests/numpy_sample/test_satellite.py +0 -40
  65. tests/numpy_sample/test_sun_position.py +0 -81
  66. tests/select/test_dropout.py +0 -69
  67. tests/select/test_fill_time_periods.py +0 -28
  68. tests/select/test_find_contiguous_time_periods.py +0 -202
  69. tests/select/test_location.py +0 -67
  70. tests/select/test_select_spatial_slice.py +0 -154
  71. tests/select/test_select_time_slice.py +0 -275
  72. tests/test_sample/test_base.py +0 -164
  73. tests/test_sample/test_site_sample.py +0 -165
  74. tests/test_sample/test_uk_regional_sample.py +0 -136
  75. tests/torch_datasets/test_merge_and_fill_utils.py +0 -40
  76. tests/torch_datasets/test_pvnet_uk.py +0 -154
  77. tests/torch_datasets/test_site.py +0 -226
  78. tests/torch_datasets/test_validate_channels_utils.py +0 -78
ocf_data_sampler/torch_datasets/utils/valid_time_periods.py CHANGED
@@ -1,34 +1,31 @@
+"""Functions pertaining to finding valid time periods for the input data."""
+
 import numpy as np
 import pandas as pd
 
 from ocf_data_sampler.config import Configuration
 from ocf_data_sampler.select.find_contiguous_time_periods import (
+    find_contiguous_t0_periods,
     find_contiguous_t0_periods_nwp,
-    find_contiguous_t0_periods,
     intersection_of_multiple_dataframes_of_periods,
 )
 from ocf_data_sampler.utils import minutes
 
 
-
-def find_valid_time_periods(
-    datasets_dict: dict,
-    config: Configuration,
-):
-    """Find the t0 times where all of the requested input data is available
+def find_valid_time_periods(datasets_dict: dict, config: Configuration) -> pd.DataFrame:
+    """Find the t0 times where all of the requested input data is available.
 
     Args:
         datasets_dict: A dictionary of input datasets
         config: Configuration file
     """
+    if not set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"}):
+        raise ValueError(f"Invalid keys in datasets_dict: {datasets_dict.keys()}")
 
-    assert set(datasets_dict.keys()).issubset({"nwp", "sat", "gsp"})
-
-    contiguous_time_periods: dict[str: pd.DataFrame] = {}  # Used to store contiguous time periods from each data source
-
+    # Used to store contiguous time periods from each data source
+    contiguous_time_periods: dict[str : pd.DataFrame] = {}
     if "nwp" in datasets_dict:
         for nwp_key, nwp_config in config.input_data.nwp.items():
-
             da = datasets_dict["nwp"][nwp_key]
 
             if nwp_config.dropout_timedeltas_minutes is None:
@@ -59,8 +56,12 @@ def find_valid_time_periods(
                 max_staleness = max_possible_staleness
             else:
                 # Make sure the max acceptable staleness isn't longer than the max possible
-                assert max_staleness <= max_possible_staleness
-
+                if max_staleness > max_possible_staleness:
+                    raise ValueError(
+                        f"max_staleness_minutes is too long for the input data, "
+                        f"{max_staleness=}, {max_possible_staleness=}",
+                    )
+
             # Find the first forecast step
             first_forecast_step = pd.Timedelta(da["step"].min().item())
 
@@ -69,34 +70,34 @@ def find_valid_time_periods(
                 interval_start=minutes(nwp_config.interval_start_minutes),
                 max_staleness=max_staleness,
                 max_dropout=max_dropout,
-                first_forecast_step = first_forecast_step,
+                first_forecast_step=first_forecast_step,
             )
 
-            contiguous_time_periods[f'nwp_{nwp_key}'] = time_periods
+            contiguous_time_periods[f"nwp_{nwp_key}"] = time_periods
 
     if "sat" in datasets_dict:
         sat_config = config.input_data.satellite
 
         time_periods = find_contiguous_t0_periods(
             pd.DatetimeIndex(datasets_dict["sat"]["time_utc"]),
-            sample_period_duration=minutes(sat_config.time_resolution_minutes),
+            time_resolution=minutes(sat_config.time_resolution_minutes),
             interval_start=minutes(sat_config.interval_start_minutes),
             interval_end=minutes(sat_config.interval_end_minutes),
         )
 
-        contiguous_time_periods['sat'] = time_periods
+        contiguous_time_periods["sat"] = time_periods
 
     if "gsp" in datasets_dict:
         gsp_config = config.input_data.gsp
 
         time_periods = find_contiguous_t0_periods(
             pd.DatetimeIndex(datasets_dict["gsp"]["time_utc"]),
-            sample_period_duration=minutes(gsp_config.time_resolution_minutes),
+            time_resolution=minutes(gsp_config.time_resolution_minutes),
             interval_start=minutes(gsp_config.interval_start_minutes),
             interval_end=minutes(gsp_config.interval_end_minutes),
         )
 
-        contiguous_time_periods['gsp'] = time_periods
+        contiguous_time_periods["gsp"] = time_periods
 
     # just get the values (not the keys)
     contiguous_time_periods_values = list(contiguous_time_periods.values())
@@ -104,7 +105,7 @@ def find_valid_time_periods(
     # Find joint overlapping contiguous time periods
     if len(contiguous_time_periods_values) > 1:
         valid_time_periods = intersection_of_multiple_dataframes_of_periods(
-            contiguous_time_periods_values
+            contiguous_time_periods_values,
         )
     else:
         valid_time_periods = contiguous_time_periods_values[0]
@@ -113,4 +114,4 @@ def find_valid_time_periods(
     if len(valid_time_periods) == 0:
         raise ValueError(f"No valid time periods found, {contiguous_time_periods=}")
 
-    return valid_time_periods
+    return valid_time_periods
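The substantive API change in this file is the renamed keyword of `find_contiguous_t0_periods` (`sample_period_duration` → `time_resolution`), with the asserts becoming `ValueError`s. A minimal sketch of the renamed call, assuming ocf-data-sampler 0.1.17 is installed; the timestamps are invented:

import pandas as pd

from ocf_data_sampler.select.find_contiguous_time_periods import find_contiguous_t0_periods
from ocf_data_sampler.utils import minutes

# Invented 30-minutely timestamps with a gap from 02:30 to 04:00.
datetimes = pd.date_range("2024-01-01 00:00", "2024-01-01 06:00", freq="30min").delete([5, 6, 7, 8])

periods = find_contiguous_t0_periods(
    datetimes,
    time_resolution=minutes(30),  # 0.1.17 spelling; was sample_period_duration
    interval_start=minutes(-60),  # history needed before each t0
    interval_end=minutes(60),     # future data needed after each t0
)
# As used above, the result is a DataFrame of contiguous time periods, suitable
# for intersection_of_multiple_dataframes_of_periods.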
ocf_data_sampler/utils.py CHANGED
@@ -1,8 +1,10 @@
+"""Miscellaneous helper functions."""
+
 import pandas as pd
 
 
 def minutes(minutes: int | list[float]) -> pd.Timedelta | pd.TimedeltaIndex:
-    """Timedelta minutes
+    """Timedelta minutes.
 
     Args:
         minutes: the number of minutes, single value or list
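The signature accepts a scalar or a list, which a two-line example pins down (return reprs shown as comments; standard pandas behaviour):

from ocf_data_sampler.utils import minutes

minutes(30)        # pd.Timedelta("0 days 00:30:00")
minutes([30, 60])  # pd.TimedeltaIndex(["0 days 00:30:00", "0 days 01:00:00"])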
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/METADATA CHANGED
@@ -1,10 +1,8 @@
 Metadata-Version: 2.2
-Name: ocf_data_sampler
-Version: 0.1.11
-Summary: Sample from weather data for renewable energy prediction
-Author: James Fulton, Peter Dudfield, and the Open Climate Fix team
-Author-email: info@openclimatefix.org
-Maintainer: Open Climate Fix Ltd
+Name: ocf-data-sampler
+Version: 0.1.17
+Author: James Fulton, Peter Dudfield
+Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
 
 Copyright (c) 2023 Open Climate Fix
@@ -27,21 +25,18 @@ License: MIT License
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
-Project-URL: homepage, https://github.com/openclimatefix
 Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
-Keywords: weather data,renewable energy prediction,sample weather data
+Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Operating System :: POSIX :: Linux
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: xarray
 Requires-Dist: zarr==2.18.3
 Requires-Dist: dask
+Requires-Dist: matplotlib
 Requires-Dist: ocf_blosc2
 Requires-Dist: pvlib
 Requires-Dist: pydantic
@@ -50,11 +45,6 @@ Requires-Dist: pathy
 Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.2; extra == "docs"
-Requires-Dist: mkdocs-material>=8.0; extra == "docs"
-Provides-Extra: plot
-Requires-Dist: matplotlib; extra == "plot"
 
 # ocf-data-sampler
 
@@ -77,7 +67,6 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
 
 **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
 
-
 ## FAQ
 
 If you have any questions about this or any other of our repos, don't hesitate to hop to our [Discussions Page](https://github.com/orgs/openclimatefix/discussions)!
ocf_data_sampler-0.1.17.dist-info/RECORD ADDED
@@ -0,0 +1,56 @@
+ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ocf_data_sampler/utils.py,sha256=DjuneGGisl08ENvPZV_lrcX4b2NCKJC1ZpXgIpxuQi4,290
+ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
+ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
+ocf_data_sampler/config/model.py,sha256=y8maV_1z0LL_m0J607ka_yJ0KI-0ssYDn5Ghk8aNgR0,10189
+ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
+ocf_data_sampler/data/uk_gsp_locations.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
+ocf_data_sampler/load/__init__.py,sha256=T5Zj1PGt0aiiNEN7Ra1Ac-cBsNKhphmmHy_8g7XU_w0,219
+ocf_data_sampler/load/gsp.py,sha256=keB3Nv_CNK1P6pS9Kdfc8PoZXTI1_YFN-spsvEv_Ewc,899
+ocf_data_sampler/load/load_dataset.py,sha256=0NyDxCDfgE_esKVW3s-rZEe16WB30FQ74ClWlrIo72M,1602
+ocf_data_sampler/load/satellite.py,sha256=E7Ln7Y60Qr1RTV-_R71YoxXQM-Ca7Y1faIo3oKB2eFk,2292
+ocf_data_sampler/load/site.py,sha256=zOzlWk6pYZBB5daqG8URGksmDXWKrkutUvN8uALAIh8,1468
+ocf_data_sampler/load/utils.py,sha256=Jwbr1rpEa3cefjw-OTVRaxnIHyGixYB3TlTlta0BOdU,1727
+ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
+ocf_data_sampler/load/nwp/nwp.py,sha256=0AIHQTJLUtwP2Toz_PskOTYFJXfMvGhk8faAcNvI9jk,922
+ocf_data_sampler/load/nwp/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=5AzktPJgertCx6oij6aePRosPuZHGFznMxTgtkk_mgc,994
+ocf_data_sampler/load/nwp/providers/gfs.py,sha256=JSDeh4YG1wibV8--P3X-zTO8LP0dsJcpFvIyglBbhi0,979
+ocf_data_sampler/load/nwp/providers/icon.py,sha256=yYUrs5HgjU0C5pMHBB6FGn3tLjswi990IY6QCXS1Zmw,1569
+ocf_data_sampler/load/nwp/providers/ukv.py,sha256=-0v8JCLH8ypz8GMXZ6Rrx-I0LoHuHO8sXFupbC1RpM0,1013
+ocf_data_sampler/load/nwp/providers/utils.py,sha256=cJZ9JA4W_ZeTcLQ5z71w46_DJaPcW_2JMmBdjP9r3qs,835
+ocf_data_sampler/numpy_sample/__init__.py,sha256=nY5C6CcuxiWZ_jrXRzWtN7WyKXhJImSiVTIG6Rz4B_4,401
+ocf_data_sampler/numpy_sample/collate.py,sha256=I9YPcbxOwHYaDGKbzxqdV-3DFEHkzqdhAwnW7_tZH2w,1966
+ocf_data_sampler/numpy_sample/datetime_features.py,sha256=INudxHcoB_c-GvYXe08S4Up_8TU5zOJ39PWRrTKfLp8,1203
+ocf_data_sampler/numpy_sample/gsp.py,sha256=EDaQdOVEDBJGrXsq54UNBfpXTzi0ky_WpgBbmlyxOXM,1074
+ocf_data_sampler/numpy_sample/nwp.py,sha256=iBGOdLMn-F5yR3juX3l4G2oXDpvGNuUdcR6ZCZkCqZk,1037
+ocf_data_sampler/numpy_sample/satellite.py,sha256=oBlyNpO-syoyK4SSghoHqIDNyhcBqyd1L6eXSSw0k3w,1036
+ocf_data_sampler/numpy_sample/site.py,sha256=tpX7j6dTOz2YmOFIzVYqTfWvIduKlOnBcLITsuPMgxU,1250
+ocf_data_sampler/numpy_sample/sun_position.py,sha256=nkfgN6NmiLGoLSuJZrDsM-6nsIzJN75tWfN20Z7n8xo,1480
+ocf_data_sampler/sample/__init__.py,sha256=zdS73NTnxFX_j8uh9tT-IXiURB6635wbneM1koWYV1o,169
+ocf_data_sampler/sample/base.py,sha256=lnr-MNRpAxjVFJHCEvCZL86NrYy9LWnNOsLWBGDL8kc,2359
+ocf_data_sampler/sample/site.py,sha256=4aJys40CQ-2CRKo_dgvm3rINTdfyTGWQGEaXGbh58qQ,1236
+ocf_data_sampler/sample/uk_regional.py,sha256=uMtLdqZCsKttjFmhIC6JITzu2JDZh-VQdYUfbpyhgFM,2409
+ocf_data_sampler/select/__init__.py,sha256=E4AJulEbO2K-o0UlG1fgaEteuf_1ZFjHTvrotXSb4YU,332
+ocf_data_sampler/select/dropout.py,sha256=_rzXl8_4VHTY_JMjbaoWopaFCJmLdaBpqfYF4vr24tk,1638
+ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
+ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=cEXrQDzk8pXknxB0q3v5DakosagHMoLDAj302B8Xpw0,11537
+ocf_data_sampler/select/geospatial.py,sha256=CDExkl36eZOKmdJPzUr_K0Wn3axHqv5nYo-EkSiINcc,5032
+ocf_data_sampler/select/location.py,sha256=AZvGR8y62opiW7zACGXjoOtBEWRfSLOZIA73O5Deu0c,1037
+ocf_data_sampler/select/select_spatial_slice.py,sha256=qY2Ll00EPA80oBtzwMoR5nk0UIpoWZF9oXl22YwWr0Q,12341
+ocf_data_sampler/select/select_time_slice.py,sha256=q5QdgHPIXQb49uT5NwbOguY1GhjWc_o3c-2cDb5kLAo,5455
+ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
+ocf_data_sampler/select/time_slice_for_dataset.py,sha256=1DN6VsWWdLvkpJxodZtBRDUgC4vJE2td_RP5J3ZqPNw,4268
+ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=jfJSFcR0eO1AqeH7S3KnGjsBqVZT5w3oyi784PUR6Q0,146
+ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=ZkXm0IQEIzZUi8O-qJJz2PbJr9T4ZvutL424yRQUJhc,12878
+ocf_data_sampler/torch_datasets/datasets/site.py,sha256=j29cWPIcksRbge014MxR0_OgJqoskdki6KqvtoHtxpY,18023
+ocf_data_sampler/torch_datasets/utils/__init__.py,sha256=7Yt4anQVU9y27nj4Wx1tRLqbAQLbzW0ED71UL65LvxA,187
+ocf_data_sampler/torch_datasets/utils/channel_dict_to_dataarray.py,sha256=MGylKhXxXLQC2fYv-8L_GVoYhov3LcEwC0Q21xItDSk,353
+ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
+ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=LdHgLPAYUVoCRMk2nnFdsMpygGS2kbps3h-7_bZnETw,4718
+scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
+utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
+ocf_data_sampler-0.1.17.dist-info/METADATA,sha256=RI0JClDkwWGjw7gel_j-k2B-SmMKMFLwHdDqEVP5R0U,11713
+ocf_data_sampler-0.1.17.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ocf_data_sampler-0.1.17.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
+ocf_data_sampler-0.1.17.dist-info/RECORD,,
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
{ocf_data_sampler-0.1.11.dist-info → ocf_data_sampler-0.1.17.dist-info}/top_level.txt CHANGED
@@ -1,3 +1,3 @@
 ocf_data_sampler
 scripts
-tests
+utils
scripts/refactor_site.py CHANGED
@@ -1,50 +1,80 @@
-""" Helper functions for refactoring legacy site data """
+"""Refactor legacy site data into a more structured format."""
+
+import pandas as pd
 import xarray as xr
 
-def legacy_format(data_ds, metadata_df):
-    """This formats old legacy data to the new format.
 
-    1. This renames the columns in the metadata
-    2. Re-formats the site data from data variables named by the site_id to
-    a data array with a site_id dimension. Also adds capacity_kwp to the dataset as a time series for each site_id
-    """
+def legacy_format(data_ds: xr.Dataset, metadata_df: pd.DataFrame) -> xr.Dataset:
+    """Converts old legacy site data into a more structured format.
+
+    This function does three main things:
+    1. Renames some columns in the metadata to keep things consistent.
+    2. Reshapes site data so that instead of having separate variables for each site,
+       we use a `site_id` dimension—makes life easier for analysis.
+    3. Adds `capacity_kwp` as a time series so that each site has its capacity info.
+
+    Parameters:
+        data_ds (xr.Dataset): The dataset containing legacy site data.
+        metadata_df (pd.DataFrame): A DataFrame with metadata about the sites.
 
+    Returns:
+        xr.Dataset: Reformatted dataset with `generation_kw` and `capacity_kwp`.
+    """
+    # Step 1: Rename metadata columns to match the new expected format
     if "system_id" in metadata_df.columns:
-        metadata_df["site_id"] = metadata_df["system_id"]
+        metadata_df = metadata_df.rename(columns={"system_id": "site_id"})
 
+    # Convert capacity from megawatts to kilowatts if needed
     if "capacity_megawatts" in metadata_df.columns:
         metadata_df["capacity_kwp"] = metadata_df["capacity_megawatts"] * 1000
 
-    # only site data has the site_id as data variables.
-    # We want to join them all together and create another coordinate called site_id
+    # Quick sanity check to ensure we have what we need
+    if "site_id" not in metadata_df.columns or "capacity_kwp" not in metadata_df.columns:
+        raise ValueError("Metadata is missing required columns: 'site_id' and 'capacity_kwp'.")
+
+    # Step 2: Transform the dataset
+    # Check if we actually have site data in the expected format
    if "0" in data_ds:
-        gen_df = data_ds.to_dataframe()
-        gen_da = xr.DataArray(
-            data=gen_df.values,
-            coords=(
-                ("time_utc", gen_df.index.values),
-                ("site_id", metadata_df["site_id"]),
-            ),
+        # Convert the dataset into a DataFrame so we can manipulate it more easily
+        site_data_df = data_ds.to_dataframe()
+
+        # Create a DataArray for generation data
+        generation_da = xr.DataArray(
+            data=site_data_df.values,
+            coords={
+                "time_utc": site_data_df.index.values,
+                "site_id": metadata_df["site_id"].values,
+            },
+            dims=["time_utc", "site_id"],
             name="generation_kw",
         )
 
-        capacity_df = gen_df
-        for col in capacity_df.columns:
-            capacity_df[col] = metadata_df[metadata_df["site_id"].astype(str) == col][
-                "capacity_kwp"
-            ].iloc[0]
+        # Step 3: Attach capacity information
+        # Map site_ids to their respective capacities
+        site_ids = site_data_df.columns
+        capacities = metadata_df.set_index("site_id").loc[site_ids, "capacity_kwp"]
+
+        # Broadcast capacities across all timestamps
+        capacity_df = pd.DataFrame(
+            {site_id: [capacities[site_id]] * len(site_data_df) for site_id in site_ids},
+            index=site_data_df.index,
+        )
+
+        # Create a DataArray for capacity data
         capacity_da = xr.DataArray(
            data=capacity_df.values,
-            coords=(
-                ("time_utc", gen_df.index.values),
-                ("site_id", metadata_df["site_id"]),
-            ),
+            coords={
+                "time_utc": site_data_df.index.values,
+                "site_id": metadata_df["site_id"].values,
+            },
+            dims=["time_utc", "site_id"],
             name="capacity_kwp",
         )
-        data_ds = xr.Dataset(
-            {
-                "generation_kw": gen_da,
-                "capacity_kwp": capacity_da,
-            }
-        )
-        return data_ds
+
+    # Finally, bundle everything into a single Dataset
+    data_ds = xr.Dataset({
+        "generation_kw": generation_da,
+        "capacity_kwp": capacity_da,
+    })
+
+    return data_ds
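To make the reshaping concrete, here is a hedged sketch feeding `legacy_format` a tiny synthetic legacy dataset. The site IDs and capacities are invented; note that the legacy data variables are named by site ID as strings ("0", "1", ...), so the metadata IDs below are strings too:

import numpy as np
import pandas as pd
import xarray as xr

from scripts.refactor_site import legacy_format

times = pd.date_range("2024-06-01", periods=4, freq="30min")

# Legacy layout: one data variable per site, named by the site ID.
data_ds = xr.Dataset(
    {"0": ("time_utc", np.random.rand(4)), "1": ("time_utc", np.random.rand(4))},
    coords={"time_utc": times},
)
metadata_df = pd.DataFrame({"system_id": ["0", "1"], "capacity_megawatts": [1.2, 3.4]})

new_ds = legacy_format(data_ds, metadata_df)
# new_ds["generation_kw"] and new_ds["capacity_kwp"] both have dims
# (time_utc, site_id), with capacity broadcast to every timestamp
# (1200.0 kWp and 3400.0 kWp here).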
utils/compute_icon_mean_stddev.py ADDED
@@ -0,0 +1,72 @@
+"""Script to compute normalisation constants from NWP data."""
+
+import argparse
+import glob
+import logging
+
+import numpy as np
+import xarray as xr
+
+from ocf_data_sampler.load.nwp.providers.icon import open_icon_eu
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Add argument parser
+parser = argparse.ArgumentParser(description="Compute normalization constants from NWP data")
+parser.add_argument("--data-path", type=str, required=True,
+                    help='Path pattern to zarr files (e.g., "/path/to/data/*.zarr.zip")')
+parser.add_argument("--n-samples", type=int, default=2000,
+                    help="Number of random samples to use (default: 2000)")
+
+args = parser.parse_args()
+
+zarr_files = glob.glob(args.data_path)
+n_samples = args.n_samples
+
+ds = open_icon_eu(zarr_files)
+
+n_init_times = ds.sizes["init_time_utc"]
+n_lats = ds.sizes["latitude"]
+n_longs = ds.sizes["longitude"]
+n_steps = ds.sizes["step"]
+
+random_init_times = np.random.choice(n_init_times, size=n_samples, replace=True)
+random_lats = np.random.choice(n_lats, size=n_samples, replace=True)
+random_longs = np.random.choice(n_longs, size=n_samples, replace=True)
+random_steps = np.random.choice(n_steps, size=n_samples, replace=True)
+
+samples = []
+for i in range(n_samples):
+    sample = ds.isel(init_time_utc=random_init_times[i],
+                     latitude=random_lats[i],
+                     longitude=random_longs[i],
+                     step=random_steps[i])
+    samples.append(sample)
+
+samples_stack = xr.concat(samples, dim="samples")
+
+
+available_channels = samples_stack.channel.values.tolist()
+logger.info("Available channels: %s", available_channels)
+
+ICON_EU_MEAN = {}
+ICON_EU_STD = {}
+
+for var in available_channels:
+    if var not in available_channels:
+        logger.warning("Variable '%s' not found in the channel coordinate; skipping.", var)
+        continue
+    var_data = samples_stack.sel(channel=var)
+    var_mean = float(var_data.mean().compute())
+    var_std = float(var_data.std().compute())
+
+    ICON_EU_MEAN[var] = var_mean
+    ICON_EU_STD[var] = var_std
+
+    logger.info("Processed %s: mean=%.4f, std=%.4f", var, var_mean, var_std)
+
+logger.info("\nMean values:\n%s", ICON_EU_MEAN)
+logger.info("\nStandard deviations:\n%s", ICON_EU_STD)
+
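Everything in this script runs at import time, so it is meant to be invoked directly; a hypothetical run (the glob pattern just follows the `--data-path` help text):

python utils/compute_icon_mean_stddev.py --data-path "/path/to/data/*.zarr.zip" --n-samples 2000

One quirk worth noting: the `if var not in available_channels` guard can never fire, since the loop iterates over `available_channels` itself.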
ocf_data_sampler/constants.py DELETED
@@ -1,222 +0,0 @@
-import xarray as xr
-import numpy as np
-
-
-NWP_PROVIDERS = [
-    "ukv",
-    "ecmwf",
-    "gfs"
-]
-# TODO add ICON
-
-
-def _to_data_array(d):
-    return xr.DataArray(
-        [d[k] for k in d.keys()],
-        coords={"channel": [k for k in d.keys()]},
-    ).astype(np.float32)
-
-
-class NWPStatDict(dict):
-    """Custom dictionary class to hold NWP normalization stats"""
-
-    def __getitem__(self, key):
-        if key not in NWP_PROVIDERS:
-            raise KeyError(f"{key} is not a supported NWP provider - {NWP_PROVIDERS}")
-        elif key in self.keys():
-            return super().__getitem__(key)
-        else:
-            raise KeyError(
-                f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
-            )
-
-
-# ------ UKV
-# Means and std computed WITH version_7 and higher, MetOffice values
-UKV_STD = {
-    "cdcb": 2126.99350113,
-    "lcc": 39.33210726,
-    "mcc": 41.91144559,
-    "hcc": 38.07184418,
-    "sde": 0.1029753,
-    "hcct": 18382.63958991,
-    "dswrf": 190.47216887,
-    "dlwrf": 39.45988077,
-    "h": 1075.77812282,
-    "t": 4.38818501,
-    "r": 11.45012499,
-    "dpt": 4.57250482,
-    "vis": 21578.97975625,
-    "si10": 3.94718813,
-    "wdir10": 94.08407495,
-    "prmsl": 1252.71790539,
-    "prate": 0.00021497,
-}
-
-UKV_MEAN = {
-    "cdcb": 1412.26599062,
-    "lcc": 50.08362643,
-    "mcc": 40.88984494,
-    "hcc": 29.11949682,
-    "sde": 0.00289545,
-    "hcct": -18345.97478167,
-    "dswrf": 111.28265039,
-    "dlwrf": 325.03130139,
-    "h": 2096.51991356,
-    "t": 283.64913206,
-    "r": 81.79229501,
-    "dpt": 280.54379901,
-    "vis": 32262.03285118,
-    "si10": 6.88348448,
-    "wdir10": 199.41891636,
-    "prmsl": 101321.61574029,
-    "prate": 3.45793433e-05,
-}
-
-UKV_STD = _to_data_array(UKV_STD)
-UKV_MEAN = _to_data_array(UKV_MEAN)
-
-# ------ ECMWF
-# These were calculated from 100 random init times of UK data from 2020-2023
-ECMWF_STD = {
-    "dlwrf": 15855867.0,
-    "dswrf": 13025427.0,
-    "duvrs": 1445635.25,
-    "hcc": 0.42244860529899597,
-    "lcc": 0.3791404366493225,
-    "mcc": 0.38039860129356384,
-    "prate": 9.81039775069803e-05,
-    "sd": 0.000913831521756947,
-    "sr": 16294988.0,
-    "t2m": 3.692270040512085,
-    "tcc": 0.37487083673477173,
-    "u10": 5.531515598297119,
-    "u100": 7.2320556640625,
-    "u200": 8.049470901489258,
-    "v10": 5.411230564117432,
-    "v100": 6.944501876831055,
-    "v200": 7.561611652374268,
-    "diff_dlwrf": 131942.03125,
-    "diff_dswrf": 715366.3125,
-    "diff_duvrs": 81605.25,
-    "diff_sr": 818950.6875,
-}
-
-ECMWF_MEAN = {
-    "dlwrf": 27187026.0,
-    "dswrf": 11458988.0,
-    "duvrs": 1305651.25,
-    "hcc": 0.3961029052734375,
-    "lcc": 0.44901806116104126,
-    "mcc": 0.3288780450820923,
-    "prate": 3.108070450252853e-05,
-    "sd": 8.107526082312688e-05,
-    "sr": 12905302.0,
-    "t2m": 283.48333740234375,
-    "tcc": 0.7049227356910706,
-    "u10": 1.7677178382873535,
-    "u100": 2.393547296524048,
-    "u200": 2.7963004112243652,
-    "v10": 0.985887885093689,
-    "v100": 1.4244288206100464,
-    "v200": 1.6010299921035767,
-    "diff_dlwrf": 1136464.0,
-    "diff_dswrf": 420584.6875,
-    "diff_duvrs": 48265.4765625,
-    "diff_sr": 469169.5,
-}
-
-ECMWF_STD = _to_data_array(ECMWF_STD)
-ECMWF_MEAN = _to_data_array(ECMWF_MEAN)
-
-# ------ GFS
-GFS_STD = {
-    "dlwrf": 96.305916,
-    "dswrf": 246.18533,
-    "hcc": 42.525383,
-    "lcc": 44.3732,
-    "mcc": 43.150745,
-    "prate": 0.00010159573,
-    "r": 25.440672,
-    "sde": 0.43345627,
-    "t": 22.825893,
-    "tcc": 41.030598,
-    "u10": 5.470838,
-    "u100": 6.8899174,
-    "v10": 4.7401133,
-    "v100": 6.076132,
-    "vis": 8294.022,
-    "u": 10.614556,
-    "v": 7.176398,
-}
-
-GFS_MEAN = {
-    "dlwrf": 298.342,
-    "dswrf": 168.12321,
-    "hcc": 35.272,
-    "lcc": 43.578342,
-    "mcc": 33.738823,
-    "prate": 2.8190969e-05,
-    "r": 18.359747,
-    "sde": 0.36937004,
-    "t": 278.5223,
-    "tcc": 66.841606,
-    "u10": -0.0022310058,
-    "u100": 0.0823025,
-    "v10": 0.06219831,
-    "v100": 0.0797807,
-    "vis": 19628.32,
-    "u": 11.645444,
-    "v": 0.12330122,
-}
-
-GFS_STD = _to_data_array(GFS_STD)
-GFS_MEAN = _to_data_array(GFS_MEAN)
-
-
-NWP_STDS = NWPStatDict(
-    ukv=UKV_STD,
-    ecmwf=ECMWF_STD,
-    gfs=GFS_STD
-)
-NWP_MEANS = NWPStatDict(
-    ukv=UKV_MEAN,
-    ecmwf=ECMWF_MEAN,
-    gfs=GFS_MEAN
-)
-
-# ------ Satellite
-# RSS Mean and std values from randomised 20% of 2020 imagery
-
-RSS_STD = {
-    "HRV": 0.11405209,
-    "IR_016": 0.21462157,
-    "IR_039": 0.04618041,
-    "IR_087": 0.06687243,
-    "IR_097": 0.0468558,
-    "IR_108": 0.17482725,
-    "IR_120": 0.06115861,
-    "IR_134": 0.04492306,
-    "VIS006": 0.12184761,
-    "VIS008": 0.13090034,
-    "WV_062": 0.16111417,
-    "WV_073": 0.12924142,
-}
-
-RSS_MEAN = {
-    "HRV": 0.09298719,
-    "IR_016": 0.17594202,
-    "IR_039": 0.86167645,
-    "IR_087": 0.7719318,
-    "IR_097": 0.8014212,
-    "IR_108": 0.71254843,
-    "IR_120": 0.89058584,
-    "IR_134": 0.944365,
-    "VIS006": 0.09633306,
-    "VIS008": 0.11426069,
-    "WV_062": 0.7359355,
-    "WV_073": 0.62479186,
-}
-
-RSS_STD = _to_data_array(RSS_STD)
-RSS_MEAN = _to_data_array(RSS_MEAN)
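With `constants.py` gone, downstream code can no longer import these normalisation tables from the package. For reference, a minimal sketch of the per-channel standardisation they support under 0.1.11; `nwp_da` is a stand-in array and its channel subset is arbitrary:

import numpy as np
import xarray as xr

# Works on 0.1.11; this module no longer exists in 0.1.17.
from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS

# Stand-in UKV-like data with a "channel" coordinate matching the table keys.
nwp_da = xr.DataArray(
    np.random.rand(3, 4).astype(np.float32),
    dims=("channel", "step"),
    coords={"channel": ["t", "dswrf", "lcc"]},
)

# xarray aligns the subtraction/division on the shared "channel" labels.
normed = (nwp_da - NWP_MEANS["ukv"]) / NWP_STDS["ukv"]

# NWP_MEANS["icon"] would raise KeyError: "icon" was not in NWP_PROVIDERS.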