ocf-data-sampler 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ocf-data-sampler might be problematic. Click here for more details.

@@ -12,15 +12,20 @@ from ocf_data_sampler.load.utils import (
12
12
 
13
13
 
14
14
  def _get_single_sat_data(zarr_path: Path | str) -> xr.Dataset:
15
- """Helper function to open a zarr from either local or GCP path.
16
-
17
- The local or GCP path may contain wildcard matching (*)
15
+ """Helper function to open a Zarr from either a local or GCP path.
18
16
 
19
17
  Args:
20
- zarr_path: Path to zarr file
18
+ zarr_path: Path to a Zarr file. Wildcards (*) are supported **only** for local paths.
19
+ GCS paths (gs://) **do not support** wildcards.
20
+
21
+ Returns:
22
+ An xarray Dataset containing satellite data.
23
+
24
+ Raises:
25
+ ValueError: If a wildcard (*) is used in a GCS (gs://) path.
21
26
  """
22
27
 
23
- # These kwargs are used if zarr path contains "*"
28
+ # These kwargs are used if the path contains "*"
24
29
  openmf_kwargs = dict(
25
30
  engine="zarr",
26
31
  concat_dim="time",
@@ -29,19 +34,17 @@ def _get_single_sat_data(zarr_path: Path | str) -> xr.Dataset:
29
34
  join="override",
30
35
  )
31
36
 
32
- # Need to generate list of files if using GCP bucket storage
37
+ # Raise an error if a wildcard is used in a GCP path
33
38
  if "gs://" in str(zarr_path) and "*" in str(zarr_path):
34
- result_string = subprocess.run(
35
- f"gsutil ls -d {zarr_path}".split(" "), stdout=subprocess.PIPE
36
- ).stdout.decode("utf-8")
37
- files = result_string.splitlines()
38
-
39
- ds = xr.open_mfdataset(files, **openmf_kwargs)
39
+ raise ValueError("Wildcard (*) paths are not supported for GCP (gs://) URLs.")
40
40
 
41
- elif "*" in str(zarr_path): # Multi-file dataset
41
+ # Handle multi-file dataset for local paths
42
+ if "*" in str(zarr_path):
42
43
  ds = xr.open_mfdataset(zarr_path, **openmf_kwargs)
43
44
  else:
44
45
  ds = xr.open_dataset(zarr_path, engine="zarr", chunks="auto")
46
+
47
+ # Ensure time is unique and sorted
45
48
  ds = ds.drop_duplicates("time").sortby("time")
46
49
 
47
50
  return ds
@@ -53,24 +56,6 @@ def open_sat_data(zarr_path: Path | str | list[Path] | list[str]) -> xr.DataArra
53
56
  Args:
54
57
  zarr_path: Cloud URL or local path pattern, or list of these. If GCS URL, it must start with
55
58
  'gs://'.
56
-
57
- Example:
58
- With wild cards and GCS path:
59
- ```
60
- zarr_paths = [
61
- "gs://bucket/2020_nonhrv_split_*.zarr",
62
- "gs://bucket/2019_nonhrv_split_*.zarr",
63
- ]
64
- ds = open_sat_data(zarr_paths)
65
- ```
66
- Without wild cards and with local path:
67
- ```
68
- zarr_paths = [
69
- "/data/2020_nonhrv.zarr",
70
- "/data/2019_nonhrv.zarr",
71
- ]
72
- ds = open_sat_data(zarr_paths)
73
- ```
74
59
  """
75
60
 
76
61
  # Open the data
@@ -84,7 +69,7 @@ def open_sat_data(zarr_path: Path | str | list[Path] | list[str]) -> xr.DataArra
84
69
  else:
85
70
  ds = _get_single_sat_data(zarr_path)
86
71
 
87
- # Rename
72
+ # Rename dimensions
88
73
  ds = ds.rename(
89
74
  {
90
75
  "variable": "channel",
@@ -92,13 +77,13 @@ def open_sat_data(zarr_path: Path | str | list[Path] | list[str]) -> xr.DataArra
92
77
  }
93
78
  )
94
79
 
95
- # Check the timestamps are unique and increasing
80
+ # Check timestamps
96
81
  check_time_unique_increasing(ds.time_utc)
97
82
 
98
- # Make sure the spatial coords are in increasing order
83
+ # Ensure spatial coordinates are sorted
99
84
  ds = make_spatial_coords_increasing(ds, x_coord="x_geostationary", y_coord="y_geostationary")
100
85
 
101
86
  ds = ds.transpose("time_utc", "channel", "x_geostationary", "y_geostationary")
102
-
103
87
  # TODO: should we control the dtype of the DataArray?
88
+
104
89
  return get_xr_data_array_from_xr_dataset(ds)
@@ -84,16 +84,6 @@ def process_and_combine_datasets(
84
84
  )
85
85
  )
86
86
 
87
- # Add coordinate data
88
- # TODO: Do we need all of these?
89
- numpy_modalities.append(
90
- {
91
- GSPSampleKey.gsp_id: location.id,
92
- GSPSampleKey.x_osgb: location.x,
93
- GSPSampleKey.y_osgb: location.y,
94
- }
95
- )
96
-
97
87
  if target_key == 'gsp':
98
88
  # Make sun coords NumpySample
99
89
  datetimes = pd.date_range(
@@ -104,6 +94,14 @@ def process_and_combine_datasets(
104
94
 
105
95
  lon, lat = osgb_to_lon_lat(location.x, location.y)
106
96
 
97
+ numpy_modalities.append(
98
+ {
99
+ GSPSampleKey.gsp_id: location.id,
100
+ GSPSampleKey.x_osgb: location.x,
101
+ GSPSampleKey.y_osgb: location.y,
102
+ }
103
+ )
104
+
107
105
  numpy_modalities.append(
108
106
  make_sun_position_numpy_sample(datetimes, lon, lat, key_prefix=target_key)
109
107
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: ocf_data_sampler
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Sample from weather data for renewable energy prediction
5
5
  Author: James Fulton, Peter Dudfield, and the Open Climate Fix team
6
6
  Author-email: info@openclimatefix.org
@@ -9,7 +9,7 @@ ocf_data_sampler/data/uk_gsp_locations.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZt
9
9
  ocf_data_sampler/load/__init__.py,sha256=MjgfxilTzyz1RYFoBEeAXmE9hyjknLvdmlHPmlAoiQY,44
10
10
  ocf_data_sampler/load/gsp.py,sha256=Gcr1JVUOPKhFRDCSHtfPDjxx0BtyyEhXrZvGEKLPJ5I,759
11
11
  ocf_data_sampler/load/load_dataset.py,sha256=Ua3RaUg4PIYJkD9BKqTfN8IWUbezbhThJGgEkd9PcaE,1587
12
- ocf_data_sampler/load/satellite.py,sha256=3KlA1fx4SwxdzM-jC1WRaONXO0D6m0WxORnEnwUnZrA,2967
12
+ ocf_data_sampler/load/satellite.py,sha256=f2Q7FSyySOf7DeHxcigHd-vk-J-U4S2pXg_CnhnhuwU,2571
13
13
  ocf_data_sampler/load/site.py,sha256=P83uz01WBDzoZajdOH0m8FQt4-buKDlUk19N548KqhA,1086
14
14
  ocf_data_sampler/load/utils.py,sha256=sAEkPMS9LXVCrc5pANQo97zaoEItVg9hoNj2ZWfx_Ug,1405
15
15
  ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
@@ -41,7 +41,7 @@ ocf_data_sampler/select/select_time_slice.py,sha256=9M-yvDv9K77XfEys_OIR31_aVB56
41
41
  ocf_data_sampler/select/spatial_slice_for_dataset.py,sha256=3tRrMBXr7s4CnClbVSIq7hpls3H4Y3qYTDwswcxCCCE,1763
42
42
  ocf_data_sampler/select/time_slice_for_dataset.py,sha256=Z7pOiilSHScxmBKZNG18K5J-S4ifdXXAYGZoHRHD3AY,4324
43
43
  ocf_data_sampler/torch_datasets/datasets/__init__.py,sha256=jfJSFcR0eO1AqeH7S3KnGjsBqVZT5w3oyi784PUR6Q0,146
44
- ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=rodvVSR4sh8qZ2hLdI8qAc3lyxq5U7cVGfS4rRKCzbs,11944
44
+ ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py,sha256=4lqniFbUNt1qWSct4ISavXg9C7FM5cdVu48JHd7A9Pk,11873
45
45
  ocf_data_sampler/torch_datasets/datasets/site.py,sha256=5T8nkTMUHHFidZRuFOunYeKAqNuyZ8V7sikBoBOBwwA,16033
46
46
  ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=hIbekql64eXsNDFIoEc--GWxwdVWrh2qKegdOi70Bow,874
47
47
  ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=Qo65qUHtle_bW5tLTYr7empHTRv-lpjvfx_6GNJj3Xg,4371
@@ -70,10 +70,10 @@ tests/test_sample/test_base.py,sha256=ljtB38MmscTGN6OvUgclBceNnfx6m7AN8iHYDml9XW
70
70
  tests/test_sample/test_site_sample.py,sha256=Gln-Or060cUWvA7Q7c1vsthgCttOAM2z9yBI9zUIrDw,6238
71
71
  tests/test_sample/test_uk_regional_sample.py,sha256=gkeQWC2wC757jKJz_QBmDMFQjn3R54q_tEo948yyxCY,4840
72
72
  tests/torch_datasets/test_merge_and_fill_utils.py,sha256=GtuQg82BM1eHQjT7Ik1x1zaVcuc7KJO4_NC9stXsd4s,1123
73
- tests/torch_datasets/test_pvnet_uk.py,sha256=OzT9ArdnWPa3iJKggxc2-7npkDqWmZyS5pzM4M08NZU,5566
73
+ tests/torch_datasets/test_pvnet_uk.py,sha256=loueo7PUUYJVda3-vBn3bQIC_zgrTAThfx-GTDcBOZg,5596
74
74
  tests/torch_datasets/test_site.py,sha256=5MH5zkHFJXekwpnV6nHuSxt_sRNu9_mxiUjfWqmEhr0,6966
75
- ocf_data_sampler-0.1.3.dist-info/LICENSE,sha256=F-Q3UFCR-BECSocV55BFDpn4YKxve9PKrm-lTt6o_Tg,1073
76
- ocf_data_sampler-0.1.3.dist-info/METADATA,sha256=c5LEfePIqtFvxzWabUufEZkAhIZqp8ep-cHLUH61zAU,12173
77
- ocf_data_sampler-0.1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
78
- ocf_data_sampler-0.1.3.dist-info/top_level.txt,sha256=Faob6N6cFdPc5eUpCTYcXgCaNhi4XLLteUL5W5ayYmg,31
79
- ocf_data_sampler-0.1.3.dist-info/RECORD,,
75
+ ocf_data_sampler-0.1.5.dist-info/LICENSE,sha256=F-Q3UFCR-BECSocV55BFDpn4YKxve9PKrm-lTt6o_Tg,1073
76
+ ocf_data_sampler-0.1.5.dist-info/METADATA,sha256=PetECVCNM6jys05FuPsOVmntGurbxTuW3n1_j7CYCLE,12173
77
+ ocf_data_sampler-0.1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
78
+ ocf_data_sampler-0.1.5.dist-info/top_level.txt,sha256=Faob6N6cFdPc5eUpCTYcXgCaNhi4XLLteUL5W5ayYmg,31
79
+ ocf_data_sampler-0.1.5.dist-info/RECORD,,
@@ -55,6 +55,7 @@ def test_process_and_combine_datasets(pvnet_config_filename):
55
55
  assert "nwp" in sample
56
56
  assert sample["satellite_actual"].shape == (7, 1, 2, 2)
57
57
  assert sample["nwp"]["ukv"]["nwp"].shape == (4, 1, 2, 2)
58
+ assert "gsp_id" in sample
58
59
 
59
60
 
60
61
  def test_compute():