ocf-data-sampler 0.5.7__tar.gz → 0.5.9__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of ocf-data-sampler might be problematic.

Files changed (69)
  1. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/PKG-INFO +1 -1
  2. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +3 -3
  3. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/datasets/site.py +3 -3
  4. ocf_data_sampler-0.5.9/ocf_data_sampler/utils.py +37 -0
  5. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler.egg-info/PKG-INFO +1 -1
  6. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler.egg-info/SOURCES.txt +1 -2
  7. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler.egg-info/top_level.txt +0 -1
  8. ocf_data_sampler-0.5.7/ocf_data_sampler/utils.py +0 -21
  9. ocf_data_sampler-0.5.7/utils/compute_icon_mean_stddev.py +0 -72
  10. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/LICENSE +0 -0
  11. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/README.md +0 -0
  12. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/__init__.py +0 -0
  13. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/config/__init__.py +0 -0
  14. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/config/load.py +0 -0
  15. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/config/model.py +0 -0
  16. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/config/save.py +0 -0
  17. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/data/uk_gsp_locations_20220314.csv +0 -0
  18. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/data/uk_gsp_locations_20250109.csv +0 -0
  19. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/__init__.py +0 -0
  20. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/gsp.py +0 -0
  21. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/load_dataset.py +0 -0
  22. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/__init__.py +0 -0
  23. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/nwp.py +0 -0
  24. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
  25. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/cloudcasting.py +0 -0
  26. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
  27. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/gfs.py +0 -0
  28. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/icon.py +0 -0
  29. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
  30. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/nwp/providers/utils.py +0 -0
  31. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/open_xarray_tensorstore.py +0 -0
  32. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/satellite.py +0 -0
  33. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/site.py +0 -0
  34. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/load/utils.py +0 -0
  35. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/__init__.py +0 -0
  36. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/collate.py +0 -0
  37. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/common_types.py +0 -0
  38. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/datetime_features.py +0 -0
  39. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/gsp.py +0 -0
  40. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/nwp.py +0 -0
  41. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/satellite.py +0 -0
  42. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/site.py +0 -0
  43. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/numpy_sample/sun_position.py +0 -0
  44. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/__init__.py +0 -0
  45. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/dropout.py +0 -0
  46. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/fill_time_periods.py +0 -0
  47. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/find_contiguous_time_periods.py +0 -0
  48. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/geospatial.py +0 -0
  49. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/location.py +0 -0
  50. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
  51. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/select/select_time_slice.py +0 -0
  52. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/datasets/__init__.py +0 -0
  53. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/sample/__init__.py +0 -0
  54. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/sample/base.py +0 -0
  55. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/sample/site.py +0 -0
  56. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/sample/uk_regional.py +0 -0
  57. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/__init__.py +0 -0
  58. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py +0 -0
  59. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +0 -0
  60. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py +0 -0
  61. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +0 -0
  62. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +0 -0
  63. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler/torch_datasets/utils/validation_utils.py +0 -0
  64. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
  65. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/ocf_data_sampler.egg-info/requires.txt +0 -0
  66. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/pyproject.toml +0 -0
  67. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/scripts/download_gsp_location_data.py +0 -0
  68. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/scripts/refactor_site.py +0 -0
  69. {ocf_data_sampler-0.5.7 → ocf_data_sampler-0.5.9}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ocf-data-sampler
- Version: 0.5.7
+ Version: 0.5.9
  Author: James Fulton, Peter Dudfield
  Author-email: Open Climate Fix team <info@openclimatefix.org>
  License: MIT License
ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py

@@ -30,7 +30,7 @@ from ocf_data_sampler.torch_datasets.utils.merge_and_fill_utils import (
      fill_nans_in_arrays,
      merge_dicts,
  )
- from ocf_data_sampler.utils import compute, minutes
+ from ocf_data_sampler.utils import minutes, tensorstore_compute

  xr.set_options(keep_attrs=True)

@@ -254,7 +254,7 @@ class PVNetUKRegionalDataset(AbstractPVNetUKDataset):
          """
          sample_dict = slice_datasets_by_space(self.datasets_dict, location, self.config)
          sample_dict = slice_datasets_by_time(sample_dict, t0, self.config)
-         sample_dict = compute(sample_dict)
+         sample_dict = tensorstore_compute(sample_dict)

          return self.process_and_combine_datasets(sample_dict, t0, location)

@@ -313,7 +313,7 @@ class PVNetUKConcurrentDataset(AbstractPVNetUKDataset):
          """
          # Slice by time then load to avoid loading the data multiple times from disk
          sample_dict = slice_datasets_by_time(self.datasets_dict, t0, self.config)
-         sample_dict = compute(sample_dict)
+         sample_dict = tensorstore_compute(sample_dict)

          gsp_samples = []
ocf_data_sampler/torch_datasets/datasets/site.py

@@ -34,7 +34,7 @@ from ocf_data_sampler.torch_datasets.utils.merge_and_fill_utils import (
      fill_nans_in_arrays,
      merge_dicts,
  )
- from ocf_data_sampler.utils import compute, minutes
+ from ocf_data_sampler.utils import minutes, tensorstore_compute

  xr.set_options(keep_attrs=True)

@@ -272,7 +272,7 @@ class SitesDataset(Dataset):
          sample_dict = slice_datasets_by_space(self.datasets_dict, location, self.config)
          sample_dict = slice_datasets_by_time(sample_dict, t0, self.config)

-         sample_dict = compute(sample_dict)
+         sample_dict = tensorstore_compute(sample_dict)

          return process_and_combine_datasets(
              sample_dict,

@@ -408,7 +408,7 @@ class SitesDatasetConcurrent(Dataset):
          """
          # slice by time first as we want to keep all site id info
          sample_dict = slice_datasets_by_time(self.datasets_dict, t0, self.config)
-         sample_dict = compute(sample_dict)
+         sample_dict = tensorstore_compute(sample_dict)

          site_samples = []
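In both dataset modules the only behavioural change is this swap from the dask-backed compute helper to tensorstore_compute at the point where a sliced sample is materialised. A minimal usage sketch of the new call path, assuming the underlying stores were opened with xarray_tensorstore.open_zarr (the package's helper itself only imports read; the keys and paths below are illustrative, not taken from the package):

import xarray_tensorstore

from ocf_data_sampler.utils import tensorstore_compute

# Illustrative nested sample dict, mirroring the {source: {provider: dataset}}
# layout that the slicing utilities return (keys and paths are assumptions)
sample_dict = {
    "nwp": {"ukv": xarray_tensorstore.open_zarr("nwp_ukv.zarr")},
    "sat": xarray_tensorstore.open_zarr("satellite.zarr"),
}

# 0.5.7: each array was loaded eagerly via dask
# sample_dict = compute(sample_dict)

# 0.5.9: asynchronous tensorstore reads are started for every array first,
# then a single compute pass blocks until they have all landed in memory
sample_dict = tensorstore_compute(sample_dict)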
ocf_data_sampler-0.5.9/ocf_data_sampler/utils.py (new file)

@@ -0,0 +1,37 @@
+ """Miscellaneous helper functions."""
+
+ import pandas as pd
+ from xarray_tensorstore import read
+
+
+ def minutes(minutes: int | list[float]) -> pd.Timedelta | pd.TimedeltaIndex:
+     """Timedelta minutes.
+
+     Args:
+         minutes: the number of minutes, single value or list
+     """
+     return pd.to_timedelta(minutes, unit="m")
+
+
+ def compute(xarray_dict: dict) -> dict:
+     """Eagerly load a nested dictionary of xarray DataArrays."""
+     for k, v in xarray_dict.items():
+         if isinstance(v, dict):
+             xarray_dict[k] = compute(v)
+         else:
+             xarray_dict[k] = v.compute()
+     return xarray_dict
+
+
+ def tensorstore_compute(xarray_dict: dict) -> dict:
+     """Eagerly read and load a nested dictionary of xarray-tensorstore DataArrays."""
+     # Kick off the tensorstore async reading
+     for k, v in xarray_dict.items():
+         if isinstance(v, dict):
+             xarray_dict[k] = tensorstore_compute(v)
+         else:
+             xarray_dict[k] = read(v)
+
+     # Running the compute function will wait until all arrays have been read
+     return compute(xarray_dict)
+
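For reference, a self-contained sketch of the plainer helpers, minutes and compute, using in-memory arrays; tensorstore_compute simply adds the asynchronous read pass shown above before delegating to compute:

import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.utils import compute, minutes

# minutes() accepts a scalar or a list and returns the matching pandas type
assert minutes(30) == pd.Timedelta(minutes=30)
assert list(minutes([0.0, 30.0, 60.0])) == [pd.Timedelta(minutes=m) for m in (0, 30, 60)]

# compute() walks arbitrarily nested dicts and loads every array,
# preserving the dict structure
nested = {
    "nwp": {"ukv": xr.DataArray(np.zeros((2, 3)), dims=("step", "x"))},
    "sat": xr.DataArray(np.ones(4), dims=("time",)),
}
loaded = compute(nested)
assert loaded["nwp"]["ukv"].shape == (2, 3)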
ocf_data_sampler.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ocf-data-sampler
- Version: 0.5.7
+ Version: 0.5.9
  Author: James Fulton, Peter Dudfield
  Author-email: Open Climate Fix team <info@openclimatefix.org>
  License: MIT License
ocf_data_sampler.egg-info/SOURCES.txt

@@ -62,5 +62,4 @@ ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py
  ocf_data_sampler/torch_datasets/utils/valid_time_periods.py
  ocf_data_sampler/torch_datasets/utils/validation_utils.py
  scripts/download_gsp_location_data.py
- scripts/refactor_site.py
- utils/compute_icon_mean_stddev.py
+ scripts/refactor_site.py
ocf_data_sampler.egg-info/top_level.txt

@@ -1,4 +1,3 @@
  dist
  ocf_data_sampler
  scripts
- utils
ocf_data_sampler-0.5.7/ocf_data_sampler/utils.py (removed)

@@ -1,21 +0,0 @@
- """Miscellaneous helper functions."""
-
- import pandas as pd
-
-
- def minutes(minutes: int | list[float]) -> pd.Timedelta | pd.TimedeltaIndex:
-     """Timedelta minutes.
-
-     Args:
-         minutes: the number of minutes, single value or list
-     """
-     return pd.to_timedelta(minutes, unit="m")
-
- def compute(xarray_dict: dict) -> dict:
-     """Eagerly load a nested dictionary of xarray DataArrays."""
-     for k, v in xarray_dict.items():
-         if isinstance(v, dict):
-             xarray_dict[k] = compute(v)
-         else:
-             xarray_dict[k] = v.compute(scheduler="single-threaded")
-     return xarray_dict
ocf_data_sampler-0.5.7/utils/compute_icon_mean_stddev.py (removed)

@@ -1,72 +0,0 @@
- """Script to compute normalisation constants from NWP data."""
-
- import argparse
- import glob
- import logging
-
- import numpy as np
- import xarray as xr
-
- from ocf_data_sampler.load.nwp.providers.icon import open_icon_eu
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Add argument parser
- parser = argparse.ArgumentParser(description="Compute normalization constants from NWP data")
- parser.add_argument("--data-path", type=str, required=True,
-                     help='Path pattern to zarr files (e.g., "/path/to/data/*.zarr.zip")')
- parser.add_argument("--n-samples", type=int, default=2000,
-                     help="Number of random samples to use (default: 2000)")
-
- args = parser.parse_args()
-
- zarr_files = glob.glob(args.data_path)
- n_samples = args.n_samples
-
- ds = open_icon_eu(zarr_files)
-
- n_init_times = ds.sizes["init_time_utc"]
- n_lats = ds.sizes["latitude"]
- n_longs = ds.sizes["longitude"]
- n_steps = ds.sizes["step"]
-
- random_init_times = np.random.choice(n_init_times, size=n_samples, replace=True)
- random_lats = np.random.choice(n_lats, size=n_samples, replace=True)
- random_longs = np.random.choice(n_longs, size=n_samples, replace=True)
- random_steps = np.random.choice(n_steps, size=n_samples, replace=True)
-
- samples = []
- for i in range(n_samples):
-     sample = ds.isel(init_time_utc=random_init_times[i],
-                      latitude=random_lats[i],
-                      longitude=random_longs[i],
-                      step=random_steps[i])
-     samples.append(sample)
-
- samples_stack = xr.concat(samples, dim="samples")
-
-
- available_channels = samples_stack.channel.values.tolist()
- logger.info("Available channels: %s", available_channels)
-
- ICON_EU_MEAN = {}
- ICON_EU_STD = {}
-
- for var in available_channels:
-     if var not in available_channels:
-         logger.warning("Variable '%s' not found in the channel coordinate; skipping.", var)
-         continue
-     var_data = samples_stack.sel(channel=var)
-     var_mean = float(var_data.mean().compute())
-     var_std = float(var_data.std().compute())
-
-     ICON_EU_MEAN[var] = var_mean
-     ICON_EU_STD[var] = var_std
-
-     logger.info("Processed %s: mean=%.4f, std=%.4f", var, var_mean, var_std)
-
- logger.info("\nMean values:\n%s", ICON_EU_MEAN)
- logger.info("\nStandard deviations:\n%s", ICON_EU_STD)