ocf-data-sampler 0.5.3__tar.gz → 0.5.6__tar.gz


Potentially problematic release.

Files changed (69)
  1. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/PKG-INFO +10 -3
  2. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/README.md +6 -0
  3. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/utils.py +1 -2
  4. ocf_data_sampler-0.5.6/ocf_data_sampler/load/open_xarray_tensorstore.py +167 -0
  5. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/satellite.py +1 -3
  6. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler.egg-info/PKG-INFO +10 -3
  7. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler.egg-info/SOURCES.txt +1 -1
  8. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler.egg-info/requires.txt +2 -1
  9. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/pyproject.toml +3 -2
  10. ocf_data_sampler-0.5.3/ocf_data_sampler/load/open_tensorstore_zarrs.py +0 -92
  11. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/LICENSE +0 -0
  12. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/__init__.py +0 -0
  13. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/config/__init__.py +0 -0
  14. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/config/load.py +0 -0
  15. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/config/model.py +0 -0
  16. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/config/save.py +0 -0
  17. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/data/uk_gsp_locations_20220314.csv +0 -0
  18. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/data/uk_gsp_locations_20250109.csv +0 -0
  19. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/__init__.py +0 -0
  20. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/gsp.py +0 -0
  21. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/load_dataset.py +0 -0
  22. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/__init__.py +0 -0
  23. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/nwp.py +0 -0
  24. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/__init__.py +0 -0
  25. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/cloudcasting.py +0 -0
  26. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/ecmwf.py +0 -0
  27. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/gfs.py +0 -0
  28. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/icon.py +0 -0
  29. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/nwp/providers/ukv.py +0 -0
  30. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/site.py +0 -0
  31. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/load/utils.py +0 -0
  32. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/__init__.py +0 -0
  33. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/collate.py +0 -0
  34. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/common_types.py +0 -0
  35. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/datetime_features.py +0 -0
  36. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/gsp.py +0 -0
  37. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/nwp.py +0 -0
  38. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/satellite.py +0 -0
  39. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/site.py +0 -0
  40. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/numpy_sample/sun_position.py +0 -0
  41. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/__init__.py +0 -0
  42. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/dropout.py +0 -0
  43. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/fill_time_periods.py +0 -0
  44. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/find_contiguous_time_periods.py +0 -0
  45. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/geospatial.py +0 -0
  46. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/location.py +0 -0
  47. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/select_spatial_slice.py +0 -0
  48. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/select/select_time_slice.py +0 -0
  49. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/datasets/__init__.py +0 -0
  50. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/datasets/pvnet_uk.py +0 -0
  51. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/datasets/site.py +0 -0
  52. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/sample/__init__.py +0 -0
  53. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/sample/base.py +0 -0
  54. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/sample/site.py +0 -0
  55. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/sample/uk_regional.py +0 -0
  56. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/__init__.py +0 -0
  57. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py +0 -0
  58. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py +0 -0
  59. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py +0 -0
  60. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py +0 -0
  61. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/valid_time_periods.py +0 -0
  62. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/torch_datasets/utils/validation_utils.py +0 -0
  63. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler/utils.py +0 -0
  64. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler.egg-info/dependency_links.txt +0 -0
  65. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/ocf_data_sampler.egg-info/top_level.txt +0 -0
  66. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/scripts/download_gsp_location_data.py +0 -0
  67. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/scripts/refactor_site.py +0 -0
  68. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/setup.cfg +0 -0
  69. {ocf_data_sampler-0.5.3 → ocf_data_sampler-0.5.6}/utils/compute_icon_mean_stddev.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ocf-data-sampler
- Version: 0.5.3
+ Version: 0.5.6
  Author: James Fulton, Peter Dudfield
  Author-email: Open Climate Fix team <info@openclimatefix.org>
  License: MIT License
@@ -28,14 +28,14 @@ License: MIT License
  Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  Requires-Dist: torch
  Requires-Dist: numpy
  Requires-Dist: pandas
  Requires-Dist: xarray
  Requires-Dist: zarr
- Requires-Dist: numcodecs==0.13.1
+ Requires-Dist: numcodecs
  Requires-Dist: dask
  Requires-Dist: matplotlib
  Requires-Dist: pvlib
@@ -45,6 +45,7 @@ Requires-Dist: pyaml_env
  Requires-Dist: pyresample
  Requires-Dist: h5netcdf
  Requires-Dist: xarray-tensorstore==0.1.5
+ Requires-Dist: zarr>=3

  # ocf-data-sampler

@@ -62,6 +63,12 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
  > [!Note]
  > This repository is still in early development and large changes to the user facing functions may still occur.

+ ## Licence
+
+ This project is primarily licensed under the MIT License (see LICENSE).
+
+ It includes and adapts internal functions from the Google xarray-tensorstore project, licensed under the Apache License, Version 2.0.
+
  ## Documentation

  **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
README.md
@@ -14,6 +14,12 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
  > [!Note]
  > This repository is still in early development and large changes to the user facing functions may still occur.

+ ## Licence
+
+ This project is primarily licensed under the MIT License (see LICENSE).
+
+ It includes and adapts internal functions from the Google xarray-tensorstore project, licensed under the Apache License, Version 2.0.
+
  ## Documentation

  **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
ocf_data_sampler/load/nwp/providers/utils.py
@@ -3,9 +3,8 @@
  from glob import glob

  import xarray as xr
- from xarray_tensorstore import open_zarr

- from ocf_data_sampler.load.open_tensorstore_zarrs import open_zarrs
+ from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs


  def open_zarr_paths(
ocf_data_sampler-0.5.6/ocf_data_sampler/load/open_xarray_tensorstore.py (new file)
@@ -0,0 +1,167 @@
+ """Utilities for loading TensorStore data into Xarray.
+
+ This module uses and adapts internal functions from the Google xarray-tensorstore project [1],
+ licensed under the Apache License, Version 2.0. See [2] for details.
+
+ Modifications copyright 2025 Open Climate Fix. Licensed under the MIT License.
+
+ Modifications from the original include:
+ - Adding support for opening multiple zarr files as a single xarray object
+ - Support for zarr 3 -> https://github.com/google/xarray-tensorstore/pull/22
+
+ References:
+ [1] https://github.com/google-research/tensorstore/blob/main/tensorstore/xarray.py
+ [2] https://www.apache.org/licenses/LICENSE-2.0
+ """
+
+ import os.path
+ import re
+
+ import tensorstore as ts
+ import xarray as xr
+ import zarr
+ from xarray_tensorstore import (
+     _DEFAULT_STORAGE_DRIVER,
+     _raise_if_mask_and_scale_used_for_data_vars,
+     _TensorStoreAdapter,
+ )
+
+
+ def _zarr_spec_from_path(path: str, zarr_format: int) -> ...:
+     if re.match(r"\w+\://", path):  # path is a URI
+         kv_store = path
+     else:
+         kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path}
+     return {"driver": f"zarr{zarr_format}", "kvstore": kv_store}
+
+
+ def _get_data_variable_array_futures(
+     path: str,
+     context: ts.Context | None,
+     variables: list[str],
+ ) -> dict[str, ts.Future]:
+     """Open all data variables in a zarr group and return futures.
+
+     Args:
+         path: path or URI to zarr group to open.
+         context: TensorStore configuration options to use when opening arrays.
+         variables: The variables in the zarr group to open.
+     """
+     zarr_format = zarr.open(path).metadata.zarr_format
+     specs = {k: _zarr_spec_from_path(os.path.join(path, k), zarr_format) for k in variables}
+     return {k: ts.open(spec, read=True, write=False, context=context) for k, spec in specs.items()}
+
+
+ def _tensorstore_open_zarrs(
+     paths: list[str],
+     data_vars: list[str],
+     concat_axes: list[int],
+     context: ts.Context,
+ ) -> dict[str, ts.TensorStore]:
+     """Open multiple zarrs with TensorStore.
+
+     Args:
+         paths: List of paths to zarr stores.
+         data_vars: List of data variable names to open.
+         concat_axes: List of axes along which to concatenate the data variables.
+         context: TensorStore context.
+     """
+     # Open all the variables from all the datasets - returned as futures
+     arrays_list: list[dict[str, ts.Future]] = []
+     for path in paths:
+         arrays_list.append(_get_data_variable_array_futures(path, context, data_vars))
+
+     # Wait for the async open operations
+     arrays_list = [{k: v.result() for k, v in arrays.items()} for arrays in arrays_list]
+
+     # Concatenate each of the variables along the required axis
+     arrays = {}
+     for k, axis in zip(data_vars, concat_axes, strict=True):
+         variable_arrays = [d[k] for d in arrays_list]
+         arrays[k] = ts.concat(variable_arrays, axis=axis)
+
+     return arrays
+
+
+ def open_zarr(
+     path: str,
+     context: ts.Context | None = None,
+     mask_and_scale: bool = True,
+ ) -> xr.Dataset:
+     """Open an xarray.Dataset from zarr using TensorStore.
+
+     Args:
+         path: path or URI to zarr group to open.
+         context: TensorStore configuration options to use when opening arrays.
+         mask_and_scale: if True (default), attempt to apply masking and scaling like
+             xarray.open_zarr(). This is only supported for coordinate variables and
+             otherwise will raise an error.
+
+     Returns:
+         Dataset with all data variables opened via TensorStore.
+     """
+     if context is None:
+         context = ts.Context()
+
+     # Avoid using dask by setting `chunks=None`
+     ds = xr.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale)
+
+     if mask_and_scale:
+         _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+     # Open all data variables using tensorstore - returned as futures
+     data_vars = list(ds.data_vars)
+     arrays = _get_data_variable_array_futures(path, context, data_vars)
+
+     # Wait for the async open operations
+     arrays = {k: v.result() for k, v in arrays.items()}
+
+     # Adapt the tensorstore arrays and plug them into the xarray object
+     new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+     return ds.copy(data=new_data)
+
+
+ def open_zarrs(
+     paths: list[str],
+     concat_dim: str,
+     context: ts.Context | None = None,
+     mask_and_scale: bool = True,
+ ) -> xr.Dataset:
+     """Open multiple zarrs with TensorStore.
+
+     Args:
+         paths: List of paths to zarr stores.
+         concat_dim: Dimension along which to concatenate the data variables.
+         context: TensorStore context.
+         mask_and_scale: Whether to mask and scale the data.
+
+     Returns:
+         Concatenated Dataset with all data variables opened via TensorStore.
+     """
+     if context is None:
+         context = ts.Context()
+
+     ds_list = [xr.open_zarr(p, mask_and_scale=mask_and_scale, decode_timedelta=True) for p in paths]
+     ds = xr.concat(
+         ds_list,
+         dim=concat_dim,
+         data_vars="minimal",
+         compat="equals",
+         combine_attrs="no_conflicts",
+     )
+
+     if mask_and_scale:
+         _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+     # Find the axis along which each data array must be concatenated
+     data_vars = list(ds.data_vars)
+     concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
+
+     # Open and concat all zarrs so each variable is a single TensorStore array
+     arrays = _tensorstore_open_zarrs(paths, data_vars, concat_axes, context)
+
+     # Plug the arrays into the xarray object
+     new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+     return ds.copy(data=new_data)
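
The two public entry points above replace the previous `open_tensorstore_zarrs.open_zarrs` helper and the direct `xarray_tensorstore.open_zarr` import. A minimal usage sketch follows; the store paths and the `time` concat dimension are hypothetical, not taken from the diff:

import tensorstore as ts

from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs

# Single store: data variables end up backed by TensorStore rather than dask
ds = open_zarr("data/sat_2020.zarr")

# Several stores concatenated along a shared dimension; each data variable
# becomes one ts.concat-ed TensorStore array plugged into the Dataset
ds_multi = open_zarrs(
    ["data/sat_2020.zarr", "data/sat_2021.zarr"],
    concat_dim="time",
    context=ts.Context(),
)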
ocf_data_sampler/load/satellite.py
@@ -1,16 +1,14 @@
  """Satellite loader."""
  import numpy as np
  import xarray as xr
- from xarray_tensorstore import open_zarr

+ from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs
  from ocf_data_sampler.load.utils import (
      check_time_unique_increasing,
      get_xr_data_array_from_xr_dataset,
      make_spatial_coords_increasing,
  )

- from .open_tensorstore_zarrs import open_zarrs
-

  def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray:
      """Lazily opens the zarr store and validates data types.
ocf_data_sampler.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ocf-data-sampler
- Version: 0.5.3
+ Version: 0.5.6
  Author: James Fulton, Peter Dudfield
  Author-email: Open Climate Fix team <info@openclimatefix.org>
  License: MIT License
@@ -28,14 +28,14 @@ License: MIT License
  Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  Requires-Dist: torch
  Requires-Dist: numpy
  Requires-Dist: pandas
  Requires-Dist: xarray
  Requires-Dist: zarr
- Requires-Dist: numcodecs==0.13.1
+ Requires-Dist: numcodecs
  Requires-Dist: dask
  Requires-Dist: matplotlib
  Requires-Dist: pvlib
@@ -45,6 +45,7 @@ Requires-Dist: pyaml_env
  Requires-Dist: pyresample
  Requires-Dist: h5netcdf
  Requires-Dist: xarray-tensorstore==0.1.5
+ Requires-Dist: zarr>=3

  # ocf-data-sampler

@@ -62,6 +63,12 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
  > [!Note]
  > This repository is still in early development and large changes to the user facing functions may still occur.

+ ## Licence
+
+ This project is primarily licensed under the MIT License (see LICENSE).
+
+ It includes and adapts internal functions from the Google xarray-tensorstore project, licensed under the Apache License, Version 2.0.
+
  ## Documentation

  **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
ocf_data_sampler.egg-info/SOURCES.txt
@@ -17,7 +17,7 @@ ocf_data_sampler/data/uk_gsp_locations_20250109.csv
  ocf_data_sampler/load/__init__.py
  ocf_data_sampler/load/gsp.py
  ocf_data_sampler/load/load_dataset.py
- ocf_data_sampler/load/open_tensorstore_zarrs.py
+ ocf_data_sampler/load/open_xarray_tensorstore.py
  ocf_data_sampler/load/satellite.py
  ocf_data_sampler/load/site.py
  ocf_data_sampler/load/utils.py
ocf_data_sampler.egg-info/requires.txt
@@ -3,7 +3,7 @@ numpy
  pandas
  xarray
  zarr
- numcodecs==0.13.1
+ numcodecs
  dask
  matplotlib
  pvlib
@@ -13,3 +13,4 @@ pyaml_env
  pyresample
  h5netcdf
  xarray-tensorstore==0.1.5
+ zarr>=3
pyproject.toml
@@ -9,7 +9,7 @@ build-backend = "setuptools.build_meta"
  name = "ocf-data-sampler"
  dynamic = ["version"] # Set automatically using git: https://setuptools-git-versioning.readthedocs.io/en/stable/
  readme = { file = "README.md", content-type = "text/markdown" }
- requires-python = ">=3.10"
+ requires-python = ">=3.11"
  license = { file = "LICENSE" }
  authors = [
      { name = "Open Climate Fix team", email = "info@openclimatefix.org" },
@@ -26,7 +26,7 @@ dependencies = [
      "pandas",
      "xarray",
      "zarr",
-     "numcodecs==0.13.1",
+     "numcodecs",
      "dask",
      "matplotlib",
      "pvlib",
@@ -36,6 +36,7 @@ dependencies = [
      "pyresample",
      "h5netcdf",
      "xarray-tensorstore==0.1.5",
+     "zarr>=3",
  ]

  [dependency-groups]
ocf_data_sampler-0.5.3/ocf_data_sampler/load/open_tensorstore_zarrs.py (deleted)
@@ -1,92 +0,0 @@
- """Open multiple zarrs with TensorStore.
-
- This extends the functionality of xarray_tensorstore to open multiple zarr stores
- """
-
- import os
-
- import tensorstore as ts
- import xarray as xr
- from xarray_tensorstore import (
-     _raise_if_mask_and_scale_used_for_data_vars,
-     _TensorStoreAdapter,
-     _zarr_spec_from_path,
- )
-
-
- def tensorstore_open_multi_zarrs(
-     paths: list[str],
-     data_vars: list[str],
-     concat_axes: list[int],
-     context: ts.Context,
-     write: bool,
- ) -> dict[str, ts.TensorStore]:
-     """Open multiple zarrs with TensorStore.
-
-     Args:
-         paths: List of paths to zarr stores.
-         data_vars: List of data variable names to open.
-         concat_axes: List of axes along which to concatenate the data variables.
-         context: TensorStore context.
-         write: Whether to open the stores for writing.
-     """
-     arrays_list = []
-     for path in paths:
-         specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in data_vars}
-         array_futures = {
-             k: ts.open(spec, read=True, write=write, context=context)
-             for k, spec in specs.items()
-         }
-         arrays_list.append({k: v.result() for k, v in array_futures.items()})
-
-     arrays = {}
-     for k, axis in zip(data_vars, concat_axes, strict=False):
-         datasets = [d[k] for d in arrays_list]
-         arrays[k] = ts.concat(datasets, axis=axis)
-
-     return arrays
-
-
- def open_zarrs(
-     paths: list[str],
-     concat_dim: str,
-     *,
-     context: ts.Context | None = None,
-     mask_and_scale: bool = True,
-     write: bool = False,
- ) -> xr.Dataset:
-     """Open multiple zarrs with TensorStore.
-
-     Args:
-         paths: List of paths to zarr stores.
-         concat_dim: Dimension along which to concatenate the data variables.
-         context: TensorStore context.
-         mask_and_scale: Whether to mask and scale the data.
-         write: Whether to open the stores for writing.
-     """
-     if context is None:
-         context = ts.Context()
-
-     ds = xr.open_mfdataset(
-         paths,
-         concat_dim=concat_dim,
-         combine="nested",
-         mask_and_scale=mask_and_scale,
-         decode_timedelta=True,
-     )
-
-     if mask_and_scale:
-         # Data variables get replaced below with _TensorStoreAdapter arrays, which
-         # don't get masked or scaled. Raising an error avoids surprising users with
-         # incorrect data values.
-         _raise_if_mask_and_scale_used_for_data_vars(ds)
-
-     data_vars = list(ds.data_vars)
-
-     concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
-
-     arrays = tensorstore_open_multi_zarrs(paths, data_vars, concat_axes, context, write)
-
-     new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
-
-     return ds.copy(data=new_data)