PyPI - mxalign - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mxalign 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

mxalign/__init__.py +36 -0
mxalign/accessors/__init__.py +7 -0
mxalign/accessors/space.py +205 -0
mxalign/accessors/time.py +180 -0
mxalign/align/__init__.py +7 -0
mxalign/align/nans.py +72 -0
mxalign/align/space.py +21 -0
mxalign/align/time.py +62 -0
mxalign/cli.py +157 -0
mxalign/interpolations/__init__.py +9 -0
mxalign/interpolations/base.py +29 -0
mxalign/interpolations/delaunay.py +218 -0
mxalign/interpolations/interpolate.py +29 -0
mxalign/interpolations/registry.py +17 -0
mxalign/interpolations/xarray.py +63 -0
mxalign/loaders/__init__.py +11 -0
mxalign/loaders/anemoi_datasets.py +92 -0
mxalign/loaders/anemoi_inference.py +103 -0
mxalign/loaders/base.py +103 -0
mxalign/loaders/harp_obstable.py +81 -0
mxalign/loaders/loader.py +8 -0
mxalign/loaders/registry.py +17 -0
mxalign/properties/__init__.py +0 -0
mxalign/properties/properties.py +25 -0
mxalign/properties/specs.py +54 -0
mxalign/properties/utils.py +43 -0
mxalign/properties/validation.py +48 -0
mxalign/runner.py +167 -0
mxalign/transformations/__init__.py +7 -0
mxalign/transformations/base.py +38 -0
mxalign/transformations/external.py +34 -0
mxalign/transformations/registry.py +20 -0
mxalign/transformations/transform.py +28 -0
mxalign/utils/config.py +55 -0
mxalign/utils/dates.py +76 -0
mxalign/utils/projections.py +104 -0
mxalign/utils/save.py +62 -0
mxalign/verification.py +57 -0
mxalign-0.1.0.dist-info/METADATA +136 -0
mxalign-0.1.0.dist-info/RECORD +43 -0
mxalign-0.1.0.dist-info/WHEEL +4 -0
mxalign-0.1.0.dist-info/entry_points.txt +2 -0
mxalign-0.1.0.dist-info/licenses/LICENSE +21 -0

mxalign/cli.py ADDED Viewed

@@ -0,0 +1,157 @@
+import argparse
+import sys
+import logging
+# Logging layout shared by the CLI: "<timestamp> - <level> - <message>"
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+DATE_FORMAT = "%Y-%m-%d %H:%M:%S"  # passed as datefmt to logging.basicConfig
+LOG = logging.getLogger(__name__)  # module-level logger used by the CLI commands
+def run_local(args):
+    # Only import the necessary modules if function is called
+    # to avoid unnecessary slow imports at the top level
+    from dask.distributed import Client, LocalCluster
+    from .runner import Runner
+    cluster = LocalCluster(
+        n_workers=args.n_workers,
+        threads_per_worker=args.threads_per_worker,
+        processes=True,
+    )
+    client = Client(cluster)
+    runner = Runner(args.CONFIG)
+    try:
+        runner.run()
+    except Exception:
+        LOG.error("Error during verification closing down dask cluster", exc_info=True)
+        client.close()
+        cluster.close()
+        sys.exit(1)
+def run_slurm(args):
+    # Only import the necessary modules if function is called
+    # to avoid unnecessary slow imports at the top level
+    from dask.distributed import Client
+    from dask_jobqueue import SLURMCluster
+    from .runner import Runner
+    cluster = SLURMCluster(
+        queue=args.queue,
+        account=args.account,
+        cores=args.cores,
+        # processes = args.processes,
+        memory=args.memory,
+        interface=args.interface,
+    )
+    cluster.scale(jobs=3)
+    client = Client(cluster)
+    logging.basicConfig(
+        level=logging.INFO,  # Set log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+        format=LOG_FORMAT,
+        datefmt=DATE_FORMAT,
+        handlers=[
+            # logging.FileHandler("app.log"),  # Log to a file
+            logging.StreamHandler()  # Log to console
+        ],
+    )
+    runner = Runner(args.CONFIG)
+    try:
+        runner.run()
+    except Exception:
+        LOG.error("Error during verification closing down dask cluster", exc_info=True)
+        client.close()
+        cluster.close()
+        sys.exit(1)
+def main():
+    parser = argparse.ArgumentParser(description="mxalign CLI")
+    subparsers = parser.add_subparsers(
+        dest="command", required=True, help="Available commands"
+    )
+    local_parser = subparsers.add_parser(
+        "local",
+        help="Run the verification pipeline based on a config-file on a local dask cluster",
+    )
+    local_parser.add_argument(
+        "--n_workers", default=4, type=int, help="Number of dask workers"
+    )
+    local_parser.add_argument(
+        "--threads_per_worker",
+        default=1,
+        type=int,
+        help="Number of threads per dask worker",
+    )
+    slurm_parser = subparsers.add_parser(
+        "slurm",
+        help="Run the verification pipeline based on a config-file on a slurm cluster",
+    )
+    slurm_parser.add_argument(
+        "--queue", type=str, help="Destination queue for the worker jobs"
+    )
+    slurm_parser.add_argument(
+        "--account", type=str, help="Account to charge the jobs to"
+    )
+    slurm_parser.add_argument(
+        "--cores",
+        type=int,
+        default=8,
+        help="Total number of CPU cores on which all worker threads inside a job will run",
+    )
+    slurm_parser.add_argument(
+        "--memory",
+        type=str,
+        default="64GB",
+        help="Total amount of memory to be used by all workers inside a job",
+    )
+    slurm_parser.add_argument(
+        "--interface",
+        type=str,
+        default="hsn0",
+        help="Network interface to use for the dask workers",
+    )
+    parser.add_argument("CONFIG", type=str, help="Path to the YAML configuration file")
+    args = parser.parse_args()
+    if args.command == "local":
+        run_local(args)
+    elif args.command == "slurm":
+        run_slurm(args)
+    elif not args.command:
+        parser.print_help()
+        sys.exit(1)
+    else:
+        LOG.error(f"Unknown command: {args.command}")
+        parser.print_help()
+        sys.exit(1)
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,  # Set log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+        format=LOG_FORMAT,
+        datefmt=DATE_FORMAT,
+        handlers=[
+            # logging.FileHandler("app.log"),  # Log to a file
+            logging.StreamHandler()  # Log to console
+        ],
+    )
+    LOG.info("Starting mxalign CLI")
+    main()

mxalign/interpolations/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from . import base
+from . import xarray
+from . import delaunay
+__all__ = [  # submodules imported above; importing `xarray` and `delaunay` runs their @register_interpolator class decorators
+    "base",
+    "xarray",
+    "delaunay",
+]

mxalign/interpolations/base.py ADDED Viewed

@@ -0,0 +1,29 @@
+import xarray as xr
+from ..properties.properties import Space
+from ..properties.utils import update_space_property
+class BaseInterpolator:
+    """Base class for all interpolators."""
+    name: str = "base"
+    source_space: Space | None = None
+    target_space: Space | None = None
+    def __init__(self, target_dataset, **options):
+        self.target_dataset = target_dataset
+        self.options = options
+        # TODO: Check the properties
+    # def supports(self, src: Properties, tgt: Properties):
+    def interpolate(
+        self, source_dataset: xr.Dataset | xr.DataArray
+    ) -> xr.Dataset | xr.DataArray:
+        ds_out = self._interpolate(source_dataset)
+        return update_space_property(ds_out, self.target_space)
+    def _interpolate(
+        self, source_dataset: xr.Dataset | xr.DataArray
+    ) -> xr.Dataset | xr.DataArray:
+        pass

mxalign/interpolations/delaunay.py ADDED Viewed

@@ -0,0 +1,218 @@
+from functools import partial
+import numpy as np
+import dask.array as dda
+import xarray as xr
+from scipy.spatial import Delaunay
+from scipy.sparse import csr_matrix
+from .base import BaseInterpolator
+from .registry import register_interpolator
+from ..properties.properties import Space
+@register_interpolator
+class DelaunayInterpolator(BaseInterpolator):
+    name = "delaunay"
+    source_space = Space.GRID
+    target_space = Space.POINT
+    def __init__(self, target_dataset, **options):
+        super().__init__(target_dataset, **options)
+        method = self.options.get("method", "linear")
+        self._W_cache = {}  # keyed by source grid hash
+        if method != "linear":
+            raise ValueError(
+                f"Method: {method}. Delaunay interpolation only supports linear interpolation"
+            )
+    def _get_weights(self, source_points, target_points):
+        key = (
+            source_points.shape,
+            source_points[0, 0],
+            source_points[-1, 1],
+        )  # cheap fingerprint
+        if key not in self._W_cache:
+            triangulation = Delaunay(source_points)
+            self._W_cache[key] = _build_weight_matrix(
+                triangulation, source_points, target_points
+            )
+        return self._W_cache[key]
+    def _interpolate(self, source_dataset):
+        if "grid_index" not in source_dataset.dims:
+            raise NotImplementedError(
+                "Delaunay interpolation currently only supports stacked grids"
+            )
+        if "latitude" in source_dataset.dims:
+            lon_grid, lat_grid = np.meshgrid(
+                source_dataset["longitude"].values, source_dataset["latitude"].values
+            )
+            source_points = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))
+        else:
+            source_points = np.column_stack(
+                (source_dataset["latitude"].values, source_dataset["longitude"].values)
+            )
+        target_points = np.column_stack(
+            (
+                self.target_dataset["latitude"].values,
+                self.target_dataset["longitude"].values,
+            )
+        )
+        # Compute triangulation and sparse weight matrix ONCE, shared across all variables
+        W = self._get_weights(source_points, target_points)
+        arrays_out = {}
+        for var in source_dataset.data_vars:
+            da = source_dataset[var]
+            if da.dims[-1] != "grid_index":
+                print(
+                    f"Skipping variable '{var}' - doesn't end with spatial dimension grid_index"
+                )
+                continue
+            else:
+                arrays_out[var] = interpolate_da(da, W, target_points)
+        ds_out = xr.Dataset(arrays_out).assign_coords(
+            latitude=self.target_dataset["latitude"],
+            longitude=self.target_dataset["longitude"],
+        )
+        ds_out.attrs["properties"] = source_dataset.attrs["properties"]
+        return ds_out
+def _build_weight_matrix(
+    triangulation: Delaunay,
+    source_points: np.ndarray,
+    target_points: np.ndarray,
+) -> csr_matrix:
+    """
+    Precompute a sparse (n_target, n_source) weight matrix from the triangulation.
+    Applying W to a (n_source,) value vector gives (n_target,) interpolated values
+    via a simple sparse matrix multiply. Target points outside the convex hull
+    receive NaN weights.
+    """
+    print("Calculating interpolation-weight matrix")
+    n_target = len(target_points)
+    n_source = len(source_points)
+    ndim = source_points.shape[1]  # 2 for lat/lon
+    # Find which simplex each target point falls in; -1 means outside convex hull
+    simplex_indices = triangulation.find_simplex(target_points)  # (n_target,)
+    # Map outside points to simplex 0 temporarily to avoid index errors —
+    # their weights will be NaN'd out below
+    safe_indices = np.where(simplex_indices >= 0, simplex_indices, 0)
+    # Vertices of each target point's simplex: (n_target, ndim+1)
+    simplex_vertices = triangulation.simplices[safe_indices]
+    # Recover barycentric coordinates using the affine transforms stored in
+    # triangulation.transform: shape (nsimplex, ndim+1, ndim)
+    #   transform[s, :ndim, :] — inverse of the edge matrix for simplex s
+    #   transform[s,  ndim, :] — the ndim-th vertex (origin) of simplex s
+    Tinv = triangulation.transform[safe_indices, :ndim, :]  # (n_target, ndim, ndim)
+    origin = triangulation.transform[safe_indices, ndim, :]  # (n_target, ndim)
+    r = target_points - origin  # (n_target, ndim)
+    bary_partial = np.einsum("nij,nj->ni", Tinv, r)  # (n_target, ndim)
+    last = 1.0 - bary_partial.sum(axis=1, keepdims=True)
+    bary = np.concatenate([bary_partial, last], axis=1)  # (n_target, ndim+1)
+    # Flatten into coordinate format (COO) for sparse matrix construction
+    rows = np.repeat(np.arange(n_target), ndim + 1)
+    cols = simplex_vertices.ravel()
+    vals = bary.ravel()
+    # NaN out weights for points outside the convex hull
+    outside = simplex_indices == -1
+    vals[np.repeat(outside, ndim + 1)] = np.nan
+    W = csr_matrix((vals, (rows, cols)), shape=(n_target, n_source))
+    print("Done")
+    return W
+def interpolate_da(
+    da: xr.DataArray, W: csr_matrix, target_points: np.ndarray
+) -> xr.DataArray:
+    n_target = len(target_points)
+    leading_dims = da.dims[:-1]
+    # Validate that grid_index is not chunked
+    if isinstance(da.data, dda.Array):
+        grid_chunks = dict(zip(da.dims, da.chunks)).get("grid_index")
+        if grid_chunks is not None and len(grid_chunks) > 1:
+            raise ValueError(
+                f"grid_index must not be chunked for Delaunay interpolation "
+                f"(found {len(grid_chunks)} chunks). Rechunk with da.chunk({{'grid_index': -1}}) "
+                f"or enforce this on the loading side."
+            )
+    # Build the template
+    # Get chunking info for leading dims
+    shape_tmp = tuple(da.sizes[d] for d in leading_dims) + (n_target,)
+    if isinstance(da.data, dda.Array):
+        dim_to_chunks = dict(zip(da.dims, da.chunks))
+    else:
+        dim_to_chunks = {dim: (da.sizes[dim],) for dim in da.dims}
+    chunks_tmp = tuple(
+        dim_to_chunks[dim] if dim in dim_to_chunks else (da.sizes[dim],)
+        for dim in leading_dims
+    ) + ((n_target,),)
+    # Create a dask array template matching the chunking pattern
+    tmp = dda.empty(shape=shape_tmp, chunks=chunks_tmp, dtype=da.dtype)
+    tmp = xr.DataArray(
+        tmp,
+        dims=leading_dims + ("point_index",),
+        coords={d: da.coords[d].load() for d in leading_dims},
+    )
+    # Drop coords tied to grid_index to avoid dimension mismatch in map_blocks
+    spatial_coords = [c for c in da.coords if "grid_index" in da[c].dims]
+    da_clean = da.drop_vars(spatial_coords)
+    da_interp = da_clean.map_blocks(
+        partial(interpolate_block, W=W, target_points=target_points), template=tmp
+    )
+    return da_interp
+def interpolate_block(
+    block: xr.DataArray,
+    W: csr_matrix,
+    target_points: np.ndarray,
+) -> xr.DataArray:
+    data = block.values  # shape = (.., npoints)
+    original_shape = data.shape[:-1]
+    data_flat = data.reshape(
+        -1, data.shape[-1]
+    )  # shape = (ndim1 * ndim2 * ... , npoints)
+    # Identify NaN source points
+    nan_mask = np.isnan(data_flat)  # (nleading, n_source)
+    if nan_mask.any():
+        print(f"Warning, interpolating NaNs for variable {block.name}")
+    # Single sparse matrix multiply replaces the per-row interpolator loop:
+    # (nleading, n_source) @ (n_source, n_target) -> (nleading, n_target)
+    interpolated_flat = data_flat @ W.T
+    interpolated = interpolated_flat.reshape(*original_shape, target_points.shape[0])
+    new_dims = block.dims[:-1] + ("point_index",)
+    new_coords = {dim: block.coords[dim] for dim in block.dims[:-1]}
+    return xr.DataArray(interpolated, dims=new_dims, coords=new_coords)

mxalign/interpolations/interpolate.py ADDED Viewed

@@ -0,0 +1,29 @@
+from .registry import get_interpolation
+def interpolate(source_datasets, target_dataset, method, **kwargs):
+    interp_cls = get_interpolation(method)
+    interpolator = interp_cls(target_dataset, **kwargs)
+    if isinstance(source_datasets, dict):
+        keys = list(source_datasets.keys())
+        datasets = list(source_datasets.values())
+    else:
+        if not isinstance(source_datasets, list):
+            datasets = [source_datasets]
+        keys = None
+    if keys:
+        interpolated_datasets = dict()
+        for key, ds in zip(keys, datasets):
+            interpolated_datasets[key] = interpolator.interpolate(ds.copy())
+    else:
+        interpolated_datasets = []
+        for ds in datasets:
+            interpolated_datasets.append(interpolator.interpolate(ds.copy()))
+    interpolated_datasets = (
+        interpolated_datasets[0]
+        if len(interpolated_datasets) == 1
+        else interpolated_datasets
+    )
+    return interpolated_datasets

mxalign/interpolations/registry.py ADDED Viewed

@@ -0,0 +1,17 @@
+_INTERPOLATORS = {}
+def register_interpolator(cls):
+    _INTERPOLATORS[cls.name] = cls
+    return cls
+def available_interpolations():
+    return list(_INTERPOLATORS.keys())
+def get_interpolation(name):
+    try:
+        return _INTERPOLATORS[name]
+    except KeyError:
+        raise ValueError(f"Unknown interpolation: {name}")

mxalign/interpolations/xarray.py ADDED Viewed

@@ -0,0 +1,63 @@
+from .base import BaseInterpolator
+from .registry import register_interpolator
+from ..properties.properties import Space
+import xarray as xr
+@register_interpolator
+class XarrayInterpolator(BaseInterpolator):
+    name = "xarray"
+    source_space = Space.GRID
+    target_space = Space.POINT
+    def _interpolate(self, source_dataset):
+        if "latitude" in source_dataset.dims and "longitude" in source_dataset.dims:
+            ds_out = self._interpolate_from_latlon(source_dataset)
+        else:
+            if source_dataset.space.is_stacked():
+                try:
+                    source_dataset = source_dataset.space.unstack()
+                except ValueError:
+                    raise ValueError(
+                        "Cannot unstack dataset, dataset must be unstacked to use xarray interpolation"
+                    )
+            ds_out = self._interpolate_from_xcyc(source_dataset)
+        return ds_out
+    def _interpolate_from_xcyc(self, source_dataset):
+        import cartopy.crs as ccrs
+        try:
+            crs = source_dataset.attrs["crs"]
+        except KeyError:
+            raise KeyError("Source dataset does not have a crs-attribute")
+        xyz = crs.transform_points(
+            x=self.target_dataset["longitude"].values,
+            y=self.target_dataset["latitude"].values,
+            src_crs=ccrs.PlateCarree(),
+        )
+        x = xr.DataArray(xyz[:, 0], dims="point_index")
+        y = xr.DataArray(xyz[:, 1], dims="point_index")
+        ds_out = source_dataset.interp(xc=x, yc=y, **self.options)
+        # ).assing_coords(
+        #     longitude=self.target_dataset["longitude"],
+        #     latitude=self.target_dataset["latitude"]
+        # )
+        return ds_out
+    def _interpolate_from_latlon(self, source_dataset):
+        longitude = self.target_dataset["longitude"]
+        latitude = self.target_dataset["latitude"]
+        ds_out = source_dataset.interp(
+            longitude=longitude, latitude=latitude, **self.options
+        )
+        return ds_out

mxalign/loaders/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from . import anemoi_datasets
+from . import anemoi_inference
+from . import harp_obstable
+from . import base
+__all__ = [  # loader submodules imported above; importing `anemoi_datasets` runs its @register_loader class decorator
+    "anemoi_datasets",
+    "anemoi_inference",  # NOTE(review): presumably registers a loader as well - module body not visible here
+    "harp_obstable",  # NOTE(review): presumably registers a loader as well - module body not visible here
+    "base",
+]
+]

mxalign/loaders/anemoi_datasets.py ADDED Viewed

@@ -0,0 +1,92 @@
+import numpy as np
+import xarray as xr
+from .registry import register_loader
+from ..properties.properties import Space, Time, Uncertainty
+from .base import BaseLoader
+DROP_VARS = [  # entries of the "variable" coordinate that _postprocess drops when present
+    "latitude",
+    "longitude",
+    "time",
+    "cos_julian_day",
+    "cos_latitude",
+    "cos_local_time",
+    "cos_longitude",
+    "insolation",
+    "sin_julian_day",
+    "sin_latitude",
+    "sin_local_time",
+    "sin_longitude",
+]
+COORDS = dict(longitude="longitudes", latitude="latitudes", valid_time="dates")  # coordinate name -> variable of the opened store that supplies it
+DEFAULTS = {"chunks": "auto"}  # NOTE(review): not referenced in the visible part of this module - confirm where it is used
+@register_loader
+class AnemoiDatasetsLoader(BaseLoader):
+    name = "anemoi-datasets"
+    space = Space.GRID
+    time = Time.OBSERVATION
+    uncertainty = Uncertainty.DETERMINISTIC
+    def _load(self):
+        if isinstance(self.files, list):
+            dss = [xr.open_zarr(file, consolidated=False) for file in self.files]
+            dss_postproc = [_postprocess(ds) for ds in dss]
+            ds_postproc = xr.concat(dss_postproc, dim="valid_time")
+        else:
+            ds = xr.open_zarr(self.files, consolidated=False)
+            ds_postproc = _postprocess(ds)
+        if self.variables:
+            ds_selected = ds_postproc.sel(variable=self.variables)
+        else:
+            ds_selected = ds_postproc
+            if len(ds_selected["variable"]) > 10:
+                print(
+                    f"Transforming anemoi-datasets xr.DataArray with {len(ds_postproc['variable'])} variables to xr.Dataset, this might take some time. Consider selecting the relevant variables during loading"
+                )
+        return ds_selected.to_dataset(dim="variable")
+def _postprocess(dataset: xr.Dataset) -> xr.Dataset:
+    """Post-process the dataset to add coordinates and drop unused variables.
+    Args:
+        dataset (xr.Dataset): The input dataset to be processed.
+    Returns:
+        xr.Dataset: The processed dataset with assigned coordinates and
+            attributes.
+    """
+    # Add coordinates
+    coords = {
+        key: dataset[value].astype("datetime64[ns]").load()
+        if key == "valid_time"
+        else dataset[value].load()
+        for key, value in COORDS.items()
+    }
+    for key in ("latitude", "longitude"):
+        coords[key] = coords[key].astype(np.float32)
+    coords["variable"] = dataset.attrs["variables"]
+    coords["valid_time"] = coords["valid_time"].astype("datetime64[ns]")
+    ds_coords = dataset.assign_coords(coords)
+    # Drop unused variables and remove ensemble dimension
+    drop_vars = [var for var in DROP_VARS if var in coords["variable"]]
+    ds_pruned = (
+        ds_coords["data"]
+        .isel(ensemble=0)
+        .drop_sel(variable=drop_vars)
+        .swap_dims({"time": "valid_time"})
+        .rename({"cell": "grid_index"})
+    )
+    return ds_pruned