recursive-diff 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
"""Public API of the ``recursive_diff`` package.

Re-exports :func:`cast`, :func:`recursive_diff` and :func:`recursive_eq`
from the implementation modules and defines ``__version__``.
"""
import importlib.metadata

from recursive_diff.cast import cast
from recursive_diff.recursive_diff import recursive_diff
from recursive_diff.recursive_eq import recursive_eq

try:
    # Read the installed distribution's version from package metadata
    __version__ = importlib.metadata.version("recursive_diff")
except importlib.metadata.PackageNotFoundError:  # pragma: nocover
    # Local copy, not installed with pip
    __version__ = "999"

# Prevent Intersphinx from pointing to the implementation modules
for obj in (recursive_diff, recursive_eq, cast):
    obj.__module__ = "recursive_diff"
del obj

__all__ = ("__version__", "recursive_diff", "recursive_eq", "cast")
recursive_diff/cast.py ADDED
@@ -0,0 +1,282 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Collection, Hashable
4
+ from functools import singledispatch
5
+
6
+ import numpy
7
+ import pandas
8
+ import xarray
9
+
10
+ from recursive_diff.proper_unstack import proper_unstack
11
+
12
+
13
@singledispatch
def cast(obj: object, brief_dims: Collection[Hashable]) -> object:
    """Helper function of :func:`recursive_diff`.

    Normalise complex objects into simpler ones before comparison:

    - tuples become lists
    - frozensets become sets
    - all numpy-based objects become :class:`xarray.DataArray`, the most
      generic of those formats:

      - :class:`numpy.ndarray`
      - :class:`pandas.Series`
      - :class:`pandas.DataFrame`
      - :class:`pandas.Index`, except :class:`pandas.RangeIndex`, which is
        instead returned unaltered
      - :class:`xarray.Dataset`

      The data may be wrapped in a dict holding the various ancillary
      attributes, and is flagged so that it doesn't trigger an infinite
      recursion.

    - any other object type is returned unchanged.

    :param obj:
        complex object that must be simplified
    :param brief_dims:
        xarray dimensions that must be compacted.
        See documentation on :func:`recursive_diff`.
    :returns:
        simpler object to compare
    """
    # Single-dispatch default: any class without a registered
    # specialisation below passes through untouched.
    return obj
47
+
48
+
49
@cast.register(numpy.integer)
def cast_npint(obj: numpy.integer, brief_dims: Collection[Hashable]) -> int:
    """:func:`cast` variant for numpy scalar integers (as opposed to
    numpy arrays with an integer dtype).

    Convert to a plain Python :class:`int`.
    """
    return obj.item()
55
+
56
+
57
@cast.register(numpy.floating)
def cast_npfloat(obj: numpy.floating, brief_dims: Collection[Hashable]) -> float:
    """:func:`cast` variant for numpy scalar floats (as opposed to
    numpy arrays with a float dtype).

    Convert to a plain Python :class:`float`.
    """
    return obj.item()
63
+
64
+
65
@cast.register(numpy.ndarray)
def cast_nparray(
    obj: numpy.ndarray, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`numpy.ndarray`.

    Convert to a DataArray with dimensions dim_0, dim_1, ... and a
    RangeIndex as the coords of each.
    """
    # One stub RangeIndex per axis, keyed dim_0, dim_1, ...
    result: dict[str, object] = {
        f"dim_{axis}": pandas.RangeIndex(length)
        for axis, length in enumerate(obj.shape)
    }
    result["data"] = _strip_dataarray(xarray.DataArray(obj), brief_dims)
    return result
79
+
80
+
81
@cast.register(pandas.Series)
def cast_series(
    obj: pandas.Series, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.Series`.

    Convert to a DataArray, keeping name and index separate.
    """
    wrapped = xarray.DataArray(obj, dims=["index"])
    return {
        "name": obj.name,
        "data": _strip_dataarray(wrapped, brief_dims),
        "index": obj.index,
    }
95
+
96
+
97
@cast.register(pandas.DataFrame)
def cast_dataframe(
    obj: pandas.DataFrame, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.DataFrame`.

    Convert to a DataArray, keeping index and columns separate.

    TODO: proper support for columns with different dtypes. Right now
    they are cast to the closest common type by DataFrame.values.
    """
    as_xarray = xarray.DataArray(obj, dims=["index", "column"])
    return {
        "data": _strip_dataarray(as_xarray, brief_dims),
        "index": obj.index,
        "columns": obj.columns,
    }
116
+
117
+
118
@cast.register(xarray.DataArray)
def cast_dataarray(obj: xarray.DataArray, brief_dims: Collection[Hashable]) -> object:
    """Single dispatch specialised variant of :func:`cast` for
    :class:`xarray.DataArray`.

    Map to a simpler DataArray, with separate indices, non-index coords,
    name, and attributes.
    """
    # Prevent infinite recursion - see _strip_dataarray(), which marks
    # its output with this attribute before it is fed back into cast()
    if "__strip_dataarray__" in obj.attrs:
        return obj

    # Strip out the non-index coordinates and attributes
    return {
        "name": obj.name,
        "attrs": obj.attrs,
        # Index is handled separately, and created as a default
        # RangeIndex(shape[i]) if it doesn't exist, as it is compared
        # with outer join, whereas non-index coords and data are
        # compared with inner join
        "index": {k: obj.coords[k].to_index() for k in obj.dims},
        # Non-index coordinates only (index coords are IndexVariable)
        "coords": {
            k: _strip_dataarray(v, brief_dims)
            for k, v in obj.coords.items()
            if not isinstance(v.variable, xarray.IndexVariable)
        },
        "data": _strip_dataarray(obj, brief_dims),
    }
146
+
147
+
148
@cast.register(xarray.Dataset)
def cast_dataset(
    obj: xarray.Dataset, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """Single dispatch specialised variant of :func:`cast` for
    :class:`xarray.Dataset`.

    Map to a dict of DataArrays.
    """
    return {
        "attrs": obj.attrs,
        # There may be coords, index or not, that are not
        # used in any data variable.
        # See cast_dataarray() on why indices are handled separately
        "index": {k: obj.coords[k].to_index() for k in obj.dims},
        # Non-index coordinates only (index coords are IndexVariable)
        "coords": {
            k: _strip_dataarray(v, brief_dims)
            for k, v in obj.coords.items()
            if not isinstance(v.variable, xarray.IndexVariable)
        },
        "data_vars": {
            k: _strip_dataarray(v, brief_dims) for k, v in obj.data_vars.items()
        },
    }
172
+
173
+
174
@cast.register(pandas.MultiIndex)
def cast_multiindex(
    obj: pandas.MultiIndex, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.MultiIndex`.

    Convert to a set of tuples. Levels therefore remain positional,
    while the set makes the comparison of the rows non-positional.
    """
    rows = set(obj.tolist())
    return {"names": obj.names, "data": rows}
185
+
186
+
187
@cast.register(pandas.RangeIndex)
def cast_rangeindex(
    obj: pandas.RangeIndex, brief_dims: Collection[Hashable]
) -> pandas.RangeIndex:
    """:func:`cast` variant for :class:`pandas.RangeIndex`.

    Deliberately a no-op: RangeIndex objects are dealt with directly by
    :func:`_recursive_diff`. Registering this variant merely shields
    RangeIndex from the more generic ``cast(obj: pandas.Index)`` below.
    """
    return obj
200
+
201
+
202
@cast.register(pandas.Index)
def cast_index(obj: pandas.Index, brief_dims: Collection[Hashable]) -> xarray.DataArray:
    """:func:`cast` variant for :class:`pandas.Index`.

    Convert to a DataArray.

    .. note::
       :func:`~functools.singledispatch` always prefers the most
       specialised registered variant, so :class:`pandas.MultiIndex` and
       :class:`pandas.RangeIndex` never reach this function - they have
       dedicated variants above.
    """
    as_array = xarray.DataArray(obj)
    return _strip_dataarray(as_array, brief_dims)
216
+
217
+
218
@cast.register(frozenset)
def cast_frozenset(obj: frozenset, brief_dims: Collection[Hashable]) -> set:
    """:func:`cast` variant for :class:`frozenset`.

    Convert to a mutable :class:`set` with the same elements.
    """
    return {element for element in obj}
226
+
227
+
228
@cast.register(tuple)
def cast_tuple(obj: tuple, brief_dims: Collection[Hashable]) -> list:
    """:func:`cast` variant for :class:`tuple`.

    Convert to a :class:`list` with the same elements.
    """
    return [*obj]
236
+
237
+
238
def _strip_dataarray(
    obj: xarray.DataArray, brief_dims: Collection[Hashable]
) -> xarray.DataArray:
    """Helper function of :func:`recursive_diff`.

    Analyse a :class:`xarray.DataArray` and:

    - strip away any non-index coordinates (including scalar coords)
    - sort dimensions alphabetically
    - ravel the array to a 1D array with (potentially) a MultiIndex.
      brief_dims, if any, are excluded from the stacking.

    :param obj:
        any xarray.DataArray
    :param brief_dims:
        collection of dims, or the string "all"
    :returns:
        a stripped-down shallow copy of obj, flagged with the
        ``__strip_dataarray__`` attribute
    """
    res = obj.copy()

    # Remove non-index coordinates (index coords are IndexVariable)
    for k, v in obj.coords.items():
        if not isinstance(v.variable, xarray.IndexVariable):
            del res[k]

    # Ravel the array to make it become 1-dimensional.
    # To do this, we must first unstack any already stacked dimension.
    for dim in obj.dims:
        if isinstance(obj.get_index(dim), pandas.MultiIndex):
            res = proper_unstack(res, dim)

    # Transpose to ignore dimensions order
    # (key=str makes mixed hashable dim names sortable)
    res = res.transpose(*sorted(res.dims, key=str))

    # Finally stack everything back together.
    # brief_dims == "all" means every dimension is compacted, so nothing
    # is stacked at all.
    if brief_dims != "all":
        stack_dims = sorted(set(res.dims) - set(brief_dims), key=str)
        if stack_dims:
            res = res.stack(__stacked__=stack_dims)

    # Prevent infinite recursion - see cast(obj: xarray.DataArray)
    res.attrs["__strip_dataarray__"] = True
    return res
@@ -0,0 +1,9 @@
1
"""Support dask-backed xarray objects, if dask is installed
"""

try:
    from dask import compute
except ImportError:

    def compute(*args: object) -> object:
        """Fallback used when dask is not installed: like
        :func:`dask.compute`, return the arguments as a tuple.
        """
        return args
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python
2
+ """Compare either two NetCDF files or all NetCDF files in two directories.
3
+
4
+ See :doc:`bin/ncdiff`
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import glob
10
+ import logging
11
+ import os
12
+ import sys
13
+
14
+ import xarray
15
+
16
+ from recursive_diff.recursive_diff import recursive_diff
17
+
18
+ LOGFORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
19
+
20
+
21
def argparser() -> argparse.ArgumentParser:
    """Return precompiled ArgumentParser for the ncdiff command-line tool

    :returns:
        :class:`argparse.ArgumentParser` accepting two positional
        arguments (lhs, rhs) plus tolerance/verbosity/recursion options
    """
    parser = argparse.ArgumentParser(
        description="Compare either two NetCDF files or all NetCDF files in "
        "two directories.",
        epilog="Examples:\n\n"
        "Compare two NetCDF files:\n"
        " ncdiff a.nc b.nc\n"
        "Compare all NetCDF files with identical names in two "
        "directories:\n"
        " ncdiff -r dir1 dir2\n",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--engine",
        "-e",
        # BUGFIX: help text previously read "NeCDF engine"
        help="NetCDF engine (may require additional modules)",
        choices=[
            "netcdf4",
            "scipy",
            "pydap",
            "h5netcdf",
            "pynio",
            "cfgrib",
            "pseudonetcdf",
        ],
    )
    parser.add_argument("--quiet", "-q", action="store_true", help="Suppress logging")

    parser.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Compare all NetCDF files with matching names in two directories",
    )
    parser.add_argument(
        "--match",
        "-m",
        default="**/*.nc",
        help="Bash wildcard match for file names when using --recursive "
        "(default: **/*.nc)",
    )

    parser.add_argument(
        "--rtol",
        type=float,
        default=1e-9,
        help="Relative comparison tolerance (default: 1e-9)",
    )
    parser.add_argument(
        "--atol",
        type=float,
        default=0,
        help="Absolute comparison tolerance (default: 0)",
    )

    # --brief and --brief_dims are mutually exclusive ways to compact
    # the output; main() maps --brief to brief_dims="all"
    brief = parser.add_mutually_exclusive_group()
    brief.add_argument(
        "--brief_dims",
        nargs="+",
        default=(),
        metavar="DIM",
        help="Just count differences along one or more dimensions instead of "
        "printing them out individually",
    )
    brief.add_argument(
        "--brief",
        "-b",
        action="store_true",
        help="Just count differences for every variable instead of printing "
        "them out individually",
    )

    parser.add_argument(
        "lhs", help="Left-hand-side NetCDF file or (if --recursive) directory"
    )
    parser.add_argument(
        "rhs", help="Right-hand-side NetCDF file or (if --recursive) directory"
    )

    return parser
103
+
104
+
105
def open_netcdf(fname: str, engine: str | None = None) -> xarray.Dataset:
    """Open a single NetCDF dataset, reading only the metadata into RAM.
    The actual data is not loaded.

    :param str fname:
        path to .nc file
    :param str engine:
        NetCDF engine (see :func:`xarray.open_dataset`)
    :returns:
        :class:`xarray.Dataset`
    """
    logging.info(f"Opening {fname}")
    # At the moment of writing, h5netcdf is the only engine
    # supporting LZF compression
    ds = xarray.open_dataset(fname, engine=engine, chunks={})
    return ds
120
+
121
+
122
def recursive_open_netcdf(
    path: str, match: str, engine: str | None = None
) -> dict[str, xarray.Dataset]:
    """Recursively find and open all NetCDF files under the given path.

    :param str path:
        Root directory to search into
    :param str match:
        Glob match relative to path
    :param str engine:
        NetCDF engine (see :func:`xarray.open_dataset`)
    :returns:
        dict of {relative file name: dataset}
    """
    # Temporarily chdir into path so that the glob results are relative
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        fnames = glob.glob(match, recursive=True)
    finally:
        os.chdir(previous_dir)

    # We don't invoke open_netcdf() directly inside the pushd context
    # to get a prettier logging message on the file being opened
    logging.info(f"Opening {len(fnames)} NetCDF stores from {path}")
    datasets: dict[str, xarray.Dataset] = {}
    for fname in fnames:
        datasets[fname] = open_netcdf(os.path.join(path, fname), engine=engine)
    return datasets
150
+
151
+
152
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments, load all files, and invoke recursive_diff

    :param argv:
        command-line arguments; None (the default) means sys.argv and
        also enables logging setup
    :returns:
        exit code: 1 if any difference was found, 0 otherwise
    """
    # Parse command-line arguments and init logging
    args = argparser().parse_args(argv)
    if args.brief:
        # --brief is shorthand for compacting along every dimension
        args.brief_dims = "all"

    if args.quiet:
        loglevel = logging.WARNING
    else:
        loglevel = logging.INFO

    # Don't init logging when running inside unit tests
    # (tests pass argv explicitly; real CLI invocations pass None)
    if argv is None:
        logging.basicConfig(level=loglevel, format=LOGFORMAT)  # pragma: nocover

    # Load metadata of all NetCDF stores
    # Leave actual data on disk
    lhs: xarray.Dataset | dict[str, xarray.Dataset]
    rhs: xarray.Dataset | dict[str, xarray.Dataset]
    if args.recursive:
        lhs = recursive_open_netcdf(args.lhs, args.match, engine=args.engine)
        rhs = recursive_open_netcdf(args.rhs, args.match, engine=args.engine)
    else:
        lhs = open_netcdf(args.lhs, engine=args.engine)
        rhs = open_netcdf(args.rhs, engine=args.engine)

    logging.info("Comparing...")
    # recursive_diff is a generator, so for each pair of variables:
    # 1. Load a pair of NetCDF variables fully into RAM
    # 2. compare them
    # 3. print all differences
    # 4. free the RAM
    # 5. proceed to next pair
    diff_iter = recursive_diff(
        lhs, rhs, abs_tol=args.atol, rel_tol=args.rtol, brief_dims=args.brief_dims
    )

    diff_count = 0
    for diff in diff_iter:
        diff_count += 1
        print(diff)

    print(f"Found {diff_count} differences")
    if diff_count:
        return 1
    return 0
202
+
203
+
204
# Script entry point: exit status 1 when differences were found, 0 otherwise
if __name__ == "__main__":
    sys.exit(main())  # pragma: nocover
@@ -0,0 +1,63 @@
1
+ """Utilities for stacking/unstacking dimensions
2
+
3
+ Copy-pasted from xarray-extras
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Hashable
8
+ from typing import TypeVar
9
+
10
+ import pandas
11
+ import xarray
12
+
13
+ T = TypeVar("T", xarray.DataArray, xarray.Dataset)
14
+
15
+
16
def proper_unstack(array: T, dim: Hashable) -> T:
    """Work around an issue in xarray that causes the data to be sorted
    alphabetically by label on unstack():

    `<https://github.com/pydata/xarray/issues/906>`_

    Also work around issue that causes string labels to be converted to
    objects:

    `<https://github.com/pydata/xarray/issues/907>`_

    :param array:
        xarray.DataArray or xarray.Dataset to unstack
    :param Hashable dim:
        Name of existing dimension to unstack
    :returns:
        xarray.DataArray or xarray.Dataset with unstacked dimension
    """
    # Regenerate Pandas multi-index to be ordered by first appearance
    mindex = array.coords[dim].to_pandas().index

    levels = []
    codes = []

    for levels_i, codes_i in zip(mindex.levels, mindex.codes):
        # Map each old code to a new code, numbered by order of first
        # appearance in the data rather than alphabetically
        level_map: dict[Hashable, int] = {}

        for code in codes_i:
            if code not in level_map:
                level_map[code] = len(level_map)

        # Reorder the level values and rewrite the codes accordingly
        levels.append([levels_i[k] for k in level_map])
        codes.append([level_map[k] for k in codes_i])

    mindex = pandas.MultiIndex(levels, codes, names=mindex.names)
    array = array.copy()
    array.coords[dim] = mindex

    # Invoke builtin unstack
    array = array.unstack((dim,))

    # Convert numpy arrays of Python objects to numpy arrays of C floats, ints,
    # strings, etc.
    # NOTE: `dim` is deliberately rebound here to iterate over the newly
    # created dimensions (the former MultiIndex level names)
    for dim in mindex.names:
        if array.coords[dim].dtype == object:
            array.coords[dim] = array.coords[dim].values.tolist()

    return array
File without changes