recursive-diff 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
"""Public API of the ``recursive_diff`` package.

Re-exports :func:`cast`, :func:`recursive_diff` and :func:`recursive_eq`
from the implementation modules and defines ``__version__``.
"""
import importlib.metadata

from recursive_diff.cast import cast
from recursive_diff.recursive_diff import recursive_diff
from recursive_diff.recursive_eq import recursive_eq

try:
    # Read the installed distribution's version from package metadata
    __version__ = importlib.metadata.version("recursive_diff")
except importlib.metadata.PackageNotFoundError:  # pragma: nocover
    # Local copy, not installed with pip
    __version__ = "999"

# Prevent Intersphinx from pointing to the implementation modules
for obj in (recursive_diff, recursive_eq, cast):
    obj.__module__ = "recursive_diff"
del obj

__all__ = ("__version__", "recursive_diff", "recursive_eq", "cast")
recursive_diff/cast.py ADDED
@@ -0,0 +1,282 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Collection, Hashable
4
+ from functools import singledispatch
5
+
6
+ import numpy
7
+ import pandas
8
+ import xarray
9
+
10
+ from recursive_diff.proper_unstack import proper_unstack
11
+
12
+
13
@singledispatch
def cast(obj: object, brief_dims: Collection[Hashable]) -> object:
    """Helper function of :func:`recursive_diff`.

    Normalise complex objects into simpler ones before comparison:

    - tuples become lists
    - frozensets become sets
    - all numpy-based objects become :class:`xarray.DataArray`, the most
      generic of those formats:

      - :class:`numpy.ndarray`
      - :class:`pandas.Series`
      - :class:`pandas.DataFrame`
      - :class:`pandas.Index`, except :class:`pandas.RangeIndex`, which is
        instead returned unaltered
      - :class:`xarray.Dataset`

      The data may be wrapped in a dict holding the various ancillary
      attributes, and is flagged so that it doesn't trigger an infinite
      recursion.

    - any other object type is returned unchanged.

    :param obj:
        complex object that must be simplified
    :param brief_dims:
        xarray dimensions that must be compacted.
        See documentation on :func:`recursive_diff`.
    :returns:
        simpler object to compare
    """
    # Single-dispatch default: any class without a registered
    # specialisation below passes through untouched.
    return obj
47
+
48
+
49
@cast.register(numpy.integer)
def cast_npint(obj: numpy.integer, brief_dims: Collection[Hashable]) -> int:
    """:func:`cast` variant for numpy scalar integers (as opposed to
    numpy arrays with an integer dtype).

    Convert to a plain Python :class:`int`.
    """
    return obj.item()
55
+
56
+
57
@cast.register(numpy.floating)
def cast_npfloat(obj: numpy.floating, brief_dims: Collection[Hashable]) -> float:
    """:func:`cast` variant for numpy scalar floats (as opposed to
    numpy arrays with a float dtype).

    Convert to a plain Python :class:`float`.
    """
    return obj.item()
63
+
64
+
65
@cast.register(numpy.ndarray)
def cast_nparray(
    obj: numpy.ndarray, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`numpy.ndarray`.

    Convert to a DataArray with dimensions dim_0, dim_1, ... and a
    RangeIndex as the coords of each.
    """
    # One stub RangeIndex per axis, keyed dim_0, dim_1, ...
    result: dict[str, object] = {
        f"dim_{axis}": pandas.RangeIndex(length)
        for axis, length in enumerate(obj.shape)
    }
    result["data"] = _strip_dataarray(xarray.DataArray(obj), brief_dims)
    return result
79
+
80
+
81
@cast.register(pandas.Series)
def cast_series(
    obj: pandas.Series, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.Series`.

    Convert to a DataArray, keeping name and index separate.
    """
    wrapped = xarray.DataArray(obj, dims=["index"])
    return {
        "name": obj.name,
        "data": _strip_dataarray(wrapped, brief_dims),
        "index": obj.index,
    }
95
+
96
+
97
@cast.register(pandas.DataFrame)
def cast_dataframe(
    obj: pandas.DataFrame, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.DataFrame`.

    Convert to a DataArray, keeping index and columns separate.

    TODO: proper support for columns with different dtypes. Right now
    they are cast to the closest common type by DataFrame.values.
    """
    as_xarray = xarray.DataArray(obj, dims=["index", "column"])
    return {
        "data": _strip_dataarray(as_xarray, brief_dims),
        "index": obj.index,
        "columns": obj.columns,
    }
116
+
117
+
118
@cast.register(xarray.DataArray)
def cast_dataarray(obj: xarray.DataArray, brief_dims: Collection[Hashable]) -> object:
    """Single dispatch specialised variant of :func:`cast` for
    :class:`xarray.DataArray`.

    Map to a simpler DataArray, with separate indices, non-index coords,
    name, and attributes.
    """
    # Prevent infinite recursion - see _strip_dataarray(), which marks
    # its output with this attribute before it is fed back into cast()
    if "__strip_dataarray__" in obj.attrs:
        return obj

    # Strip out the non-index coordinates and attributes
    return {
        "name": obj.name,
        "attrs": obj.attrs,
        # Index is handled separately, and created as a default
        # RangeIndex(shape[i]) if it doesn't exist, as it is compared
        # with outer join, whereas non-index coords and data are
        # compared with inner join
        "index": {k: obj.coords[k].to_index() for k in obj.dims},
        # Non-index coordinates only (index coords are IndexVariable)
        "coords": {
            k: _strip_dataarray(v, brief_dims)
            for k, v in obj.coords.items()
            if not isinstance(v.variable, xarray.IndexVariable)
        },
        "data": _strip_dataarray(obj, brief_dims),
    }
146
+
147
+
148
@cast.register(xarray.Dataset)
def cast_dataset(
    obj: xarray.Dataset, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """Single dispatch specialised variant of :func:`cast` for
    :class:`xarray.Dataset`.

    Map to a dict of DataArrays.
    """
    return {
        "attrs": obj.attrs,
        # There may be coords, index or not, that are not
        # used in any data variable.
        # See cast_dataarray() on why indices are handled separately
        "index": {k: obj.coords[k].to_index() for k in obj.dims},
        # Non-index coordinates only (index coords are IndexVariable)
        "coords": {
            k: _strip_dataarray(v, brief_dims)
            for k, v in obj.coords.items()
            if not isinstance(v.variable, xarray.IndexVariable)
        },
        "data_vars": {
            k: _strip_dataarray(v, brief_dims) for k, v in obj.data_vars.items()
        },
    }
172
+
173
+
174
@cast.register(pandas.MultiIndex)
def cast_multiindex(
    obj: pandas.MultiIndex, brief_dims: Collection[Hashable]
) -> dict[str, object]:
    """:func:`cast` variant for :class:`pandas.MultiIndex`.

    Convert to a set of tuples. Levels therefore remain positional,
    while the set makes the comparison of the rows non-positional.
    """
    rows = set(obj.tolist())
    return {"names": obj.names, "data": rows}
185
+
186
+
187
@cast.register(pandas.RangeIndex)
def cast_rangeindex(
    obj: pandas.RangeIndex, brief_dims: Collection[Hashable]
) -> pandas.RangeIndex:
    """:func:`cast` variant for :class:`pandas.RangeIndex`.

    Deliberately a no-op: RangeIndex objects are dealt with directly by
    :func:`_recursive_diff`. Registering this variant merely shields
    RangeIndex from the more generic ``cast(obj: pandas.Index)`` below.
    """
    return obj
200
+
201
+
202
@cast.register(pandas.Index)
def cast_index(obj: pandas.Index, brief_dims: Collection[Hashable]) -> xarray.DataArray:
    """:func:`cast` variant for :class:`pandas.Index`.

    Convert to a DataArray.

    .. note::
       :func:`~functools.singledispatch` always prefers the most
       specialised registered variant, so :class:`pandas.MultiIndex` and
       :class:`pandas.RangeIndex` never reach this function - they have
       dedicated variants above.
    """
    as_array = xarray.DataArray(obj)
    return _strip_dataarray(as_array, brief_dims)
216
+
217
+
218
@cast.register(frozenset)
def cast_frozenset(obj: frozenset, brief_dims: Collection[Hashable]) -> set:
    """:func:`cast` variant for :class:`frozenset`.

    Convert to a mutable :class:`set` with the same elements.
    """
    return {element for element in obj}
226
+
227
+
228
@cast.register(tuple)
def cast_tuple(obj: tuple, brief_dims: Collection[Hashable]) -> list:
    """:func:`cast` variant for :class:`tuple`.

    Convert to a :class:`list` with the same elements.
    """
    return [*obj]
236
+
237
+
238
def _strip_dataarray(
    obj: xarray.DataArray, brief_dims: Collection[Hashable]
) -> xarray.DataArray:
    """Helper function of :func:`recursive_diff`.

    Analyse a :class:`xarray.DataArray` and:

    - strip away any non-index coordinates (including scalar coords)
    - sort dimensions alphabetically
    - ravel the array to a 1D array with (potentially) a MultiIndex.
      brief_dims, if any, are excluded from the stacking.

    :param obj:
        any xarray.DataArray
    :param brief_dims:
        collection of dims, or the string "all"
    :returns:
        a stripped-down shallow copy of obj, flagged with the
        ``__strip_dataarray__`` attribute
    """
    res = obj.copy()

    # Remove non-index coordinates (index coords are IndexVariable)
    for k, v in obj.coords.items():
        if not isinstance(v.variable, xarray.IndexVariable):
            del res[k]

    # Ravel the array to make it become 1-dimensional.
    # To do this, we must first unstack any already stacked dimension.
    for dim in obj.dims:
        if isinstance(obj.get_index(dim), pandas.MultiIndex):
            res = proper_unstack(res, dim)

    # Transpose to ignore dimensions order
    # (key=str makes mixed hashable dim names sortable)
    res = res.transpose(*sorted(res.dims, key=str))

    # Finally stack everything back together.
    # brief_dims == "all" means every dimension is compacted, so nothing
    # is stacked at all.
    if brief_dims != "all":
        stack_dims = sorted(set(res.dims) - set(brief_dims), key=str)
        if stack_dims:
            res = res.stack(__stacked__=stack_dims)

    # Prevent infinite recursion - see cast(obj: xarray.DataArray)
    res.attrs["__strip_dataarray__"] = True
    return res
@@ -0,0 +1,9 @@
1
"""Support dask-backed xarray objects, if dask is installed
"""

try:
    from dask import compute
except ImportError:

    def compute(*args: object) -> object:
        """Fallback used when dask is not installed: like
        :func:`dask.compute`, return the arguments as a tuple.
        """
        return args
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python
2
+ """Compare either two NetCDF files or all NetCDF files in two directories.
3
+
4
+ See :doc:`bin/ncdiff`
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import glob
10
+ import logging
11
+ import os
12
+ import sys
13
+
14
+ import xarray
15
+
16
+ from recursive_diff.recursive_diff import recursive_diff
17
+
18
+ LOGFORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
19
+
20
+
21
def argparser() -> argparse.ArgumentParser:
    """Return precompiled ArgumentParser for the ncdiff command-line tool

    :returns:
        :class:`argparse.ArgumentParser` accepting two positional
        arguments (lhs, rhs) plus tolerance/verbosity/recursion options
    """
    parser = argparse.ArgumentParser(
        description="Compare either two NetCDF files or all NetCDF files in "
        "two directories.",
        epilog="Examples:\n\n"
        "Compare two NetCDF files:\n"
        " ncdiff a.nc b.nc\n"
        "Compare all NetCDF files with identical names in two "
        "directories:\n"
        " ncdiff -r dir1 dir2\n",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--engine",
        "-e",
        # BUGFIX: help text previously read "NeCDF engine"
        help="NetCDF engine (may require additional modules)",
        choices=[
            "netcdf4",
            "scipy",
            "pydap",
            "h5netcdf",
            "pynio",
            "cfgrib",
            "pseudonetcdf",
        ],
    )
    parser.add_argument("--quiet", "-q", action="store_true", help="Suppress logging")

    parser.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Compare all NetCDF files with matching names in two directories",
    )
    parser.add_argument(
        "--match",
        "-m",
        default="**/*.nc",
        help="Bash wildcard match for file names when using --recursive "
        "(default: **/*.nc)",
    )

    parser.add_argument(
        "--rtol",
        type=float,
        default=1e-9,
        help="Relative comparison tolerance (default: 1e-9)",
    )
    parser.add_argument(
        "--atol",
        type=float,
        default=0,
        help="Absolute comparison tolerance (default: 0)",
    )

    # --brief and --brief_dims are mutually exclusive ways to compact
    # the output; main() maps --brief to brief_dims="all"
    brief = parser.add_mutually_exclusive_group()
    brief.add_argument(
        "--brief_dims",
        nargs="+",
        default=(),
        metavar="DIM",
        help="Just count differences along one or more dimensions instead of "
        "printing them out individually",
    )
    brief.add_argument(
        "--brief",
        "-b",
        action="store_true",
        help="Just count differences for every variable instead of printing "
        "them out individually",
    )

    parser.add_argument(
        "lhs", help="Left-hand-side NetCDF file or (if --recursive) directory"
    )
    parser.add_argument(
        "rhs", help="Right-hand-side NetCDF file or (if --recursive) directory"
    )

    return parser
103
+
104
+
105
def open_netcdf(fname: str, engine: str | None = None) -> xarray.Dataset:
    """Open a single NetCDF dataset, reading only the metadata into RAM.
    The actual data is not loaded.

    :param str fname:
        path to .nc file
    :param str engine:
        NetCDF engine (see :func:`xarray.open_dataset`)
    :returns:
        :class:`xarray.Dataset`
    """
    logging.info(f"Opening {fname}")
    # At the moment of writing, h5netcdf is the only engine
    # supporting LZF compression
    ds = xarray.open_dataset(fname, engine=engine, chunks={})
    return ds
120
+
121
+
122
def recursive_open_netcdf(
    path: str, match: str, engine: str | None = None
) -> dict[str, xarray.Dataset]:
    """Recursively find and open all NetCDF files under the given path.

    :param str path:
        Root directory to search into
    :param str match:
        Glob match relative to path
    :param str engine:
        NetCDF engine (see :func:`xarray.open_dataset`)
    :returns:
        dict of {relative file name: dataset}
    """
    # Temporarily chdir into path so that the glob results are relative
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        fnames = glob.glob(match, recursive=True)
    finally:
        os.chdir(previous_dir)

    # We don't invoke open_netcdf() directly inside the pushd context
    # to get a prettier logging message on the file being opened
    logging.info(f"Opening {len(fnames)} NetCDF stores from {path}")
    datasets: dict[str, xarray.Dataset] = {}
    for fname in fnames:
        datasets[fname] = open_netcdf(os.path.join(path, fname), engine=engine)
    return datasets
150
+
151
+
152
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments, load all files, and invoke recursive_diff

    :param argv:
        command-line arguments; None (the default) means sys.argv and
        also enables logging setup
    :returns:
        exit code: 1 if any difference was found, 0 otherwise
    """
    # Parse command-line arguments and init logging
    args = argparser().parse_args(argv)
    if args.brief:
        # --brief is shorthand for compacting along every dimension
        args.brief_dims = "all"

    if args.quiet:
        loglevel = logging.WARNING
    else:
        loglevel = logging.INFO

    # Don't init logging when running inside unit tests
    # (tests pass argv explicitly; real CLI invocations pass None)
    if argv is None:
        logging.basicConfig(level=loglevel, format=LOGFORMAT)  # pragma: nocover

    # Load metadata of all NetCDF stores
    # Leave actual data on disk
    lhs: xarray.Dataset | dict[str, xarray.Dataset]
    rhs: xarray.Dataset | dict[str, xarray.Dataset]
    if args.recursive:
        lhs = recursive_open_netcdf(args.lhs, args.match, engine=args.engine)
        rhs = recursive_open_netcdf(args.rhs, args.match, engine=args.engine)
    else:
        lhs = open_netcdf(args.lhs, engine=args.engine)
        rhs = open_netcdf(args.rhs, engine=args.engine)

    logging.info("Comparing...")
    # recursive_diff is a generator, so for each pair of variables:
    # 1. Load a pair of NetCDF variables fully into RAM
    # 2. compare them
    # 3. print all differences
    # 4. free the RAM
    # 5. proceed to next pair
    diff_iter = recursive_diff(
        lhs, rhs, abs_tol=args.atol, rel_tol=args.rtol, brief_dims=args.brief_dims
    )

    diff_count = 0
    for diff in diff_iter:
        diff_count += 1
        print(diff)

    print(f"Found {diff_count} differences")
    if diff_count:
        return 1
    return 0
202
+
203
+
204
# Script entry point: exit status 1 when differences were found, 0 otherwise
if __name__ == "__main__":
    sys.exit(main())  # pragma: nocover
@@ -0,0 +1,63 @@
1
+ """Utilities for stacking/unstacking dimensions
2
+
3
+ Copy-pasted from xarray-extras
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from collections.abc import Hashable
8
+ from typing import TypeVar
9
+
10
+ import pandas
11
+ import xarray
12
+
13
+ T = TypeVar("T", xarray.DataArray, xarray.Dataset)
14
+
15
+
16
def proper_unstack(array: T, dim: Hashable) -> T:
    """Work around an issue in xarray that causes the data to be sorted
    alphabetically by label on unstack():

    `<https://github.com/pydata/xarray/issues/906>`_

    Also work around issue that causes string labels to be converted to
    objects:

    `<https://github.com/pydata/xarray/issues/907>`_

    :param array:
        xarray.DataArray or xarray.Dataset to unstack
    :param Hashable dim:
        Name of existing dimension to unstack
    :returns:
        xarray.DataArray or xarray.Dataset with unstacked dimension
    """
    # Regenerate Pandas multi-index to be ordered by first appearance
    mindex = array.coords[dim].to_pandas().index

    levels = []
    codes = []

    for levels_i, codes_i in zip(mindex.levels, mindex.codes):
        # Map each old code to a new code, numbered by order of first
        # appearance in the data rather than alphabetically
        level_map: dict[Hashable, int] = {}

        for code in codes_i:
            if code not in level_map:
                level_map[code] = len(level_map)

        # Reorder the level values and rewrite the codes accordingly
        levels.append([levels_i[k] for k in level_map])
        codes.append([level_map[k] for k in codes_i])

    mindex = pandas.MultiIndex(levels, codes, names=mindex.names)
    array = array.copy()
    array.coords[dim] = mindex

    # Invoke builtin unstack
    array = array.unstack((dim,))

    # Convert numpy arrays of Python objects to numpy arrays of C floats, ints,
    # strings, etc.
    # NOTE: `dim` is deliberately rebound here to iterate over the newly
    # created dimensions (the former MultiIndex level names)
    for dim in mindex.names:
        if array.coords[dim].dtype == object:
            array.coords[dim] = array.coords[dim].values.tolist()

    return array
File without changes