diffly 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffly/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ import importlib.metadata
5
+ import warnings
6
+
7
# Resolve the installed distribution's version once at import time. The fallback
# value is assigned up front so that `__version__` is always defined; it is replaced
# when the metadata lookup succeeds.
__version__ = "unknown"
try:
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError as e:  # pragma: no cover
    # No distribution metadata is available under this name: warn instead of
    # failing the import and keep the "unknown" fallback.
    warnings.warn(f"Could not determine version of {__name__}\n{e!s}", stacklevel=2)
12
+
13
+
14
+ from .comparison import compare_frames
15
+
16
+ __all__ = ["compare_frames"]
diffly/_cache.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ import functools
5
+ from collections.abc import Callable
6
+ from typing import Any, Concatenate, ParamSpec, TypeVar
7
+
8
+ P = ParamSpec("P")
9
+ T = TypeVar("T")
10
+
11
+
12
+ def cached_method(
13
+ fn: Callable[Concatenate[Any, P], T],
14
+ ) -> Callable[Concatenate[Any, P], T]:
15
+ """Cache all results from the executions from an instance method."""
16
+ cache_name = f"_{fn.__name__}_cache"
17
+ kwd_mark = object() # sentinel for separating args from kwargs
18
+
19
+ @functools.wraps(fn)
20
+ def wrapped(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
21
+ key = args + (kwd_mark,) + tuple(sorted(kwargs.items()))
22
+
23
+ if not hasattr(self, cache_name):
24
+ setattr(self, cache_name, {})
25
+ if key in getattr(self, cache_name):
26
+ return getattr(self, cache_name)[key]
27
+
28
+ result = fn(self, *args, **kwargs)
29
+ getattr(self, cache_name)[key] = result
30
+ return result
31
+
32
+ return wrapped
diffly/_compat.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ from typing import Any
5
+
6
+
7
+ class _DummyModule: # pragma: no cover
8
+ def __init__(self, module: str) -> None:
9
+ self.module = module
10
+
11
+ def __getattr__(self, name: str) -> Any:
12
+ raise ValueError(f"Module '{self.module}' is not installed.")
13
+
14
+
15
+ # ------------------------------------ DATAFRAMELY ------------------------------------ #
16
+
17
+ try:
18
+ import dataframely as dy
19
+ except ImportError: # pragma: no cover
20
+ dy = _DummyModule("dataframely") # type: ignore
21
+
22
+ # --------------------------------------- TYPER -------------------------------------- #
23
+
24
+ try:
25
+ import typer
26
+ except ImportError: # pragma: no cover
27
+ typer = _DummyModule("typer") # type: ignore
diffly/_conditions.py ADDED
@@ -0,0 +1,208 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ import datetime as dt
5
+ from collections.abc import Mapping
6
+
7
+ import polars as pl
8
+ from polars.datatypes import DataType, DataTypeClass
9
+
10
+ from diffly._utils import (
11
+ ABS_TOL_DEFAULT,
12
+ ABS_TOL_TEMPORAL_DEFAULT,
13
+ REL_TOL_DEFAULT,
14
+ Side,
15
+ )
16
+
17
+
18
def condition_equal_rows(
    columns: list[str],
    schema_left: pl.Schema,
    schema_right: pl.Schema,
    abs_tol_by_column: Mapping[str, float],
    rel_tol_by_column: Mapping[str, float],
    abs_tol_temporal_by_column: Mapping[str, dt.timedelta],
) -> pl.Expr:
    """Build an expression that is true where all given columns match in a row.

    Each column is compared with :func:`condition_equal_columns`, using that
    column's dtypes from the two schemas and its entries in the tolerance mappings.
    The per-column results are combined with a horizontal "all".

    Args:
        columns: Base names of the columns to compare.
        schema_left: Schema used to look up the left dtype of each column.
        schema_right: Schema used to look up the right dtype of each column.
        abs_tol_by_column: Absolute tolerance per column.
        rel_tol_by_column: Relative tolerance per column.
        abs_tol_temporal_by_column: Absolute temporal tolerance per column.

    Returns:
        The combined expression, or a literal ``True`` when ``columns`` is empty.
    """
    per_column: list[pl.Expr] = []
    for name in columns:
        per_column.append(
            condition_equal_columns(
                column=name,
                dtype_left=schema_left[name],
                dtype_right=schema_right[name],
                abs_tol=abs_tol_by_column[name],
                rel_tol=rel_tol_by_column[name],
                abs_tol_temporal=abs_tol_temporal_by_column[name],
            )
        )

    # With nothing to compare, rows are trivially equal.
    if not per_column:
        return pl.lit(True)
    return pl.all_horizontal(per_column)
44
+
45
+
46
def condition_equal_columns(
    column: str,
    dtype_left: pl.DataType,
    dtype_right: pl.DataType,
    abs_tol: float = ABS_TOL_DEFAULT,
    rel_tol: float = REL_TOL_DEFAULT,
    abs_tol_temporal: dt.timedelta = ABS_TOL_TEMPORAL_DEFAULT,
) -> pl.Expr:
    """Build an expression that is true where the two sides of a column match.

    The two sides are addressed as ``<column>_<Side.LEFT>`` and
    ``<column>_<Side.RIGHT>``; the actual comparison, which depends on the data
    types, is delegated to :func:`_compare_columns`.

    Args:
        column: Base name of the column, without the side suffix.
        dtype_left: Data type of the left column.
        dtype_right: Data type of the right column.
        abs_tol: Absolute tolerance passed on to the comparison.
        rel_tol: Relative tolerance passed on to the comparison.
        abs_tol_temporal: Absolute temporal tolerance passed on to the comparison.

    Returns:
        The comparison expression for this column.
    """
    left_expr = pl.col(f"{column}_{Side.LEFT}")
    right_expr = pl.col(f"{column}_{Side.RIGHT}")
    return _compare_columns(
        col_left=left_expr,
        col_right=right_expr,
        dtype_left=dtype_left,
        dtype_right=dtype_right,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        abs_tol_temporal=abs_tol_temporal,
    )
65
+
66
+
67
+ # --------------------------------------- UTILS -------------------------------------- #
68
+
69
+
70
def _can_compare_dtypes(
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
) -> bool:
    """Return whether values of the two data types may be compared at all.

    Two dtypes are comparable when they are equal, when either one is ``pl.Null``,
    or when they agree on each of the following properties: being numeric or
    boolean, being temporal, being nested, and being a struct.
    """
    if dtype_left == dtype_right:
        return True
    if dtype_left == pl.Null or dtype_right == pl.Null:
        return True

    def numeric_or_bool(dtype: DataType | DataTypeClass) -> bool:
        # Booleans are grouped together with numeric types.
        return dtype.is_numeric() or dtype == pl.Boolean

    return (
        numeric_or_bool(dtype_left) == numeric_or_bool(dtype_right)
        and dtype_left.is_temporal() == dtype_right.is_temporal()
        and dtype_left.is_nested() == dtype_right.is_nested()
        and (dtype_left == pl.Struct) == (dtype_right == pl.Struct)
    )
88
+
89
+
90
def _compare_columns(
    col_left: pl.Expr,
    col_right: pl.Expr,
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
    abs_tol: float,
    rel_tol: float,
    abs_tol_temporal: dt.timedelta,
) -> pl.Expr:
    """Build an expression whether two expressions yield the same value.

    This method is more generic than :meth:`condition_equal_columns` as it accepts two
    arbitrary expressions rather than a "base column name".

    The comparison strategy depends on the data types:

    - Data types rejected by :func:`_can_compare_dtypes` never compare equal.
    - Two structs are compared field by field (recursively); structs with different
      field names never compare equal.
    - Enums with different categories, and an enum paired with a categorical, are
      compared as strings.
    - Everything else, including lists and arrays, is handed to
      :func:`_compare_primitive_columns`.

    Args:
        col_left: Expression yielding the left values.
        col_right: Expression yielding the right values.
        dtype_left: Data type of ``col_left``.
        dtype_right: Data type of ``col_right``.
        abs_tol: Absolute tolerance for the comparison.
        rel_tol: Relative tolerance for the comparison.
        abs_tol_temporal: Absolute tolerance for temporal values.

    Returns:
        A boolean expression.
    """
    if not _can_compare_dtypes(dtype_left, dtype_right):
        return pl.repeat(pl.lit(False), pl.len())

    # `_can_compare_dtypes` also accepts a `Null` dtype on either side, so a struct
    # on the left does not guarantee a struct on the right. Only take the
    # field-wise path when both sides are struct instances; otherwise fall through
    # to the generic comparison below, just as a non-struct left side does.
    if isinstance(dtype_left, pl.Struct) and isinstance(dtype_right, pl.Struct):
        # For two structs, we necessarily need to have matching field names (the
        # order does not matter). If that isn't the case, we cannot observe equality
        fields_left = {f.name: f.dtype for f in dtype_left.fields}
        fields_right = {f.name: f.dtype for f in dtype_right.fields}
        if fields_left.keys() != fields_right.keys():
            return pl.repeat(pl.lit(False), pl.len())

        # Otherwise, we simply compare all fields independently
        return pl.all_horizontal(
            [
                _compare_columns(
                    col_left=col_left.struct[field],
                    col_right=col_right.struct[field],
                    dtype_left=fields_left[field],
                    dtype_right=fields_right[field],
                    abs_tol=abs_tol,
                    rel_tol=rel_tol,
                    abs_tol_temporal=abs_tol_temporal,
                )
                for field in fields_left
            ]
        )

    # Lists and arrays get no special treatment: as of polars 1.28, there is no way
    # to access another column within `list.eval`. Hence, we necessarily need to
    # resort to a primitive comparison in this case.

    if (
        isinstance(dtype_left, pl.Enum)
        and isinstance(dtype_right, pl.Enum)
        and dtype_left != dtype_right
    ) or _enum_and_categorical(dtype_left, dtype_right):
        # Enums with different categories as well as enums and categoricals
        # can't be compared directly.
        # Fall back to comparison of strings.
        return _compare_columns(
            col_left=col_left.cast(pl.String),
            col_right=col_right.cast(pl.String),
            dtype_left=pl.String,
            dtype_right=pl.String,
            abs_tol=abs_tol,
            rel_tol=rel_tol,
            abs_tol_temporal=abs_tol_temporal,
        )

    return _compare_primitive_columns(
        col_left=col_left,
        col_right=col_right,
        dtype_left=dtype_left,
        dtype_right=dtype_right,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        abs_tol_temporal=abs_tol_temporal,
    )
168
+
169
+
170
def _compare_primitive_columns(
    col_left: pl.Expr,
    col_right: pl.Expr,
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
    abs_tol: float,
    rel_tol: float,
    abs_tol_temporal: dt.timedelta,
) -> pl.Expr:
    """Build the comparison expression for two non-struct expressions.

    - If both dtypes are numeric and at least one of them is a float, values are
      compared with ``is_close`` using ``abs_tol`` and ``rel_tol``; the result is
      then adjusted for nulls and NaNs by :func:`_eq_missing_with_nan`.
    - If both dtypes are temporal, values match when their absolute difference is
      at most ``abs_tol_temporal``; the result is then adjusted for nulls by
      :func:`_eq_missing`.
    - Otherwise, ``eq_missing`` is used.
    """
    both_numeric = dtype_left.is_numeric() and dtype_right.is_numeric()
    any_float = dtype_left.is_float() or dtype_right.is_float()
    if any_float and both_numeric:
        close = col_left.is_close(col_right, abs_tol=abs_tol, rel_tol=rel_tol)
        return _eq_missing_with_nan(close, lhs=col_left, rhs=col_right)

    if dtype_left.is_temporal() and dtype_right.is_temporal():
        within_tolerance = (col_left - col_right).abs() <= abs_tol_temporal
        return _eq_missing(within_tolerance, lhs=col_left, rhs=col_right)

    return col_left.eq_missing(col_right)
190
+
191
+
192
def _eq_missing(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
    """Make a comparison result treat two nulls as equal.

    Args:
        expr: Comparison result for ``lhs`` and ``rhs``.
        lhs: Left operand of the comparison.
        rhs: Right operand of the comparison.

    Returns:
        An expression that is true where both operands are null, or where both are
        non-null and ``expr`` is true.
    """
    nulls_on_both_sides = lhs.is_null() & rhs.is_null()
    values_on_both_sides = lhs.is_not_null() & rhs.is_not_null()
    matching_values = expr & values_on_both_sides
    return matching_values | nulls_on_both_sides
196
+
197
+
198
def _eq_missing_with_nan(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
    """Make a comparison result treat two nulls, and two NaNs, as equal.

    Args:
        expr: Comparison result for ``lhs`` and ``rhs``.
        lhs: Left operand of the comparison.
        rhs: Right operand of the comparison.

    Returns:
        An expression that is true where :func:`_eq_missing` is true or where both
        operands are NaN.
    """
    # `is_nan` yields null for null inputs. Without `fill_null`, a null on one side
    # and a NaN on the other would make `both_nan` null, and `False | null` is null
    # rather than False. A missing NaN check simply means "not both NaN".
    both_nan = (lhs.is_nan() & rhs.is_nan()).fill_null(False)
    return _eq_missing(expr, lhs, rhs) | both_nan
201
+
202
+
203
def _enum_and_categorical(
    left: DataType | DataTypeClass, right: DataType | DataTypeClass
) -> bool:
    """Return whether one dtype is an enum and the other one a categorical."""
    # Check both orderings of the pair.
    return any(
        isinstance(first, pl.Enum) and isinstance(second, pl.Categorical)
        for first, second in ((left, right), (right, left))
    )
diffly/_utils.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ import datetime as dt
5
+ from collections.abc import Mapping
6
+ from enum import StrEnum
7
+ from typing import TypeVar
8
+
9
+ import polars as pl
10
+
11
+
12
def lazy_len(lf: pl.LazyFrame) -> int:
    """Collect and return the number of rows of a lazy frame."""
    row_count = lf.select(pl.len()).collect()
    return row_count.item()
14
+
15
+
16
def is_primary_key(lf: pl.LazyFrame, columns: list[str]) -> bool:
    """Return whether no combination of values in ``columns`` is duplicated.

    The columns are combined into one struct per row and the frame is collected to
    check whether any of these structs is marked as duplicated.
    """
    key = pl.struct(*columns)
    has_duplicates = lf.select(key.is_duplicated().any()).collect().item()
    return not has_duplicates
18
+
19
+
20
def get_select_columns(keep: list[str], expand: list[str]) -> list[str]:
    """Return ``keep`` followed by the side-suffixed names of the ``expand`` columns.

    Every column ``c`` in ``expand`` contributes two names, in this order:
    ``c_<Side.LEFT>`` and ``c_<Side.RIGHT>``.

    Args:
        keep: Column names to include unchanged.
        expand: Base column names to expand into a left and a right variant.

    Returns:
        A new list; neither input list is modified.
    """
    # A flat comprehension instead of `sum(list_of_lists, [])`, which copies the
    # accumulated list for every element and is therefore quadratic.
    return keep + [
        f"{col}_{side}" for col in expand for side in (Side.LEFT, Side.RIGHT)
    ]
24
+
25
+
26
+ T = TypeVar("T", float, dt.timedelta, int)
27
+
28
+
29
+ def make_and_validate_mapping(
30
+ value_or_mapping: T | Mapping[str, T], other_common_columns: list[str]
31
+ ) -> dict[str, T]:
32
+ if isinstance(value_or_mapping, Mapping):
33
+ for col in other_common_columns:
34
+ try:
35
+ value_or_mapping[col]
36
+ except KeyError:
37
+ raise KeyError(
38
+ "The mapping needs to specify a value for every common column except "
39
+ "the primary key."
40
+ )
41
+ if diff := (set(value_or_mapping.keys()) - set(other_common_columns)):
42
+ raise KeyError(
43
+ f"The mapping must only contain common columns except the primary key. "
44
+ f"However, it also contains the following columns: {diff}."
45
+ )
46
+ return {col: value_or_mapping[col] for col in other_common_columns}
47
+ return {col: value_or_mapping for col in other_common_columns}
48
+
49
+
50
def capitalize_first(s: str) -> str:
    """Upper-case the first character of ``s`` and leave the rest unchanged.

    An empty string is returned as is.
    """
    if not s:
        return s
    head, tail = s[0], s[1:]
    return head.upper() + tail
52
+
53
+
54
+ ABS_TOL_DEFAULT = 1e-08
55
+ REL_TOL_DEFAULT = 1e-05
56
+ ABS_TOL_TEMPORAL_DEFAULT = dt.timedelta(0)
57
+
58
+
59
+ class Side(StrEnum):
60
+ "Side refers to either the left or right dataframe in a comparison."
61
+
62
+ LEFT = "left"
63
+ RIGHT = "right"
diffly/cli.py ADDED
@@ -0,0 +1,148 @@
1
+ # Copyright (c) QuantCo 2025-2026
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+ import datetime as dt
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import polars as pl
9
+
10
+ from diffly import compare_frames
11
+
12
+ from ._compat import typer
13
+ from ._utils import ABS_TOL_DEFAULT, ABS_TOL_TEMPORAL_DEFAULT, REL_TOL_DEFAULT
14
+
15
app = typer.Typer()


@app.command()
def main(
    left: Annotated[Path, typer.Argument(help="Path to the left parquet file.")],
    right: Annotated[Path, typer.Argument(help="Path to the right parquet file.")],
    primary_key: Annotated[
        list[str],
        typer.Option(
            help=(
                "Primary key columns to use for joining the data frames. If not "
                "provided, comparisons based on joins will raise an error."
            )
        ),
    ] = [],
    abs_tol: Annotated[
        float,
        typer.Option(
            help="Absolute tolerance for numerical comparisons. Default is 1e-08."
        ),
    ] = ABS_TOL_DEFAULT,
    rel_tol: Annotated[
        float,
        typer.Option(
            help="Relative tolerance for numerical comparisons. Default is 1e-05."
        ),
    ] = REL_TOL_DEFAULT,
    abs_tol_temporal: Annotated[
        float,
        typer.Option(
            help=("Absolute tolerance for temporal comparisons. Default is 0 seconds.")
        ),
    ] = ABS_TOL_TEMPORAL_DEFAULT.total_seconds(),
    show_perfect_column_matches: Annotated[
        bool,
        typer.Option(
            help=(
                "Whether to include column matches in the summary even if the column "
                "match rate is 100%. Setting this to ``False`` is useful when comparing "
                "very wide data frames. "
            )
        ),
    ] = True,
    top_k_column_changes: Annotated[
        int,
        typer.Option(
            help=(
                "The maximum number of column values changes to display for columns "
                "with a match rate below 100% in the summary. When enabling this "
                "feature, make sure that no sensitive data is leaked."
            )
        ),
    ] = 0,
    sample_k_rows_only: Annotated[
        int,
        typer.Option(
            help=(
                'The number of rows to show in the Rows "left/right only" '
                "section of the summary. If 0 (default), no rows are shown. Only the "
                "primary key will be printed. An error will be raised if a positive "
                "number is provided and any of the primary key columns is also in "
                "`hidden_columns`. "
            )
        ),
    ] = 0,
    show_sample_primary_key_per_change: Annotated[
        bool,
        typer.Option(
            # NOTE: adjacent literals are concatenated verbatim, so every line but
            # the last must end with a space.
            help=(
                "Whether to show a sample primary key per column change in the "
                "summary. If False (default), no primary key values are shown. A "
                "sample primary key can only be shown if `top_k_column_changes` is "
                "greater than 0, as each sample primary key is linked to a specific "
                "column change. An error will be raised if True and any of the "
                "primary key columns is also in `hidden_columns`."
            )
        ),
    ] = False,
    left_name: Annotated[
        str,
        typer.Option(help="Custom display name for the left data frame."),
    ] = "left",
    right_name: Annotated[
        str,
        typer.Option(help="Custom display name for the right data frame."),
    ] = "right",
    slim: Annotated[
        bool,
        typer.Option(
            help=(
                "Whether to generate a slim summary. In slim mode, the summary is as "
                "concise as possible, only showing sections that contain differences. "
                "As the structure of the summary can vary, it should only be used by "
                "advanced users who are familiar with the summary format."
            )
        ),
    ] = False,
    hidden_columns: Annotated[
        list[str],
        typer.Option(
            help=(
                "Columns for which no values are printed, e.g. because they contain "
                "sensitive information."
            )
        ),
    ] = [],
) -> None:
    """Compare two `parquet` files and print the comparison result."""

    comparison = compare_frames(
        pl.scan_parquet(left),
        pl.scan_parquet(right),
        # An empty list means that no primary key was given on the command line.
        primary_key=primary_key or None,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        # The option is a number of seconds; the comparison expects a timedelta.
        abs_tol_temporal=dt.timedelta(seconds=abs_tol_temporal),
    )
    typer.echo(
        comparison.summary(
            show_perfect_column_matches=show_perfect_column_matches,
            top_k_column_changes=top_k_column_changes,
            sample_k_rows_only=sample_k_rows_only,
            show_sample_primary_key_per_change=show_sample_primary_key_per_change,
            left_name=left_name,
            right_name=right_name,
            slim=slim,
            hidden_columns=hidden_columns,
        ).format(pretty=True)
    )


if __name__ == "__main__":  # pragma: no cover
    app()