PyPI - diffly - Versions diffs - 1.0.0__py3-none-any.whl - Mend

diffly 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

diffly/__init__.py +16 -0
diffly/_cache.py +32 -0
diffly/_compat.py +27 -0
diffly/_conditions.py +208 -0
diffly/_utils.py +63 -0
diffly/cli.py +148 -0
diffly/comparison.py +809 -0
diffly/py.typed +0 -0
diffly/summary.py +770 -0
diffly/testing.py +251 -0
diffly-1.0.0.dist-info/METADATA +116 -0
diffly-1.0.0.dist-info/RECORD +15 -0
diffly-1.0.0.dist-info/WHEEL +4 -0
diffly-1.0.0.dist-info/entry_points.txt +2 -0
diffly-1.0.0.dist-info/licenses/LICENSE +28 -0

diffly/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+import importlib.metadata
+import warnings
+# Look up the version of the installed distribution. If no package metadata is
+# found, emit a warning and fall back to the placeholder "unknown".
+try:
+    __version__ = importlib.metadata.version(__name__)
+except importlib.metadata.PackageNotFoundError as e:  # pragma: no cover
+    warnings.warn(f"Could not determine version of {__name__}\n{e!s}", stacklevel=2)
+    __version__ = "unknown"
+# Re-export `compare_frames` as the only name listed in `__all__`.
+from .comparison import compare_frames
+__all__ = ["compare_frames"]

diffly/_cache.py ADDED Viewed

@@ -0,0 +1,32 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+import functools
+from collections.abc import Callable
+from typing import Any, Concatenate, ParamSpec, TypeVar
+P = ParamSpec("P")
+T = TypeVar("T")
+def cached_method(
+    fn: Callable[Concatenate[Any, P], T],
+) -> Callable[Concatenate[Any, P], T]:
+    """Memoize an instance method per instance and per call arguments.
+    Results are kept in a dictionary that is stored on the instance itself under the
+    attribute ``_<method name>_cache`` and created on the first call. The cache key
+    is built from the positional arguments and the sorted keyword arguments, so all
+    argument values must be hashable.
+    """
+    attr = f"_{fn.__name__}_cache"
+    separator = object()  # unique marker between positional and keyword parts
+    @functools.wraps(fn)
+    def wrapped(self: Any, *args: P.args, **kwargs: P.kwargs) -> T:
+        lookup_key = (*args, separator, *sorted(kwargs.items()))
+        try:
+            store = getattr(self, attr)
+        except AttributeError:
+            # First call on this instance: attach an empty cache.
+            store = {}
+            setattr(self, attr, store)
+        if lookup_key in store:
+            return store[lookup_key]
+        value = fn(self, *args, **kwargs)
+        store[lookup_key] = value
+        return value
+    return wrapped

diffly/_compat.py ADDED Viewed

@@ -0,0 +1,27 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+from typing import Any
+class _DummyModule:  # pragma: no cover
+    """Placeholder object standing in for a module that could not be imported.
+    Looking up any attribute that is not set on the placeholder raises a
+    ``ValueError`` naming the missing module.
+    """
+    def __init__(self, module: str) -> None:
+        # Name of the module this placeholder stands in for.
+        self.module = module
+    def __getattr__(self, name: str) -> Any:
+        # Only reached for attributes that normal lookup did not find.
+        message = f"Module '{self.module}' is not installed."
+        raise ValueError(message)
+# ------------------------------------ DATAFRAMELY ------------------------------------ #
+# If `dataframely` cannot be imported, bind `dy` to a placeholder so that importing
+# this module still succeeds; the placeholder raises once an attribute is accessed.
+try:
+    import dataframely as dy
+except ImportError:  # pragma: no cover
+    dy = _DummyModule("dataframely")  # type: ignore
+# --------------------------------------- TYPER -------------------------------------- #
+# Same fallback for `typer`.
+try:
+    import typer
+except ImportError:  # pragma: no cover
+    typer = _DummyModule("typer")  # type: ignore

diffly/_conditions.py ADDED Viewed

@@ -0,0 +1,208 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+import datetime as dt
+from collections.abc import Mapping
+import polars as pl
+from polars.datatypes import DataType, DataTypeClass
+from diffly._utils import (
+    ABS_TOL_DEFAULT,
+    ABS_TOL_TEMPORAL_DEFAULT,
+    REL_TOL_DEFAULT,
+    Side,
+)
+def condition_equal_rows(
+    columns: list[str],
+    schema_left: pl.Schema,
+    schema_right: pl.Schema,
+    abs_tol_by_column: Mapping[str, float],
+    rel_tol_by_column: Mapping[str, float],
+    abs_tol_temporal_by_column: Mapping[str, dt.timedelta],
+) -> pl.Expr:
+    """Build an expression stating whether two rows agree on all given columns.
+    One condition per column is obtained from :func:`condition_equal_columns`, with
+    the dtypes and tolerances looked up by column name, and the conditions are
+    combined with ``pl.all_horizontal``. Without any columns, the result is the
+    literal ``True``.
+    """
+    if not columns:
+        return pl.lit(True)
+    per_column: list[pl.Expr] = []
+    for name in columns:
+        per_column.append(
+            condition_equal_columns(
+                column=name,
+                dtype_left=schema_left[name],
+                dtype_right=schema_right[name],
+                abs_tol=abs_tol_by_column[name],
+                rel_tol=rel_tol_by_column[name],
+                abs_tol_temporal=abs_tol_temporal_by_column[name],
+            )
+        )
+    return pl.all_horizontal(per_column)
+def condition_equal_columns(
+    column: str,
+    dtype_left: pl.DataType,
+    dtype_right: pl.DataType,
+    abs_tol: float = ABS_TOL_DEFAULT,
+    rel_tol: float = REL_TOL_DEFAULT,
+    abs_tol_temporal: dt.timedelta = ABS_TOL_TEMPORAL_DEFAULT,
+) -> pl.Expr:
+    """Build an expression stating whether a column is equal on both sides.
+    The two sides are read from the columns named ``f"{column}_{Side.LEFT}"`` and
+    ``f"{column}_{Side.RIGHT}"``; how they are compared depends on their dtypes and
+    is decided by :func:`_compare_columns`.
+    """
+    left_expr = pl.col(f"{column}_{Side.LEFT}")
+    right_expr = pl.col(f"{column}_{Side.RIGHT}")
+    return _compare_columns(
+        col_left=left_expr,
+        col_right=right_expr,
+        dtype_left=dtype_left,
+        dtype_right=dtype_right,
+        abs_tol=abs_tol,
+        rel_tol=rel_tol,
+        abs_tol_temporal=abs_tol_temporal,
+    )
+# --------------------------------------- UTILS -------------------------------------- #
+def _can_compare_dtypes(
+    dtype_left: DataType | DataTypeClass,
+    dtype_right: DataType | DataTypeClass,
+) -> bool:
+    """Return whether values of the two dtypes are considered comparable.
+    Equal dtypes are comparable, and so is any pair where either side is ``Null``.
+    Otherwise both dtypes must agree on each of these properties: being
+    numeric-or-boolean, being temporal, being nested, and being a struct.
+    """
+    if dtype_left == dtype_right:
+        return True
+    if dtype_left == pl.Null or dtype_right == pl.Null:
+        return True
+    def numeric_or_bool(dtype: DataType | DataTypeClass) -> bool:
+        return dtype.is_numeric() or dtype == pl.Boolean
+    same_numeric_kind = numeric_or_bool(dtype_left) == numeric_or_bool(dtype_right)
+    same_temporal_kind = dtype_left.is_temporal() == dtype_right.is_temporal()
+    same_nested_kind = dtype_left.is_nested() == dtype_right.is_nested()
+    same_struct_kind = (dtype_left == pl.Struct) == (dtype_right == pl.Struct)
+    return (
+        same_numeric_kind
+        and same_temporal_kind
+        and same_nested_kind
+        and same_struct_kind
+    )
+def _compare_columns(
+    col_left: pl.Expr,
+    col_right: pl.Expr,
+    dtype_left: DataType | DataTypeClass,
+    dtype_right: DataType | DataTypeClass,
+    abs_tol: float,
+    rel_tol: float,
+    abs_tol_temporal: dt.timedelta,
+) -> pl.Expr:
+    """Build an expression whether two expressions yield the same value.
+    This method is more generic than :meth:`condition_equal_columns` as it accepts two
+    arbitrary expressions rather than a "base column name".
+    The comparison strategy is chosen from the dtypes: incomparable dtypes yield a
+    constant ``False`` column, two structs are compared field by field (recursively),
+    enums with differing categories and enum/categorical pairs are compared as
+    strings, and everything else is handed to :func:`_compare_primitive_columns`.
+    """
+    if not _can_compare_dtypes(dtype_left, dtype_right):
+        return pl.repeat(pl.lit(False), pl.len())
+    # Nested dtypes need special treatment, but only structs can be decomposed.
+    # Both sides must be struct instances: `_can_compare_dtypes` also lets a struct
+    # pass when the other side is `Null`, and a `Null` dtype has no fields. Such a
+    # pair falls through to the primitive comparison, like the mirrored case where
+    # the `Null` column is on the left.
+    if isinstance(dtype_left, pl.Struct) and isinstance(dtype_right, pl.Struct):
+        # For two structs, we necessarily need to have matching field names (the
+        # order does not matter). If that isn't the case, we cannot observe equality
+        fields_left = {f.name: f.dtype for f in dtype_left.fields}
+        fields_right = {f.name: f.dtype for f in dtype_right.fields}
+        if fields_left.keys() != fields_right.keys():
+            return pl.repeat(pl.lit(False), pl.len())
+        # Otherwise, we simply compare all fields independently
+        return pl.all_horizontal(
+            [
+                _compare_columns(
+                    col_left=col_left.struct[field],
+                    col_right=col_right.struct[field],
+                    dtype_left=fields_left[field],
+                    dtype_right=fields_right[field],
+                    abs_tol=abs_tol,
+                    rel_tol=rel_tol,
+                    abs_tol_temporal=abs_tol_temporal,
+                )
+                for field in fields_left
+            ]
+        )
+    # As of polars 1.28, there is no way to access another column within
+    # `list.eval`. Hence, lists and arrays necessarily resort to the primitive
+    # comparison below.
+    if (
+        isinstance(dtype_left, pl.Enum)
+        and isinstance(dtype_right, pl.Enum)
+        and dtype_left != dtype_right
+    ) or _enum_and_categorical(dtype_left, dtype_right):
+        # Enums with different categories as well as enums and categoricals
+        # can't be compared directly.
+        # Fall back to comparison of strings.
+        return _compare_columns(
+            col_left=col_left.cast(pl.String),
+            col_right=col_right.cast(pl.String),
+            dtype_left=pl.String,
+            dtype_right=pl.String,
+            abs_tol=abs_tol,
+            rel_tol=rel_tol,
+            abs_tol_temporal=abs_tol_temporal,
+        )
+    return _compare_primitive_columns(
+        col_left=col_left,
+        col_right=col_right,
+        dtype_left=dtype_left,
+        dtype_right=dtype_right,
+        abs_tol=abs_tol,
+        rel_tol=rel_tol,
+        abs_tol_temporal=abs_tol_temporal,
+    )
+def _compare_primitive_columns(
+    col_left: pl.Expr,
+    col_right: pl.Expr,
+    dtype_left: DataType | DataTypeClass,
+    dtype_right: DataType | DataTypeClass,
+    abs_tol: float,
+    rel_tol: float,
+    abs_tol_temporal: dt.timedelta,
+) -> pl.Expr:
+    """Build the equality expression for two columns that are compared as a whole.
+    Numeric pairs with at least one float side use ``is_close`` with the given
+    absolute and relative tolerance. Temporal pairs compare the absolute difference
+    against ``abs_tol_temporal``. Both results are post-processed by the
+    missing-value helpers. All other pairs use ``eq_missing``.
+    """
+    any_float = dtype_left.is_float() or dtype_right.is_float()
+    if any_float and dtype_left.is_numeric() and dtype_right.is_numeric():
+        within_tolerance = col_left.is_close(
+            col_right, abs_tol=abs_tol, rel_tol=rel_tol
+        )
+        return _eq_missing_with_nan(within_tolerance, lhs=col_left, rhs=col_right)
+    if dtype_left.is_temporal() and dtype_right.is_temporal():
+        within_tolerance = (col_left - col_right).abs() <= abs_tol_temporal
+        return _eq_missing(within_tolerance, lhs=col_left, rhs=col_right)
+    return col_left.eq_missing(col_right)
+def _eq_missing(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
+    """Combine a comparison result with the null state of both operands.
+    The result is ``(expr AND both operands non-null) OR both operands null``.
+    """
+    neither_missing = lhs.is_not_null() & rhs.is_not_null()
+    both_missing = lhs.is_null() & rhs.is_null()
+    return (expr & neither_missing) | both_missing
+def _eq_missing_with_nan(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
+    """Like :func:`_eq_missing`, additionally OR-ed with "both operands are NaN"."""
+    null_aware = _eq_missing(expr, lhs, rhs)
+    nan_on_both_sides = lhs.is_nan() & rhs.is_nan()
+    return null_aware | nan_on_both_sides
+def _enum_and_categorical(
+    left: DataType | DataTypeClass, right: DataType | DataTypeClass
+) -> bool:
+    """Return whether one dtype is an enum instance and the other a categorical one."""
+    if isinstance(left, pl.Enum):
+        return isinstance(right, pl.Categorical)
+    if isinstance(left, pl.Categorical):
+        return isinstance(right, pl.Enum)
+    return False

diffly/_utils.py ADDED Viewed

@@ -0,0 +1,63 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+import datetime as dt
+from collections.abc import Mapping
+from enum import StrEnum
+from typing import TypeVar
+import polars as pl
+def lazy_len(lf: pl.LazyFrame) -> int:
+    """Collect the lazy frame's ``pl.len()`` and return it as a scalar."""
+    length_frame = lf.select(pl.len()).collect()
+    return length_frame.item()
+def is_primary_key(lf: pl.LazyFrame, columns: list[str]) -> bool:
+    """Return whether no combination of values in `columns` is duplicated."""
+    any_duplicate = pl.struct(*columns).is_duplicated().any()
+    has_duplicates = lf.select(any_duplicate).collect().item()
+    return not has_duplicates
+def get_select_columns(keep: list[str], expand: list[str]) -> list[str]:
+    """Return `keep` followed by the side-suffixed names of every column in `expand`.
+    Each column in `expand` contributes ``f"{col}_{Side.LEFT}"`` and then
+    ``f"{col}_{Side.RIGHT}"``, in the order of `expand`.
+    """
+    # A flat comprehension avoids the quadratic copying of `sum(lists, [])`.
+    return keep + [
+        f"{col}_{side}" for col in expand for side in (Side.LEFT, Side.RIGHT)
+    ]
+T = TypeVar("T", float, dt.timedelta, int)
+def make_and_validate_mapping(
+    value_or_mapping: T | Mapping[str, T], other_common_columns: list[str]
+) -> dict[str, T]:
+    """Normalize a scalar or a per-column mapping into a dict keyed by column.
+    A non-mapping value is assigned to every column in `other_common_columns`. A
+    mapping must provide a value for each of these columns and must not contain any
+    other key; the returned dict follows the order of `other_common_columns`.
+    Raises:
+        KeyError: If the mapping lacks one of the columns or contains extra keys.
+    """
+    if not isinstance(value_or_mapping, Mapping):
+        return dict.fromkeys(other_common_columns, value_or_mapping)
+    resolved: dict[str, T] = {}
+    for column in other_common_columns:
+        try:
+            resolved[column] = value_or_mapping[column]
+        except KeyError:
+            raise KeyError(
+                "The mapping needs to specify a value for every common column except "
+                "the primary key."
+            )
+    unexpected = set(value_or_mapping.keys()) - set(other_common_columns)
+    if unexpected:
+        raise KeyError(
+            f"The mapping must only contain common columns except the primary key. "
+            f"However, it also contains the following columns: {unexpected}."
+        )
+    return resolved
+def capitalize_first(s: str) -> str:
+    """Upper-case the first character of `s` and leave the remainder untouched."""
+    if not s:
+        return s
+    head, tail = s[0], s[1:]
+    return head.upper() + tail
+# Default absolute and relative tolerances.
+ABS_TOL_DEFAULT = 1e-08
+REL_TOL_DEFAULT = 1e-05
+# Default tolerance for temporal values: a zero-length timedelta.
+ABS_TOL_TEMPORAL_DEFAULT = dt.timedelta(0)
+class Side(StrEnum):
+    """Side refers to either the left or right dataframe in a comparison.
+    The member values are interpolated into column names of the form
+    ``f"{col}_{Side.LEFT}"`` and ``f"{col}_{Side.RIGHT}"`` in this module.
+    """
+    LEFT = "left"
+    RIGHT = "right"

diffly/cli.py ADDED Viewed

@@ -0,0 +1,148 @@
+# Copyright (c) QuantCo 2025-2026
+# SPDX-License-Identifier: BSD-3-Clause
+import datetime as dt
+from pathlib import Path
+from typing import Annotated
+import polars as pl
+from diffly import compare_frames
+from ._compat import typer
+from ._utils import ABS_TOL_DEFAULT, ABS_TOL_TEMPORAL_DEFAULT, REL_TOL_DEFAULT
+# Typer application object; the `main` function below is registered as its command.
+app = typer.Typer()
+# NOTE: consecutive string literals in the `help=` texts are concatenated verbatim,
+# so every fragment that continues on the next line must end with a space.
+# The empty-list defaults of `primary_key` and `hidden_columns` are only read in the
+# function body, never mutated.
+@app.command()
+def main(
+    left: Annotated[Path, typer.Argument(help="Path to the left parquet file.")],
+    right: Annotated[Path, typer.Argument(help="Path to the right parquet file.")],
+    primary_key: Annotated[
+        list[str],
+        typer.Option(
+            help=(
+                "Primary key columns to use for joining the data frames. If not "
+                "provided, comparisons based on joins will raise an error."
+            )
+        ),
+    ] = [],
+    abs_tol: Annotated[
+        float,
+        typer.Option(
+            help="Absolute tolerance for numerical comparisons. Default is 1e-08."
+        ),
+    ] = ABS_TOL_DEFAULT,
+    rel_tol: Annotated[
+        float,
+        typer.Option(
+            help="Relative tolerance for numerical comparisons. Default is 1e-05."
+        ),
+    ] = REL_TOL_DEFAULT,
+    abs_tol_temporal: Annotated[
+        float,
+        typer.Option(
+            help=("Absolute tolerance for temporal comparisons. Default is 0 seconds.")
+        ),
+    ] = ABS_TOL_TEMPORAL_DEFAULT.total_seconds(),
+    show_perfect_column_matches: Annotated[
+        bool,
+        typer.Option(
+            help=(
+                "Whether to include column matches in the summary even if the column "
+                "match rate is 100%. Setting this to ``False`` is useful when comparing "
+                "very wide data frames. "
+            )
+        ),
+    ] = True,
+    top_k_column_changes: Annotated[
+        int,
+        typer.Option(
+            help=(
+                "The maximum number of column values changes to display for columns "
+                "with a match rate below 100% in the summary. When enabling this "
+                "feature, make sure that no sensitive data is leaked."
+            )
+        ),
+    ] = 0,
+    sample_k_rows_only: Annotated[
+        int,
+        typer.Option(
+            help=(
+                'The number of rows to show in the Rows "left/right only" '
+                "section of the summary. If 0 (default), no rows are shown. Only the "
+                "primary key will be printed. An error will be raised if a positive "
+                "number is provided and any of the primary key columns is also in "
+                "`hidden_columns`. "
+            )
+        ),
+    ] = 0,
+    show_sample_primary_key_per_change: Annotated[
+        bool,
+        typer.Option(
+            help=(
+                "Whether to show a sample primary key per column change in the summary. "
+                "If False (default), no primary key values are shown. A sample primary "
+                "key can only be shown if `top_k_column_changes` is greater than 0, as "
+                "each sample primary key is linked to a specific column change. An "
+                "error will be raised if True and any of the primary key columns is also "
+                "in `hidden_columns`."
+            )
+        ),
+    ] = False,
+    left_name: Annotated[
+        str,
+        typer.Option(help="Custom display name for the left data frame."),
+    ] = "left",
+    right_name: Annotated[
+        str,
+        typer.Option(help="Custom display name for the right data frame."),
+    ] = "right",
+    slim: Annotated[
+        bool,
+        typer.Option(
+            help=(
+                "Whether to generate a slim summary. In slim mode, the summary is as "
+                "concise as possible, only showing sections that contain differences. "
+                "As the structure of the summary can vary, it should only be used by "
+                "advanced users who are familiar with the summary format."
+            )
+        ),
+    ] = False,
+    hidden_columns: Annotated[
+        list[str],
+        typer.Option(
+            help=(
+                "Columns for which no values are printed, e.g. because they contain "
+                "sensitive information."
+            )
+        ),
+    ] = [],
+) -> None:
+    """Compare two `parquet` files and print the comparison result."""
+    # Both files are scanned lazily. An empty primary-key list is passed as `None`.
+    comparison = compare_frames(
+        pl.scan_parquet(left),
+        pl.scan_parquet(right),
+        primary_key=None if not primary_key else primary_key,
+        abs_tol=abs_tol,
+        rel_tol=rel_tol,
+        # The CLI takes the temporal tolerance in seconds.
+        abs_tol_temporal=dt.timedelta(seconds=abs_tol_temporal),
+    )
+    typer.echo(
+        comparison.summary(
+            show_perfect_column_matches=show_perfect_column_matches,
+            top_k_column_changes=top_k_column_changes,
+            sample_k_rows_only=sample_k_rows_only,
+            show_sample_primary_key_per_change=show_sample_primary_key_per_change,
+            left_name=left_name,
+            right_name=right_name,
+            slim=slim,
+            hidden_columns=hidden_columns,
+        ).format(pretty=True)
+    )
+# Invoke the Typer application when this module is executed as a script.
+if __name__ == "__main__":  # pragma: no cover
+    app()