PyPI - dsdiff - Versions diffs - 0.2.0__py3-none-any.whl - Mend

dsdiff 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

dsdiff/__init__.py +19 -0
dsdiff/__main__.py +4 -0
dsdiff/cli.py +114 -0
dsdiff/compare.py +161 -0
dsdiff/dataset.py +174 -0
dsdiff/drift.py +63 -0
dsdiff/profile.py +151 -0
dsdiff/render.py +55 -0
dsdiff-0.2.0.dist-info/METADATA +138 -0
dsdiff-0.2.0.dist-info/RECORD +13 -0
dsdiff-0.2.0.dist-info/WHEEL +4 -0
dsdiff-0.2.0.dist-info/entry_points.txt +2 -0
dsdiff-0.2.0.dist-info/licenses/LICENSE +21 -0

dsdiff/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""dsdiff: diff two dataset files by schema and column-level drift."""
+from dsdiff.compare import Finding, FindingKind, compare_files, compare_profiles
+from dsdiff.dataset import DatasetProfile, profile_file
+from dsdiff.drift import Severity, psi_from_counts
+__version__ = "0.2.0"
+__all__ = [
+    "DatasetProfile",
+    "Finding",
+    "FindingKind",
+    "Severity",
+    "__version__",
+    "compare_files",
+    "compare_profiles",
+    "profile_file",
+    "psi_from_counts",
+]

dsdiff/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from dsdiff.cli import entrypoint
+# Running the package as a script invokes the same entry point as the console script.
+if __name__ == "__main__":
+    entrypoint()

dsdiff/cli.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Command-line interface for dsdiff."""
+from __future__ import annotations
+import json
+import sys
+from enum import Enum
+from pathlib import Path
+import typer
+from rich.console import Console
+from dsdiff import __version__
+from dsdiff.compare import compare_files, has_blocking
+from dsdiff.dataset import profile_file
+from dsdiff.drift import Severity
+from dsdiff.render import findings_to_json, render_markdown, render_terminal
+# Typer application; the commands defined in this module are registered on it.
+app = typer.Typer(
+    add_completion=False,
+    no_args_is_help=True,
+    help="Diff two dataset files: schema changes plus column-level drift.",
+)
+# Two consoles: reports are printed through _out, diagnostics through _err (stderr).
+_out = Console()
+_err = Console(stderr=True)
+# Process exit codes used by the commands in this module.
+EXIT_OK = 0
+EXIT_BLOCKING = 1
+EXIT_BAD_INPUT = 2
+class FailOn(str, Enum):
+    """Accepted values for the ``--fail-on`` option of the ``diff`` command."""
+    high = "high"
+    medium = "medium"
+def _version_callback(value: bool) -> None:
+    if value:
+        _out.print(f"dsdiff {__version__}")
+        raise typer.Exit()
+# Root callback: it only hosts the eager --version option and does no work itself.
+@app.callback()
+def main(
+    _version: bool = typer.Option(
+        False,
+        "--version",
+        callback=_version_callback,
+        is_eager=True,
+        help="Show the version and exit.",
+    ),
+) -> None:
+    """dsdiff command-line interface."""
+@app.command("diff")
+def diff(
+    baseline: Path = typer.Argument(..., help="Baseline dataset, or a saved profile JSON."),
+    candidate: Path = typer.Argument(..., help="Dataset to compare against the baseline."),
+    as_json: bool = typer.Option(False, "--json", help="Emit findings as JSON."),
+    markdown: bool = typer.Option(False, "--markdown", help="Emit a Markdown table."),
+    check: bool = typer.Option(False, "--check", help="Exit non-zero on blocking findings."),
+    fail_on: FailOn = typer.Option(
+        FailOn.high, "--fail-on", help="Severity that --check treats as blocking."
+    ),
+) -> None:
+    """Compare two datasets and report schema and distribution changes."""
+    try:
+        findings = compare_files(baseline, candidate)
+    except (OSError, ValueError, json.JSONDecodeError) as exc:
+        _err.print(f"dsdiff: {exc}")
+        raise typer.Exit(EXIT_BAD_INPUT) from exc
+    if as_json:
+        _out.print_json(json.dumps(findings_to_json(findings)))
+    elif markdown:
+        _out.print(render_markdown(findings))
+    else:
+        _out.print(render_terminal(findings))
+    threshold = Severity.HIGH if fail_on is FailOn.high else Severity.MEDIUM
+    if check and has_blocking(findings, threshold):
+        raise typer.Exit(EXIT_BLOCKING)
+@app.command("profile")
+def profile(
+    dataset: Path = typer.Argument(..., help="Dataset to profile."),
+    output: Path | None = typer.Option(
+        None, "-o", "--output", help="Write the profile JSON here (default: stdout)."
+    ),
+) -> None:
+    """Write a committable baseline profile of a dataset."""
+    try:
+        prof = profile_file(dataset)
+    except (OSError, ValueError) as exc:
+        _err.print(f"dsdiff: {exc}")
+        raise typer.Exit(EXIT_BAD_INPUT) from exc
+    payload = json.dumps(prof.to_dict(), indent=2)
+    if output is None:
+        _out.print_json(payload)
+    else:
+        Path(output).write_text(payload + "\n", encoding="utf-8")
+        _err.print(f"dsdiff: wrote {output}")
+def entrypoint() -> None:
+    """Run the Typer app, turning Ctrl-C into a short message and exit status 130."""
+    try:
+        app()
+    except KeyboardInterrupt:  # pragma: no cover - interactive only
+        print("dsdiff: interrupted", file=sys.stderr)
+        # ``from None`` hides the KeyboardInterrupt traceback from the user.
+        raise SystemExit(130) from None

dsdiff/compare.py ADDED Viewed

@@ -0,0 +1,161 @@
+"""Turn two dataset profiles into an ordered list of findings.
+A finding is one human-readable difference with a severity. The high-level
+:func:`compare_files` ties it together: it profiles the baseline, bins the new
+dataset against the baseline's edges so drift is comparable, and runs the
+comparison. The baseline may be a data file or a previously saved profile JSON.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from dsdiff.dataset import DatasetProfile, profile_file
+from dsdiff.drift import (
+    Severity,
+    psi_from_counts,
+    psi_from_frequencies,
+    severity_for_psi,
+)
+from dsdiff.profile import NUMERIC, ColumnProfile, category_frequencies
+# Absolute change in a column's null rate (a fraction) that triggers each severity.
+_NULL_RATE_HIGH = 0.2
+_NULL_RATE_MEDIUM = 0.05
+# Sort rank per severity: lower ranks sort first, so HIGH findings lead the report.
+_SEVERITY_ORDER = {Severity.HIGH: 0, Severity.MEDIUM: 1, Severity.LOW: 2}
+class FindingKind(str, Enum):
+    """Categories of difference that a comparison can report."""
+    COLUMN_ADDED = "column_added"
+    COLUMN_REMOVED = "column_removed"
+    TYPE_CHANGED = "type_changed"
+    NULL_RATE = "null_rate"
+    CARDINALITY = "cardinality"
+    DRIFT = "drift"
+@dataclass(frozen=True, slots=True)
+class Finding:
+    """One reported difference between a baseline column and a candidate column."""
+    column: str  # name of the affected column
+    kind: FindingKind  # category of change
+    severity: Severity  # how serious the change is
+    detail: str  # human-readable description of the change
+    psi: float | None = None  # PSI value; in this module only drift findings set it
+    @property
+    def sort_key(self) -> tuple[int, str]:
+        """Return ``(severity rank, column)`` so HIGH sorts first, then by column name."""
+        return (_SEVERITY_ORDER[self.severity], self.column)
+def compare_profiles(old: DatasetProfile, new: DatasetProfile) -> list[Finding]:
+    findings: list[Finding] = []
+    old_cols = set(old.columns)
+    new_cols = set(new.columns)
+    for name in sorted(new_cols - old_cols):
+        findings.append(Finding(name, FindingKind.COLUMN_ADDED, Severity.HIGH, "new column"))
+    for name in sorted(old_cols - new_cols):
+        findings.append(Finding(name, FindingKind.COLUMN_REMOVED, Severity.HIGH, "column removed"))
+    for name in sorted(old_cols & new_cols):
+        findings.extend(_compare_column(old.columns[name], new.columns[name]))
+    findings.sort(key=lambda f: f.sort_key)
+    return findings
+def _compare_column(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
+    if old.kind != new.kind:
+        return [
+            Finding(
+                old.name,
+                FindingKind.TYPE_CHANGED,
+                Severity.HIGH,
+                f"{old.kind} -> {new.kind}",
+            )
+        ]
+    findings: list[Finding] = []
+    findings.extend(_null_rate_finding(old, new))
+    findings.extend(_cardinality_finding(old, new))
+    findings.extend(_drift_finding(old, new))
+    return findings
+def _null_rate_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
+    delta = abs(new.null_rate - old.null_rate)
+    if delta >= _NULL_RATE_HIGH:
+        severity = Severity.HIGH
+    elif delta >= _NULL_RATE_MEDIUM:
+        severity = Severity.MEDIUM
+    else:
+        return []
+    return [
+        Finding(
+            old.name,
+            FindingKind.NULL_RATE,
+            severity,
+            f"null rate {old.null_rate:.1%} -> {new.null_rate:.1%}",
+        )
+    ]
+def _cardinality_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
+    if old.kind == NUMERIC or old.n_unique == 0:
+        return []
+    ratio = new.n_unique / old.n_unique
+    if ratio >= 2.0 or ratio <= 0.5:
+        return [
+            Finding(
+                old.name,
+                FindingKind.CARDINALITY,
+                Severity.MEDIUM,
+                f"distinct values {old.n_unique} -> {new.n_unique}",
+            )
+        ]
+    return []
+def _drift_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
+    if old.kind == NUMERIC and old.numeric and new.numeric:
+        psi = psi_from_counts(old.numeric.counts, new.numeric.counts)
+    else:
+        psi = psi_from_frequencies(category_frequencies(old), category_frequencies(new))
+    severity = severity_for_psi(psi)
+    if severity is Severity.LOW:
+        return []
+    return [
+        Finding(
+            old.name,
+            FindingKind.DRIFT,
+            severity,
+            f"PSI {psi:.3f}",
+            psi=psi,
+        )
+    ]
+def has_blocking(findings: list[Finding], threshold: Severity = Severity.HIGH) -> bool:
+    limit = _SEVERITY_ORDER[threshold]
+    return any(_SEVERITY_ORDER[f.severity] <= limit for f in findings)
+def _load_baseline(path: Path) -> tuple[DatasetProfile, dict[str, tuple[float, ...]]]:
+    if path.suffix.lower() == ".json":
+        profile = DatasetProfile.from_dict(json.loads(path.read_text(encoding="utf-8")))
+    else:
+        profile = profile_file(path)
+    edges = {
+        name: col.numeric.edges for name, col in profile.columns.items() if col.numeric is not None
+    }
+    return profile, edges
+def compare_files(baseline: str | Path, candidate: str | Path) -> list[Finding]:
+    """Profile both inputs and return findings, highest severity first.
+    The candidate is binned with the baseline's numeric bin edges before the
+    two profiles are compared.
+    """
+    old_profile, edges = _load_baseline(Path(baseline))
+    new_profile = profile_file(candidate, edges=edges)
+    return compare_profiles(old_profile, new_profile)

dsdiff/dataset.py ADDED Viewed

@@ -0,0 +1,174 @@
+"""Read tabular files and turn them into dataset profiles.
+This is the only module that knows about polars and file formats. It adapts
+columns into the pure profiling functions in :mod:`dsdiff.profile`, optionally
+reusing a baseline's bin edges so a new dataset is binned the same way (which
+is what makes the population stability index comparable).
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import numpy as np
+import polars as pl
+from dsdiff.profile import (
+    BOOLEAN,
+    CATEGORICAL,
+    DATETIME,
+    NUMERIC,
+    OTHER,
+    ColumnProfile,
+    NumericSummary,
+    bin_counts,
+    profile_categorical,
+    profile_numeric,
+)
+@dataclass(frozen=True, slots=True)
+class DatasetProfile:
+    row_count: int
+    columns: dict[str, ColumnProfile]
+    def to_dict(self) -> dict:
+        return {
+            "row_count": self.row_count,
+            "columns": {name: _column_to_dict(p) for name, p in self.columns.items()},
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> DatasetProfile:
+        columns = {
+            name: _column_from_dict(name, payload)
+            for name, payload in data.get("columns", {}).items()
+        }
+        return cls(row_count=int(data.get("row_count", 0)), columns=columns)
+def read_frame(path: str | Path) -> pl.DataFrame:
+    path = Path(path)
+    suffix = path.suffix.lower()
+    if suffix == ".csv":
+        return pl.read_csv(path)
+    if suffix in {".parquet", ".pq"}:
+        return pl.read_parquet(path)
+    if suffix in {".jsonl", ".ndjson"}:
+        return pl.read_ndjson(path)
+    if suffix == ".json":
+        return pl.read_json(path)
+    raise ValueError(f"unsupported file type: {path.suffix or '(none)'}")
+def _kind_of(dtype: pl.DataType) -> str:
+    """Map a polars dtype onto one of the profile kind constants.
+    The checks run in order: numeric, boolean, string/categorical, then
+    date/time types; anything else is reported as ``OTHER``.
+    """
+    if dtype.is_numeric():
+        return NUMERIC
+    if dtype == pl.Boolean:
+        return BOOLEAN
+    if dtype in (pl.Utf8, pl.Categorical) or dtype == pl.String:
+        return CATEGORICAL
+    # Datetime is matched with isinstance rather than equality.
+    # NOTE(review): presumably because Datetime dtypes carry unit/zone parameters - confirm.
+    if dtype in (pl.Date, pl.Time) or isinstance(dtype, pl.Datetime):
+        return DATETIME
+    return OTHER
+def profile_frame(
+    frame: pl.DataFrame,
+    *,
+    edges: dict[str, tuple[float, ...]] | None = None,
+) -> DatasetProfile:
+    edges = edges or {}
+    columns: dict[str, ColumnProfile] = {}
+    for name in frame.columns:
+        series = frame.get_column(name)
+        kind = _kind_of(series.dtype)
+        if kind == NUMERIC:
+            columns[name] = _profile_numeric_series(name, series, edges.get(name))
+        elif kind == BOOLEAN:
+            values = ["true" if v else "false" if v is not None else None for v in series.to_list()]
+            columns[name] = profile_categorical(name, values, kind=BOOLEAN)
+        else:
+            values = [None if v is None else str(v) for v in series.to_list()]
+            columns[name] = profile_categorical(name, values, kind=kind)
+    return DatasetProfile(row_count=frame.height, columns=columns)
+def _profile_numeric_series(
+    name: str, series: pl.Series, baseline_edges: tuple[float, ...] | None
+) -> ColumnProfile:
+    arr = series.cast(pl.Float64, strict=False).to_numpy()
+    profile = profile_numeric(name, arr)
+    if baseline_edges is None or profile.numeric is None:
+        return profile
+    # Re-bin against the baseline edges so PSI is comparable.
+    edges = np.asarray(baseline_edges, dtype=float)
+    counts = bin_counts(arr, edges)
+    summary = profile.numeric
+    rebinned = NumericSummary(
+        minimum=summary.minimum,
+        maximum=summary.maximum,
+        mean=summary.mean,
+        std=summary.std,
+        edges=tuple(float(e) for e in edges),
+        counts=tuple(int(c) for c in counts),
+    )
+    return ColumnProfile(
+        name=profile.name,
+        kind=profile.kind,
+        count=profile.count,
+        null_count=profile.null_count,
+        n_unique=profile.n_unique,
+        numeric=rebinned,
+    )
+def profile_file(
+    path: str | Path, *, edges: dict[str, tuple[float, ...]] | None = None
+) -> DatasetProfile:
+    """Read the file at ``path`` and profile it, passing ``edges`` on to the profiler."""
+    return profile_frame(read_frame(path), edges=edges)
+def _column_to_dict(profile: ColumnProfile) -> dict:
+    out: dict = {
+        "kind": profile.kind,
+        "count": profile.count,
+        "null_count": profile.null_count,
+        "n_unique": profile.n_unique,
+    }
+    if profile.numeric is not None:
+        n = profile.numeric
+        out["numeric"] = {
+            "minimum": n.minimum,
+            "maximum": n.maximum,
+            "mean": n.mean,
+            "std": n.std,
+            "edges": list(n.edges),
+            "counts": list(n.counts),
+        }
+    if profile.top_categories:
+        out["top_categories"] = [list(item) for item in profile.top_categories]
+    return out
+def _column_from_dict(name: str, payload: dict) -> ColumnProfile:
+    numeric = None
+    if "numeric" in payload:
+        n = payload["numeric"]
+        numeric = NumericSummary(
+            minimum=float(n["minimum"]),
+            maximum=float(n["maximum"]),
+            mean=float(n["mean"]),
+            std=float(n["std"]),
+            edges=tuple(float(e) for e in n["edges"]),
+            counts=tuple(int(c) for c in n["counts"]),
+        )
+    top = tuple((str(v), int(c)) for v, c in payload.get("top_categories", []))
+    return ColumnProfile(
+        name=name,
+        kind=str(payload["kind"]),
+        count=int(payload["count"]),
+        null_count=int(payload["null_count"]),
+        n_unique=int(payload["n_unique"]),
+        numeric=numeric,
+        top_categories=top,
+    )

dsdiff/drift.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Distribution-drift measures.
+The population stability index (PSI) is the workhorse: a single number that
+summarizes how far a new distribution has moved from a baseline. It is cheap,
+interpretable, and the de-facto standard for tabular monitoring. All functions
+are pure and operate on counts, so they are exhaustively unit-tested.
+"""
+from __future__ import annotations
+from enum import Enum
+import numpy as np
+# Conventional PSI thresholds used across the industry.
+PSI_MEDIUM = 0.1
+PSI_HIGH = 0.25
+# Floor applied to normalized bin fractions so the log ratio in PSI stays finite.
+_EPSILON = 1e-6
+class Severity(str, Enum):
+    """Severity levels attached to findings."""
+    HIGH = "high"
+    MEDIUM = "medium"
+    LOW = "low"
+def _normalize(counts: np.ndarray) -> np.ndarray:
+    total = counts.sum()
+    if total <= 0:
+        # Uniform fallback keeps PSI finite when a side is empty.
+        return np.full(counts.shape, 1.0 / counts.size)
+    fractions = counts / total
+    return np.clip(fractions, _EPSILON, None)
+def psi_from_counts(expected: np.ndarray, actual: np.ndarray) -> float:
+    """Population stability index between two binned distributions."""
+    expected = np.asarray(expected, dtype=float)
+    actual = np.asarray(actual, dtype=float)
+    if expected.shape != actual.shape:
+        raise ValueError("expected and actual must have the same number of bins")
+    e = _normalize(expected)
+    a = _normalize(actual)
+    return float(np.sum((a - e) * np.log(a / e)))
+def psi_from_frequencies(expected: dict[str, float], actual: dict[str, float]) -> float:
+    """PSI over categorical frequency maps, aligned on the union of keys."""
+    keys = sorted(set(expected) | set(actual))
+    e = np.array([expected.get(k, 0.0) for k in keys], dtype=float)
+    a = np.array([actual.get(k, 0.0) for k in keys], dtype=float)
+    return psi_from_counts(e, a)
+def severity_for_psi(psi: float) -> Severity:
+    if psi >= PSI_HIGH:
+        return Severity.HIGH
+    if psi >= PSI_MEDIUM:
+        return Severity.MEDIUM
+    return Severity.LOW

dsdiff/profile.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""Column-level profiles computed from raw values.
+Everything here works on plain Python sequences and NumPy arrays so it can be
+unit-tested without reading a file or constructing a dataframe. The dataframe
+layer in :mod:`dsdiff.dataset` adapts polars columns into these calls.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass, field
+import numpy as np
+# Canonical dtype groups, so CSV/Parquet/JSON all map onto the same vocabulary.
+NUMERIC = "numeric"
+CATEGORICAL = "categorical"
+BOOLEAN = "boolean"
+DATETIME = "datetime"
+OTHER = "other"
+# Default number of quantile bins requested for numeric columns.
+DEFAULT_BINS = 20
+@dataclass(frozen=True, slots=True)
+class NumericSummary:
+    """Summary statistics and a binned histogram for one numeric column."""
+    minimum: float
+    maximum: float
+    mean: float
+    std: float
+    edges: tuple[float, ...]  # bin edges, one more entry than ``counts``
+    counts: tuple[int, ...]  # number of values per bin
+@dataclass(frozen=True, slots=True)
+class ColumnProfile:
+    """Profile of one column: its kind, counts and distribution summary."""
+    name: str
+    kind: str  # one of the kind constants defined in this module
+    count: int  # number of non-null values
+    null_count: int  # number of null values (non-finite values for numeric columns)
+    n_unique: int  # number of distinct non-null values
+    numeric: NumericSummary | None = None
+    top_categories: tuple[tuple[str, int], ...] = field(default_factory=tuple)
+    @property
+    def null_rate(self) -> float:
+        """Return nulls as a fraction of all values, or 0.0 for an empty column."""
+        total = self.count + self.null_count
+        return self.null_count / total if total else 0.0
+def quantile_edges(values: np.ndarray, bins: int = DEFAULT_BINS) -> np.ndarray:
+    """Return strictly increasing bin edges taken from the quantiles of ``values``.
+    Quantile binning keeps roughly equal mass per bin in the baseline, which
+    is what the population stability index expects. Degenerate columns (few
+    distinct values) collapse to as many edges as can be made unique.
+    Non-finite values are ignored. With no finite values the edges are
+    ``[0.0, 1.0]``; with a single distinct value ``v`` they start from
+    ``[v, v + 1.0]`` before the outer edges are nudged outwards.
+    """
+    finite = values[np.isfinite(values)]
+    if finite.size == 0:
+        return np.array([0.0, 1.0])
+    qs = np.linspace(0.0, 1.0, bins + 1)
+    edges = np.quantile(finite, qs)
+    # Tied quantiles would produce zero-width bins, so keep each edge only once.
+    edges = np.unique(edges)
+    if edges.size < 2:
+        lo = float(edges[0])
+        edges = np.array([lo, lo + 1.0])
+    # Nudge the outer edges so min and max values fall inside the range.
+    edges = edges.astype(float)
+    edges[0] = np.nextafter(edges[0], -np.inf)
+    edges[-1] = np.nextafter(edges[-1], np.inf)
+    return edges
+def bin_counts(values: np.ndarray, edges: np.ndarray) -> np.ndarray:
+    """Count finite values per bucket, clamping out-of-range values to the
+    edge bins.
+    Clamping matters for drift: if a new dataset shifts entirely past the
+    baseline's range, the moved mass must land in the outer bins rather than
+    being dropped, otherwise a large shift would read as no change.
+    """
+    finite = values[np.isfinite(values)]
+    if finite.size == 0:
+        return np.zeros(len(edges) - 1, dtype=int)
+    clamped = np.clip(finite, edges[0], edges[-1])
+    counts, _ = np.histogram(clamped, bins=edges)
+    return counts.astype(int)
+def profile_numeric(name: str, values: np.ndarray, *, bins: int = DEFAULT_BINS) -> ColumnProfile:
+    arr = np.asarray(values, dtype=float)
+    null_count = int(np.count_nonzero(~np.isfinite(arr)))
+    finite = arr[np.isfinite(arr)]
+    count = int(finite.size)
+    if count == 0:
+        summary = NumericSummary(0.0, 0.0, 0.0, 0.0, (0.0, 1.0), (0,))
+        return ColumnProfile(name, NUMERIC, 0, null_count, 0, numeric=summary)
+    edges = quantile_edges(finite, bins=bins)
+    counts = bin_counts(finite, edges)
+    summary = NumericSummary(
+        minimum=float(finite.min()),
+        maximum=float(finite.max()),
+        mean=float(finite.mean()),
+        std=float(finite.std(ddof=0)),
+        edges=tuple(float(e) for e in edges),
+        counts=tuple(int(c) for c in counts),
+    )
+    return ColumnProfile(
+        name=name,
+        kind=NUMERIC,
+        count=count,
+        null_count=null_count,
+        n_unique=int(np.unique(finite).size),
+        numeric=summary,
+    )
+def profile_categorical(
+    name: str,
+    values: list[str | None],
+    *,
+    kind: str = CATEGORICAL,
+    top_k: int = 20,
+) -> ColumnProfile:
+    null_count = sum(1 for v in values if v is None)
+    present = [str(v) for v in values if v is not None]
+    counts: dict[str, int] = {}
+    for value in present:
+        counts[value] = counts.get(value, 0) + 1
+    ordered = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
+    return ColumnProfile(
+        name=name,
+        kind=kind,
+        count=len(present),
+        null_count=null_count,
+        n_unique=len(counts),
+        top_categories=tuple(ordered[:top_k]),
+    )
+def category_frequencies(profile: ColumnProfile) -> dict[str, float]:
+    total = sum(c for _, c in profile.top_categories)
+    if total == 0:
+        return {}
+    return {value: c / total for value, c in profile.top_categories}
+def is_finite_number(value: object) -> bool:
+    return isinstance(value, (int, float)) and math.isfinite(value)

dsdiff/render.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""Render findings for the terminal, as Markdown, and as JSON."""
+from __future__ import annotations
+from rich.console import Group
+from rich.table import Table
+from rich.text import Text
+from dsdiff.compare import Finding
+from dsdiff.drift import Severity
+# Rich style used for the severity cell of each row in the terminal table.
+_STYLE = {Severity.HIGH: "bold red", Severity.MEDIUM: "yellow", Severity.LOW: "dim"}
+def findings_to_json(findings: list[Finding]) -> list[dict]:
+    return [
+        {
+            "column": f.column,
+            "kind": f.kind.value,
+            "severity": f.severity.value,
+            "detail": f.detail,
+            "psi": f.psi,
+        }
+        for f in findings
+    ]
+def render_terminal(findings: list[Finding]) -> Group:
+    if not findings:
+        return Group(Text("no differences", style="green"))
+    table = Table(box=None, pad_edge=False)
+    table.add_column("severity")
+    table.add_column("column", style="cyan")
+    table.add_column("change")
+    table.add_column("detail")
+    for f in findings:
+        table.add_row(
+            Text(f.severity.value, style=_STYLE[f.severity]),
+            f.column,
+            f.kind.value,
+            f.detail,
+        )
+    return Group(table)
+def render_markdown(findings: list[Finding]) -> str:
+    if not findings:
+        return "No differences found."
+    lines = [
+        "| severity | column | change | detail |",
+        "| --- | --- | --- | --- |",
+    ]
+    for f in findings:
+        lines.append(f"| {f.severity.value} | {f.column} | {f.kind.value} | {f.detail} |")
+    return "\n".join(lines)

dsdiff-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,138 @@
+Metadata-Version: 2.4
+Name: dsdiff
+Version: 0.2.0
+Summary: Diff two dataset files: schema changes plus column-level distribution drift.
+Project-URL: Homepage, https://github.com/jmweb-org/dsdiff
+Project-URL: Repository, https://github.com/jmweb-org/dsdiff
+Project-URL: Issues, https://github.com/jmweb-org/dsdiff/issues
+Author: José del Río
+License: MIT License
+        Copyright (c) 2026 José del Río
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+License-File: LICENSE
+Keywords: cli,data,dataset,diff,drift,mlops,psi,schema
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Requires-Dist: numpy>=1.24
+Requires-Dist: polars>=1.0
+Requires-Dist: rich>=13.0
+Requires-Dist: typer>=0.12
+Description-Content-Type: text/markdown
+# dsdiff
+[![CI](https://github.com/jmweb-org/dsdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jmweb-org/dsdiff/actions/workflows/ci.yml)
+[![PyPI](https://img.shields.io/pypi/v/dsdiff.svg)](https://pypi.org/project/dsdiff/)
+[![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
+A git-style diff between two dataset files: schema changes and column-level
+distribution drift, with a CI gate.
+When a dataset is regenerated, columns quietly get renamed, retyped, gain
+nulls, or shift distribution, and the pipeline keeps running while the model
+degrades. There is no quick "git diff for data" a reviewer can read on a pull
+request. `dsdiff` is that: point it at two files and it reports what changed,
+ranked by how much it should worry you.
+```console
+$ dsdiff diff yesterday.parquet today.parquet
+severity  column      change         detail
+high      income      drift          PSI 0.412
+high      signup_date column_added   new column
+medium    age          null_rate      null rate 0.0% -> 7.3%
+low       country      cardinality    distinct values 41 -> 44
+```
+## Install
+```console
+$ pip install dsdiff                 # from PyPI
+$ pip install git+https://github.com/jmweb-org/dsdiff   # latest, available now
+```
+Reads CSV, Parquet, JSON and JSON Lines through polars (a `.json` baseline is
+read as a saved profile). No services, no schema files to author.
+## Usage
+```console
+$ dsdiff diff a.csv b.csv            # human-readable table
+$ dsdiff diff a.csv b.csv --json     # machine-readable findings
+$ dsdiff diff a.csv b.csv --markdown # a table to paste into a PR
+$ dsdiff diff a.csv b.csv --check    # exit non-zero on a high-severity change
+```
+### Commit a baseline
+Profile a dataset once and compare future data against the saved profile,
+without re-reading the original file:
+```console
+$ dsdiff profile reference.parquet -o baseline.json
+$ dsdiff diff baseline.json new_batch.parquet --check
+```
+The baseline stores the bin edges, so drift on `new_batch` is measured against
+exactly the same buckets as the reference.
+### In CI
+```yaml
+- run: dsdiff diff baseline.json data/current.parquet --check
+```
+## What it checks
+- **Schema**: columns added, removed, or retyped (all high severity).
+- **Nulls**: a jump in the null rate of a shared column.
+- **Cardinality**: a categorical column gaining or losing distinct values.
+- **Distribution drift**: the population stability index (PSI) per column,
+  numeric columns binned by quantiles and categoricals by frequency.
+## Severity and the PSI scale
+PSI is the standard tabular drift measure. The conventional reading is used
+here: below 0.1 is **low** (no meaningful shift), 0.1 to 0.25 is **medium**,
+and 0.25 or above is **high**. Schema and type changes are always high. By
+default `--check` fails only on high-severity findings; pass `--fail-on medium`
+to tighten the gate.
+## Exit codes
+| Code | Meaning |
+| --- | --- |
+| 0 | Ran; no blocking finding (or `--check` not set) |
+| 1 | `--check` found a finding at or above the fail threshold |
+| 2 | A file was missing or in an unsupported format |
+## License
+MIT. See [LICENSE](LICENSE).

dsdiff-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+dsdiff/__init__.py,sha256=ILXacDdPwBYK0U9lAff7oD7QzWi8MENxkwSY8Ftznek,477
+dsdiff/__main__.py,sha256=RreqBxAXlznJ5-x5tuI-d6ibQoPCkBzdzdehbJkOaew,79
+dsdiff/cli.py,sha256=-9zgiHApOEBRA7fPoIYSexS9feUISVZmXfDJ1M3U-Q8,3346
+dsdiff/compare.py,sha256=CVTqb5WVa4ku-ZCJiCnR-mcBl5XbH1djS2IAynvYXvg,5003
+dsdiff/dataset.py,sha256=a3JrD7Sizb-0_XC3SZ5ULwXsxg1Pjbh5jzc2baqa_YE,5482
+dsdiff/drift.py,sha256=nk2Q-Fe8Ypxbu8hARiUipi8MygZp200xEhTfaeMjp88,1938
+dsdiff/profile.py,sha256=B8jZdaPL6E6YURRVDfbzhxCL8N_McUUPeHCKPscEPkQ,4716
+dsdiff/render.py,sha256=oUXpikXn2AfIlgDMmZY5SCNGRkSIPEk2iTbQZpCPGNQ,1539
+dsdiff-0.2.0.dist-info/METADATA,sha256=Pf7HHgJi0DNa069XKvIR4oJS_NnOEo23Ousiq0przWA,5507
+dsdiff-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+dsdiff-0.2.0.dist-info/entry_points.txt,sha256=10ThOMIXktqFDbf7C59Pn82zZwzRmatSO_LCdLonapw,49
+dsdiff-0.2.0.dist-info/licenses/LICENSE,sha256=N4nJy_wSxYwULjDvuE2GupQWZSSwgOOU_HJSzuxHBsI,1071
+dsdiff-0.2.0.dist-info/RECORD,,

dsdiff-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

dsdiff-0.2.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+dsdiff = dsdiff.cli:entrypoint

dsdiff-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 José del Río
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.