dsdiff 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dsdiff/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """dsdiff: diff two dataset files by schema and column-level drift."""
2
+
3
+ from dsdiff.compare import Finding, FindingKind, compare_files, compare_profiles
4
+ from dsdiff.dataset import DatasetProfile, profile_file
5
+ from dsdiff.drift import Severity, psi_from_counts
6
+
7
+ __version__ = "0.2.0"
8
+
9
+ __all__ = [
10
+ "DatasetProfile",
11
+ "Finding",
12
+ "FindingKind",
13
+ "Severity",
14
+ "__version__",
15
+ "compare_files",
16
+ "compare_profiles",
17
+ "profile_file",
18
+ "psi_from_counts",
19
+ ]
dsdiff/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from dsdiff.cli import entrypoint
2
+
3
+ if __name__ == "__main__":
4
+ entrypoint()
dsdiff/cli.py ADDED
@@ -0,0 +1,114 @@
1
+ """Command-line interface for dsdiff."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from enum import Enum
8
+ from pathlib import Path
9
+
10
+ import typer
11
+ from rich.console import Console
12
+
13
+ from dsdiff import __version__
14
+ from dsdiff.compare import compare_files, has_blocking
15
+ from dsdiff.dataset import profile_file
16
+ from dsdiff.drift import Severity
17
+ from dsdiff.render import findings_to_json, render_markdown, render_terminal
18
+
19
+ app = typer.Typer(
20
+ add_completion=False,
21
+ no_args_is_help=True,
22
+ help="Diff two dataset files: schema changes plus column-level drift.",
23
+ )
24
+ _out = Console()
25
+ _err = Console(stderr=True)
26
+
27
+ EXIT_OK = 0
28
+ EXIT_BLOCKING = 1
29
+ EXIT_BAD_INPUT = 2
30
+
31
+
32
class FailOn(str, Enum):
    """Values accepted by the ``--fail-on`` option of ``dsdiff diff``."""

    # Member names double as the literal strings typed on the command line.
    high = "high"
    medium = "medium"
35
+
36
+
37
def _version_callback(value: bool) -> None:
    """Print the package version and stop when ``--version`` was passed.

    Does nothing when *value* is false, so normal command handling continues.
    """
    if not value:
        return
    _out.print(f"dsdiff {__version__}")
    raise typer.Exit()
41
+
42
+
43
@app.callback()
def main(
    _version: bool = typer.Option(
        False,
        "--version",
        help="Show the version and exit.",
        # Eager so the version callback runs before the other parameters.
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """dsdiff command-line interface."""
54
+
55
+
56
@app.command("diff")
def diff(
    baseline: Path = typer.Argument(..., help="Baseline dataset, or a saved profile JSON."),
    candidate: Path = typer.Argument(..., help="Dataset to compare against the baseline."),
    as_json: bool = typer.Option(False, "--json", help="Emit findings as JSON."),
    markdown: bool = typer.Option(False, "--markdown", help="Emit a Markdown table."),
    check: bool = typer.Option(False, "--check", help="Exit non-zero on blocking findings."),
    fail_on: FailOn = typer.Option(
        FailOn.high, "--fail-on", help="Severity that --check treats as blocking."
    ),
) -> None:
    """Compare two datasets and report schema and distribution changes."""

    try:
        findings = compare_files(baseline, candidate)
    except (OSError, ValueError, json.JSONDecodeError) as exc:
        # The message usually embeds a file path; print it verbatim instead of
        # letting Rich interpret "[...]" in it as console markup.
        _err.print(f"dsdiff: {exc}", markup=False)
        raise typer.Exit(EXIT_BAD_INPUT) from exc

    # When both --json and --markdown are given, JSON wins.
    if as_json:
        _out.print_json(json.dumps(findings_to_json(findings)))
    elif markdown:
        # The Markdown table is meant to be pasted or piped, so it must reach
        # stdout byte-for-byte: no markup/emoji parsing of column names, no
        # highlighting, and no word-wrapping of long table rows.
        _out.print(
            render_markdown(findings),
            markup=False,
            emoji=False,
            highlight=False,
            soft_wrap=True,
        )
    else:
        _out.print(render_terminal(findings))

    threshold = Severity.HIGH if fail_on is FailOn.high else Severity.MEDIUM
    if check and has_blocking(findings, threshold):
        raise typer.Exit(EXIT_BLOCKING)
85
+
86
+
87
@app.command("profile")
def profile(
    dataset: Path = typer.Argument(..., help="Dataset to profile."),
    output: Path | None = typer.Option(
        None, "-o", "--output", help="Write the profile JSON here (default: stdout)."
    ),
) -> None:
    """Write a committable baseline profile of a dataset."""

    try:
        prof = profile_file(dataset)
    except (OSError, ValueError) as exc:
        _err.print(f"dsdiff: {exc}", markup=False)
        raise typer.Exit(EXIT_BAD_INPUT) from exc

    payload = json.dumps(prof.to_dict(), indent=2)
    if output is None:
        _out.print_json(payload)
        return

    # Writing can fail too (missing directory, permissions). Report it the
    # same way as an unreadable input instead of dumping a traceback.
    try:
        output.write_text(payload + "\n", encoding="utf-8")
    except OSError as exc:
        _err.print(f"dsdiff: {exc}", markup=False)
        raise typer.Exit(EXIT_BAD_INPUT) from exc
    # Status goes to stderr so stdout stays free for data.
    _err.print(f"dsdiff: wrote {output}", markup=False)
107
+
108
+
109
def entrypoint() -> None:
    """Run the typer app; on ``KeyboardInterrupt`` exit with status 130."""
    try:
        app()
    except KeyboardInterrupt:  # pragma: no cover - interactive only
        sys.stderr.write("dsdiff: interrupted\n")
        raise SystemExit(130) from None
dsdiff/compare.py ADDED
@@ -0,0 +1,161 @@
1
+ """Turn two dataset profiles into an ordered list of findings.
2
+
3
+ A finding is one human-readable difference with a severity. The high-level
4
+ :func:`compare_files` ties it together: it profiles the baseline, bins the new
5
+ dataset against the baseline's edges so drift is comparable, and runs the
6
+ comparison. The baseline may be a data file or a previously saved profile JSON.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+ from pathlib import Path
15
+
16
+ from dsdiff.dataset import DatasetProfile, profile_file
17
+ from dsdiff.drift import (
18
+ Severity,
19
+ psi_from_counts,
20
+ psi_from_frequencies,
21
+ severity_for_psi,
22
+ )
23
+ from dsdiff.profile import NUMERIC, ColumnProfile, category_frequencies
24
+
25
+ _NULL_RATE_HIGH = 0.2
26
+ _NULL_RATE_MEDIUM = 0.05
27
+ _SEVERITY_ORDER = {Severity.HIGH: 0, Severity.MEDIUM: 1, Severity.LOW: 2}
28
+
29
+
30
class FindingKind(str, Enum):
    """Category of a reported difference; the value is the serialized name."""

    # Schema-level changes.
    COLUMN_ADDED = "column_added"
    COLUMN_REMOVED = "column_removed"
    TYPE_CHANGED = "type_changed"
    # Changes within a column present on both sides.
    NULL_RATE = "null_rate"
    CARDINALITY = "cardinality"
    DRIFT = "drift"
37
+
38
+
39
+ @dataclass(frozen=True, slots=True)
40
+ class Finding:
41
+ column: str
42
+ kind: FindingKind
43
+ severity: Severity
44
+ detail: str
45
+ psi: float | None = None
46
+
47
+ @property
48
+ def sort_key(self) -> tuple[int, str]:
49
+ return (_SEVERITY_ORDER[self.severity], self.column)
50
+
51
+
52
def compare_profiles(old: DatasetProfile, new: DatasetProfile) -> list[Finding]:
    """Compare two dataset profiles and return the findings in sort-key order.

    Columns present on only one side become high-severity added/removed
    findings; columns present on both sides are compared individually.
    """
    before = set(old.columns)
    after = set(new.columns)

    findings = [
        Finding(name, FindingKind.COLUMN_ADDED, Severity.HIGH, "new column")
        for name in sorted(after - before)
    ]
    findings += [
        Finding(name, FindingKind.COLUMN_REMOVED, Severity.HIGH, "column removed")
        for name in sorted(before - after)
    ]
    for name in sorted(before & after):
        findings += _compare_column(old.columns[name], new.columns[name])

    # sorted() is stable, so findings with equal keys keep the order above.
    return sorted(findings, key=lambda finding: finding.sort_key)
67
+
68
+
69
def _compare_column(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
    """Return the findings for one column that exists in both profiles.

    A change of kind is reported on its own; the null-rate, cardinality and
    drift checks only run when both sides have the same kind.
    """
    if old.kind == new.kind:
        return [
            *_null_rate_finding(old, new),
            *_cardinality_finding(old, new),
            *_drift_finding(old, new),
        ]
    retyped = Finding(
        old.name,
        FindingKind.TYPE_CHANGED,
        Severity.HIGH,
        f"{old.kind} -> {new.kind}",
    )
    return [retyped]
85
+
86
+
87
def _null_rate_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
    """Report a change in null rate, in either direction, once it is large enough.

    Returns an empty list when the absolute change is below the medium
    threshold, otherwise a single medium- or high-severity finding.
    """
    shift = abs(new.null_rate - old.null_rate)
    if not shift >= _NULL_RATE_MEDIUM:
        return []
    level = Severity.HIGH if shift >= _NULL_RATE_HIGH else Severity.MEDIUM
    detail = f"null rate {old.null_rate:.1%} -> {new.null_rate:.1%}"
    return [Finding(old.name, FindingKind.NULL_RATE, level, detail)]
103
+
104
+
105
def _cardinality_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
    """Report a non-numeric column whose distinct-value count doubled or halved.

    Numeric columns, and baselines with no distinct values (nothing to take a
    ratio against), never produce a finding.
    """
    if old.kind == NUMERIC or old.n_unique == 0:
        return []
    ratio = new.n_unique / old.n_unique
    if 0.5 < ratio < 2.0:
        return []
    detail = f"distinct values {old.n_unique} -> {new.n_unique}"
    return [Finding(old.name, FindingKind.CARDINALITY, Severity.MEDIUM, detail)]
119
+
120
+
121
def _drift_finding(old: ColumnProfile, new: ColumnProfile) -> list[Finding]:
    """Report distribution drift between two profiles of the same column.

    Numeric columns with a summary on both sides are compared by bin counts;
    every other column is compared by its top-category frequencies. Drift
    that rates as low severity is not reported.
    """
    binned = old.kind == NUMERIC and old.numeric and new.numeric
    if binned:
        psi = psi_from_counts(old.numeric.counts, new.numeric.counts)
    else:
        psi = psi_from_frequencies(category_frequencies(old), category_frequencies(new))

    level = severity_for_psi(psi)
    if level is Severity.LOW:
        return []
    return [Finding(old.name, FindingKind.DRIFT, level, f"PSI {psi:.3f}", psi=psi)]
138
+
139
+
140
def has_blocking(findings: list[Finding], threshold: Severity = Severity.HIGH) -> bool:
    """Return True if any finding is at least as severe as *threshold*."""
    # Lower rank means more severe, so "at or above" is "rank <= limit".
    limit = _SEVERITY_ORDER[threshold]
    for finding in findings:
        if _SEVERITY_ORDER[finding.severity] <= limit:
            return True
    return False
143
+
144
+
145
def _load_baseline(path: Path) -> tuple[DatasetProfile, dict[str, tuple[float, ...]]]:
    """Load the baseline profile and the bin edges of its numeric columns.

    A ``.json`` path is read as a saved profile; any other path is profiled
    as a data file.

    Returns:
        The baseline profile and a mapping of column name to bin edges for
        every column that has a numeric summary.

    Raises:
        ValueError: if a ``.json`` baseline is not valid JSON or does not
            have the structure of a saved profile.
        OSError: if the file cannot be read.
    """
    if path.suffix.lower() == ".json":
        data = json.loads(path.read_text(encoding="utf-8"))
        # Structural problems would otherwise surface as AttributeError,
        # KeyError or TypeError from the parsing code; normalize them to
        # ValueError so callers need to handle only one "bad input" type.
        if not isinstance(data, dict):
            raise ValueError(f"{path}: not a dsdiff profile (expected a JSON object)")
        try:
            profile = DatasetProfile.from_dict(data)
        except (AttributeError, KeyError, TypeError) as exc:
            raise ValueError(f"{path}: not a valid dsdiff profile ({exc!r})") from exc
    else:
        profile = profile_file(path)
    edges = {
        name: col.numeric.edges for name, col in profile.columns.items() if col.numeric is not None
    }
    return profile, edges
154
+
155
+
156
def compare_files(baseline: str | Path, candidate: str | Path) -> list[Finding]:
    """Profile both inputs and return findings, highest severity first."""

    reference, reference_edges = _load_baseline(Path(baseline))
    # The candidate is binned with the baseline's edges.
    current = profile_file(candidate, edges=reference_edges)
    return compare_profiles(reference, current)
dsdiff/dataset.py ADDED
@@ -0,0 +1,174 @@
1
+ """Read tabular files and turn them into dataset profiles.
2
+
3
+ This is the only module that knows about polars and file formats. It adapts
4
+ columns into the pure profiling functions in :mod:`dsdiff.profile`, optionally
5
+ reusing a baseline's bin edges so a new dataset is binned the same way (which
6
+ is what makes the population stability index comparable).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import polars as pl
16
+
17
+ from dsdiff.profile import (
18
+ BOOLEAN,
19
+ CATEGORICAL,
20
+ DATETIME,
21
+ NUMERIC,
22
+ OTHER,
23
+ ColumnProfile,
24
+ NumericSummary,
25
+ bin_counts,
26
+ profile_categorical,
27
+ profile_numeric,
28
+ )
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class DatasetProfile:
33
+ row_count: int
34
+ columns: dict[str, ColumnProfile]
35
+
36
+ def to_dict(self) -> dict:
37
+ return {
38
+ "row_count": self.row_count,
39
+ "columns": {name: _column_to_dict(p) for name, p in self.columns.items()},
40
+ }
41
+
42
+ @classmethod
43
+ def from_dict(cls, data: dict) -> DatasetProfile:
44
+ columns = {
45
+ name: _column_from_dict(name, payload)
46
+ for name, payload in data.get("columns", {}).items()
47
+ }
48
+ return cls(row_count=int(data.get("row_count", 0)), columns=columns)
49
+
50
+
51
+ def read_frame(path: str | Path) -> pl.DataFrame:
52
+ path = Path(path)
53
+ suffix = path.suffix.lower()
54
+ if suffix == ".csv":
55
+ return pl.read_csv(path)
56
+ if suffix in {".parquet", ".pq"}:
57
+ return pl.read_parquet(path)
58
+ if suffix in {".jsonl", ".ndjson"}:
59
+ return pl.read_ndjson(path)
60
+ if suffix == ".json":
61
+ return pl.read_json(path)
62
+ raise ValueError(f"unsupported file type: {path.suffix or '(none)'}")
63
+
64
+
65
def _kind_of(dtype: pl.DataType) -> str:
    """Map a polars dtype onto one of the kind constants from dsdiff.profile."""
    if dtype.is_numeric():
        return NUMERIC
    if dtype == pl.Boolean:
        return BOOLEAN
    is_text = dtype in (pl.Utf8, pl.Categorical) or dtype == pl.String
    if is_text:
        return CATEGORICAL
    is_temporal = dtype in (pl.Date, pl.Time) or isinstance(dtype, pl.Datetime)
    return DATETIME if is_temporal else OTHER
75
+
76
+
77
def profile_frame(
    frame: pl.DataFrame,
    *,
    edges: dict[str, tuple[float, ...]] | None = None,
) -> DatasetProfile:
    """Profile every column of *frame*.

    Args:
        frame: The data to profile.
        edges: Optional bin edges per column name; a numeric column with an
            entry here is binned against those edges.
    """
    baseline_edges = edges or {}
    profiled: dict[str, ColumnProfile] = {}
    for name in frame.columns:
        series = frame.get_column(name)
        kind = _kind_of(series.dtype)
        if kind == NUMERIC:
            profiled[name] = _profile_numeric_series(name, series, baseline_edges.get(name))
            continue
        raw = series.to_list()
        if kind == BOOLEAN:
            # Booleans are profiled as the strings "true"/"false".
            values = [None if v is None else ("true" if v else "false") for v in raw]
        else:
            values = [None if v is None else str(v) for v in raw]
        profiled[name] = profile_categorical(name, values, kind=kind)
    return DatasetProfile(row_count=frame.height, columns=profiled)
96
+
97
+
98
def _profile_numeric_series(
    name: str, series: pl.Series, baseline_edges: tuple[float, ...] | None
) -> ColumnProfile:
    """Profile a numeric series, optionally re-binned on baseline edges.

    Without *baseline_edges* the profile keeps the edges that
    ``profile_numeric`` derived from the series itself.
    """
    values = series.cast(pl.Float64, strict=False).to_numpy()
    own = profile_numeric(name, values)
    summary = own.numeric
    if summary is None or baseline_edges is None:
        return own

    # Re-bin against the baseline edges so PSI is comparable.
    shared_edges = np.asarray(baseline_edges, dtype=float)
    shared_counts = bin_counts(values, shared_edges)
    return ColumnProfile(
        name=own.name,
        kind=own.kind,
        count=own.count,
        null_count=own.null_count,
        n_unique=own.n_unique,
        numeric=NumericSummary(
            minimum=summary.minimum,
            maximum=summary.maximum,
            mean=summary.mean,
            std=summary.std,
            edges=tuple(map(float, shared_edges)),
            counts=tuple(map(int, shared_counts)),
        ),
    )
125
+
126
+
127
def profile_file(
    path: str | Path, *, edges: dict[str, tuple[float, ...]] | None = None
) -> DatasetProfile:
    """Read the file at *path* and profile it.

    Args:
        path: File to read; the reader is chosen by ``read_frame``.
        edges: Optional bin edges per column name, forwarded to
            ``profile_frame``.
    """
    return profile_frame(read_frame(path), edges=edges)
129
+
130
+
131
def _column_to_dict(profile: ColumnProfile) -> dict:
    """Return a plain-dict form of one column profile.

    The ``numeric`` and ``top_categories`` keys are only written when the
    profile has a numeric summary or at least one category, respectively.
    """
    out: dict = dict(
        kind=profile.kind,
        count=profile.count,
        null_count=profile.null_count,
        n_unique=profile.n_unique,
    )
    summary = profile.numeric
    if summary is not None:
        out["numeric"] = dict(
            minimum=summary.minimum,
            maximum=summary.maximum,
            mean=summary.mean,
            std=summary.std,
            edges=list(summary.edges),
            counts=list(summary.counts),
        )
    if profile.top_categories:
        out["top_categories"] = [list(pair) for pair in profile.top_categories]
    return out
151
+
152
+
153
def _column_from_dict(name: str, payload: dict) -> ColumnProfile:
    """Rebuild one column profile from the mapping written by ``_column_to_dict``.

    ``kind``, ``count``, ``null_count`` and ``n_unique`` are required keys;
    ``numeric`` and ``top_categories`` are optional.
    """
    summary = None
    if "numeric" in payload:
        raw = payload["numeric"]
        scalars = {key: float(raw[key]) for key in ("minimum", "maximum", "mean", "std")}
        summary = NumericSummary(
            **scalars,
            edges=tuple(map(float, raw["edges"])),
            counts=tuple(map(int, raw["counts"])),
        )
    categories = tuple(
        (str(label), int(seen)) for label, seen in payload.get("top_categories", [])
    )
    return ColumnProfile(
        name=name,
        kind=str(payload["kind"]),
        count=int(payload["count"]),
        null_count=int(payload["null_count"]),
        n_unique=int(payload["n_unique"]),
        numeric=summary,
        top_categories=categories,
    )
dsdiff/drift.py ADDED
@@ -0,0 +1,63 @@
1
+ """Distribution-drift measures.
2
+
3
+ The population stability index (PSI) is the workhorse: a single number that
4
+ summarizes how far a new distribution has moved from a baseline. It is cheap,
5
+ interpretable, and the de-facto standard for tabular monitoring. All functions
6
+ are pure and operate on counts, so they are easy to unit-test exhaustively.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from enum import Enum
12
+
13
+ import numpy as np
14
+
15
+ # Conventional PSI thresholds used across the industry.
16
+ PSI_MEDIUM = 0.1
17
+ PSI_HIGH = 0.25
18
+
19
+ _EPSILON = 1e-6
20
+
21
+
22
class Severity(str, Enum):
    """Severity level of a finding; the value is the serialized name."""

    # Declared from most to least severe.
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
26
+
27
+
28
def _normalize(counts: np.ndarray) -> np.ndarray:
    """Turn bin counts into fractions suitable for the PSI formula.

    Fractions are floored at ``_EPSILON`` so the logarithm in the PSI stays
    finite for empty bins. A side with no mass at all falls back to a uniform
    distribution, and an input with no bins is returned as an empty array.
    """
    if counts.size == 0:
        # No bins at all (for example two all-null categorical columns).
        # The uniform fallback below would divide by zero here.
        return counts.astype(float)
    total = counts.sum()
    if total <= 0:
        # Uniform fallback keeps PSI finite when a side is empty.
        return np.full(counts.shape, 1.0 / counts.size)
    fractions = counts / total
    return np.clip(fractions, _EPSILON, None)


def psi_from_counts(expected: np.ndarray, actual: np.ndarray) -> float:
    """Population stability index between two binned distributions.

    Args:
        expected: Baseline count (or weight) per bin.
        actual: New count (or weight) per bin, over the same bins.

    Returns:
        The PSI as a float; ``0.0`` when there are no bins to compare.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """

    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if expected.shape != actual.shape:
        raise ValueError("expected and actual must have the same number of bins")
    e = _normalize(expected)
    a = _normalize(actual)
    return float(np.sum((a - e) * np.log(a / e)))
47
+
48
+
49
def psi_from_frequencies(expected: dict[str, float], actual: dict[str, float]) -> float:
    """PSI over categorical frequency maps, aligned on the union of keys."""

    # A key missing on one side contributes a frequency of 0.0 there.
    labels = sorted({*expected, *actual})
    baseline = np.array([expected.get(label, 0.0) for label in labels], dtype=float)
    current = np.array([actual.get(label, 0.0) for label in labels], dtype=float)
    return psi_from_counts(baseline, current)
56
+
57
+
58
def severity_for_psi(psi: float) -> Severity:
    """Classify a PSI value against the ``PSI_HIGH`` and ``PSI_MEDIUM`` thresholds."""
    # Checked from the strictest threshold down; the first one met wins.
    for floor, level in ((PSI_HIGH, Severity.HIGH), (PSI_MEDIUM, Severity.MEDIUM)):
        if psi >= floor:
            return level
    return Severity.LOW
dsdiff/profile.py ADDED
@@ -0,0 +1,151 @@
1
+ """Column-level profiles computed from raw values.
2
+
3
+ Everything here works on plain Python sequences and NumPy arrays so it can be
4
+ unit-tested without reading a file or constructing a dataframe. The dataframe
5
+ layer in :mod:`dsdiff.dataset` adapts polars columns into these calls.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ from dataclasses import dataclass, field
12
+
13
+ import numpy as np
14
+
15
+ # Canonical dtype groups, so CSV/Parquet/JSON all map onto the same vocabulary.
16
+ NUMERIC = "numeric"
17
+ CATEGORICAL = "categorical"
18
+ BOOLEAN = "boolean"
19
+ DATETIME = "datetime"
20
+ OTHER = "other"
21
+
22
+ DEFAULT_BINS = 20
23
+
24
+
25
+ @dataclass(frozen=True, slots=True)
26
+ class NumericSummary:
27
+ minimum: float
28
+ maximum: float
29
+ mean: float
30
+ std: float
31
+ edges: tuple[float, ...]
32
+ counts: tuple[int, ...]
33
+
34
+
35
+ @dataclass(frozen=True, slots=True)
36
+ class ColumnProfile:
37
+ name: str
38
+ kind: str
39
+ count: int
40
+ null_count: int
41
+ n_unique: int
42
+ numeric: NumericSummary | None = None
43
+ top_categories: tuple[tuple[str, int], ...] = field(default_factory=tuple)
44
+
45
+ @property
46
+ def null_rate(self) -> float:
47
+ total = self.count + self.null_count
48
+ return self.null_count / total if total else 0.0
49
+
50
+
51
def quantile_edges(values: np.ndarray, bins: int = DEFAULT_BINS) -> np.ndarray:
    """Return strictly increasing bin edges taken from quantiles of *values*.

    Non-finite values are ignored. Duplicate quantiles are merged, so a column
    with few distinct values yields fewer than ``bins + 1`` edges; a column
    with a single distinct value gets a unit-wide bin, and a column with no
    finite values gets the edges ``[0.0, 1.0]``.
    """

    usable = values[np.isfinite(values)]
    if usable.size == 0:
        return np.array([0.0, 1.0])

    levels = np.linspace(0.0, 1.0, bins + 1)
    edges = np.unique(np.quantile(usable, levels))
    if edges.size < 2:
        start = float(edges[0])
        edges = np.array([start, start + 1.0])

    # Nudge the outer edges so min and max values fall inside the range.
    edges = edges.astype(float)
    edges[0], edges[-1] = np.nextafter(edges[0], -np.inf), np.nextafter(edges[-1], np.inf)
    return edges
73
+
74
+
75
def bin_counts(values: np.ndarray, edges: np.ndarray) -> np.ndarray:
    """Histogram the finite entries of *values* over *edges*.

    Values outside ``[edges[0], edges[-1]]`` are clamped into the first or
    last bin rather than dropped, so a distribution that moves entirely past
    the baseline range still shows up as mass in an outer bin.
    """

    keep = np.isfinite(values)
    if not keep.any():
        return np.zeros(len(edges) - 1, dtype=int)
    bounded = np.clip(values[keep], edges[0], edges[-1])
    histogram = np.histogram(bounded, bins=edges)[0]
    return histogram.astype(int)
90
+
91
+
92
def profile_numeric(name: str, values: np.ndarray, *, bins: int = DEFAULT_BINS) -> ColumnProfile:
    """Profile a numeric column given as an array-like of floats.

    Non-finite entries (NaN and infinities) are counted as nulls; all
    statistics and the histogram are computed over the finite entries only.
    """
    arr = np.asarray(values, dtype=float)
    usable = arr[np.isfinite(arr)]
    present = int(usable.size)
    missing = int(arr.size) - present

    if present == 0:
        # Placeholder summary: one empty bin over [0, 1].
        placeholder = NumericSummary(0.0, 0.0, 0.0, 0.0, (0.0, 1.0), (0,))
        return ColumnProfile(name, NUMERIC, 0, missing, 0, numeric=placeholder)

    edges = quantile_edges(usable, bins=bins)
    counts = bin_counts(usable, edges)
    return ColumnProfile(
        name=name,
        kind=NUMERIC,
        count=present,
        null_count=missing,
        n_unique=int(np.unique(usable).size),
        numeric=NumericSummary(
            minimum=float(usable.min()),
            maximum=float(usable.max()),
            mean=float(usable.mean()),
            std=float(usable.std(ddof=0)),
            edges=tuple(map(float, edges)),
            counts=tuple(map(int, counts)),
        ),
    )
118
+
119
+
120
def profile_categorical(
    name: str,
    values: list[str | None],
    *,
    kind: str = CATEGORICAL,
    top_k: int = 20,
) -> ColumnProfile:
    """Profile a column of string-like values; ``None`` entries are nulls.

    Only the *top_k* most frequent values are kept, ordered by descending
    count and then by value.
    """
    tally: dict[str, int] = {}
    missing = 0
    for raw in values:
        if raw is None:
            missing += 1
            continue
        label = str(raw)
        tally[label] = tally.get(label, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return ColumnProfile(
        name=name,
        kind=kind,
        count=sum(tally.values()),
        null_count=missing,
        n_unique=len(tally),
        top_categories=tuple(ranked[:top_k]),
    )
141
+
142
+
143
def category_frequencies(profile: ColumnProfile) -> dict[str, float]:
    """Return each top category's share of the total top-category count.

    The shares are relative to the categories kept in the profile, not to the
    column's full row count. Returns an empty dict when there is nothing to
    divide by.
    """
    pairs = profile.top_categories
    total = sum(seen for _, seen in pairs)
    if not total:
        return {}
    return {label: seen / total for label, seen in pairs}
148
+
149
+
150
def is_finite_number(value: object) -> bool:
    """Return True if *value* is an ``int`` or a finite ``float``.

    ``bool`` counts as a number because it is a subclass of ``int``.
    """
    if isinstance(value, int):
        # Every int is finite. Answering here also avoids math.isfinite,
        # which raises OverflowError for ints too large to convert to float.
        return True
    return isinstance(value, float) and math.isfinite(value)
dsdiff/render.py ADDED
@@ -0,0 +1,55 @@
1
+ """Render findings for the terminal, as Markdown, and as JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from rich.console import Group
6
+ from rich.table import Table
7
+ from rich.text import Text
8
+
9
+ from dsdiff.compare import Finding
10
+ from dsdiff.drift import Severity
11
+
12
+ _STYLE = {Severity.HIGH: "bold red", Severity.MEDIUM: "yellow", Severity.LOW: "dim"}
13
+
14
+
15
def findings_to_json(findings: list[Finding]) -> list[dict]:
    """Convert findings into a list of plain dicts, one per finding."""
    rows: list[dict] = []
    for finding in findings:
        rows.append(
            dict(
                column=finding.column,
                kind=finding.kind.value,
                severity=finding.severity.value,
                detail=finding.detail,
                psi=finding.psi,
            )
        )
    return rows
26
+
27
+
28
def render_terminal(findings: list[Finding]) -> Group:
    """Build the Rich renderable shown by the default terminal output.

    Returns a group holding either a "no differences" line or a table with
    one row per finding.
    """
    if not findings:
        return Group(Text("no differences", style="green"))
    table = Table(box=None, pad_edge=False)
    table.add_column("severity")
    table.add_column("column", style="cyan")
    table.add_column("change")
    table.add_column("detail")
    for f in findings:
        table.add_row(
            Text(f.severity.value, style=_STYLE[f.severity]),
            # Column names and details come from the data. Wrapping them in
            # Text stops Rich from parsing "[...]" in them as console markup.
            Text(f.column),
            f.kind.value,
            Text(f.detail),
        )
    return Group(table)
44
+
45
+
46
def render_markdown(findings: list[Finding]) -> str:
    """Render findings as a Markdown table, one row per finding.

    Returns the sentence ``"No differences found."`` when *findings* is
    empty. Column names and details are escaped so they cannot break the
    table structure.
    """
    if not findings:
        return "No differences found."

    def cell(text: str) -> str:
        # A literal "|" would end the cell early and a newline would end the
        # row; both can appear in column names that come from the data.
        return text.replace("|", "\\|").replace("\r\n", " ").replace("\n", " ")

    lines = [
        "| severity | column | change | detail |",
        "| --- | --- | --- | --- |",
    ]
    lines.extend(
        f"| {f.severity.value} | {cell(f.column)} | {f.kind.value} | {cell(f.detail)} |"
        for f in findings
    )
    return "\n".join(lines)
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: dsdiff
3
+ Version: 0.2.0
4
+ Summary: Diff two dataset files: schema changes plus column-level distribution drift.
5
+ Project-URL: Homepage, https://github.com/jmweb-org/dsdiff
6
+ Project-URL: Repository, https://github.com/jmweb-org/dsdiff
7
+ Project-URL: Issues, https://github.com/jmweb-org/dsdiff/issues
8
+ Author: José del Río
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 José del Río
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: cli,data,dataset,diff,drift,mlops,psi,schema
32
+ Classifier: Development Status :: 4 - Beta
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Operating System :: OS Independent
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Topic :: Scientific/Engineering
41
+ Classifier: Topic :: Utilities
42
+ Requires-Python: >=3.10
43
+ Requires-Dist: numpy>=1.24
44
+ Requires-Dist: polars>=1.0
45
+ Requires-Dist: rich>=13.0
46
+ Requires-Dist: typer>=0.12
47
+ Description-Content-Type: text/markdown
48
+
49
+ # dsdiff
50
+
51
+ [![CI](https://github.com/jmweb-org/dsdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jmweb-org/dsdiff/actions/workflows/ci.yml)
52
+ [![PyPI](https://img.shields.io/pypi/v/dsdiff.svg)](https://pypi.org/project/dsdiff/)
53
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org)
54
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
55
+
56
+ A git-style diff between two dataset files: schema changes and column-level
57
+ distribution drift, with a CI gate.
58
+
59
+ When a dataset is regenerated, columns quietly get renamed, retyped, gain
60
+ nulls, or shift distribution, and the pipeline keeps running while the model
61
+ degrades. There is no quick "git diff for data" a reviewer can read on a pull
62
+ request. `dsdiff` is that: point it at two files and it reports what changed,
63
+ ranked by how much it should worry you.
64
+
65
+ ```console
66
+ $ dsdiff diff yesterday.parquet today.parquet
67
+ severity column change detail
68
+ high income drift PSI 0.412
69
+ high signup_date column_added new column
70
+ medium age null_rate null rate 0.0% -> 7.3%
71
+ medium country cardinality distinct values 41 -> 90
72
+ ```
73
+
74
+ ## Install
75
+
76
+ ```console
77
+ $ pip install dsdiff # from PyPI, once released
78
+ $ pip install git+https://github.com/jmweb-org/dsdiff # latest, available now
79
+ ```
80
+
81
+ Reads CSV, Parquet, JSON and JSON Lines through polars. No services, no schema files
82
+ to author.
83
+
84
+ ## Usage
85
+
86
+ ```console
87
+ $ dsdiff diff a.csv b.csv # human-readable table
88
+ $ dsdiff diff a.csv b.csv --json # machine-readable findings
89
+ $ dsdiff diff a.csv b.csv --markdown # a table to paste into a PR
90
+ $ dsdiff diff a.csv b.csv --check # exit non-zero on a high-severity change
91
+ ```
92
+
93
+ ### Commit a baseline
94
+
95
+ Profile a dataset once and compare future data against the saved profile,
96
+ without re-reading the original file:
97
+
98
+ ```console
99
+ $ dsdiff profile reference.parquet -o baseline.json
100
+ $ dsdiff diff baseline.json new_batch.parquet --check
101
+ ```
102
+
103
+ The baseline stores the bin edges, so drift on `new_batch` is measured against
104
+ exactly the same buckets as the reference.
105
+
106
+ ### In CI
107
+
108
+ ```yaml
109
+ - run: dsdiff diff baseline.json data/current.parquet --check
110
+ ```
111
+
112
+ ## What it checks
113
+
114
+ - **Schema**: columns added, removed, or retyped (all high severity).
115
+ - **Nulls**: a jump in the null rate of a shared column.
116
+ - **Cardinality**: a categorical column gaining or losing distinct values.
117
+ - **Distribution drift**: the population stability index (PSI) per column,
118
+ numeric columns binned by quantiles and categoricals by frequency.
119
+
120
+ ## Severity and the PSI scale
121
+
122
+ PSI is the standard tabular drift measure. The conventional reading is used
123
+ here: below 0.1 is **low** (no meaningful shift), 0.1 to 0.25 is **medium**,
124
+ and 0.25 or above is **high**. Schema and type changes are always high. By
125
+ default `--check` fails only on high-severity findings; pass `--fail-on medium`
126
+ to tighten the gate.
127
+
128
+ ## Exit codes
129
+
130
+ | Code | Meaning |
131
+ | --- | --- |
132
+ | 0 | Ran; no blocking finding (or `--check` not set) |
133
+ | 1 | `--check` found a finding at or above the fail threshold |
134
+ | 2 | A file was missing or in an unsupported format |
135
+
136
+ ## License
137
+
138
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,13 @@
1
+ dsdiff/__init__.py,sha256=ILXacDdPwBYK0U9lAff7oD7QzWi8MENxkwSY8Ftznek,477
2
+ dsdiff/__main__.py,sha256=RreqBxAXlznJ5-x5tuI-d6ibQoPCkBzdzdehbJkOaew,79
3
+ dsdiff/cli.py,sha256=-9zgiHApOEBRA7fPoIYSexS9feUISVZmXfDJ1M3U-Q8,3346
4
+ dsdiff/compare.py,sha256=CVTqb5WVa4ku-ZCJiCnR-mcBl5XbH1djS2IAynvYXvg,5003
5
+ dsdiff/dataset.py,sha256=a3JrD7Sizb-0_XC3SZ5ULwXsxg1Pjbh5jzc2baqa_YE,5482
6
+ dsdiff/drift.py,sha256=nk2Q-Fe8Ypxbu8hARiUipi8MygZp200xEhTfaeMjp88,1938
7
+ dsdiff/profile.py,sha256=B8jZdaPL6E6YURRVDfbzhxCL8N_McUUPeHCKPscEPkQ,4716
8
+ dsdiff/render.py,sha256=oUXpikXn2AfIlgDMmZY5SCNGRkSIPEk2iTbQZpCPGNQ,1539
9
+ dsdiff-0.2.0.dist-info/METADATA,sha256=Pf7HHgJi0DNa069XKvIR4oJS_NnOEo23Ousiq0przWA,5507
10
+ dsdiff-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ dsdiff-0.2.0.dist-info/entry_points.txt,sha256=10ThOMIXktqFDbf7C59Pn82zZwzRmatSO_LCdLonapw,49
12
+ dsdiff-0.2.0.dist-info/licenses/LICENSE,sha256=N4nJy_wSxYwULjDvuE2GupQWZSSwgOOU_HJSzuxHBsI,1071
13
+ dsdiff-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dsdiff = dsdiff.cli:entrypoint
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 José del Río
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.