diffmonkey 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffmonkey/compare.py ADDED
@@ -0,0 +1,253 @@
1
+ """The ``compare()`` entry point: end-to-end structural diff of two datasets.
2
+
3
+ This module exists to orchestrate the pipeline — column mapping, key indexing
4
+ (:mod:`diffmonkey.matching`), per-column type-aware comparison
5
+ (:mod:`diffmonkey.comparators`) and bucketing into a :class:`DiffResult`
6
+ (:mod:`diffmonkey.models`). It holds no comparison logic of its own; it decides
7
+ *what* to compare and *how to categorise* the outcome.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Iterable, Mapping, Sequence
13
+
14
+ from .comparators import CompareSettings, make_comparator
15
+ from .matching import index_rows, make_key, validate_policies
16
+ from .models import DiffResult, DiffSummary, FieldChange, RowDiff
17
+
18
+ Row = Mapping[str, Any]
19
+
20
+
21
+ def _coerce_key(key: str | Sequence[str]) -> tuple[str, ...]:
22
+ if isinstance(key, str):
23
+ return (key,)
24
+ key_t = tuple(key)
25
+ if not key_t:
26
+ raise ValueError("`key` must name at least one column")
27
+ if not all(isinstance(k, str) for k in key_t):
28
+ raise TypeError("`key` columns must be strings")
29
+ return key_t
30
+
31
+
32
+ def _apply_column_map(
33
+ rows: Iterable[Row], column_map: Mapping[str, str] | None
34
+ ) -> list[dict[str, Any]]:
35
+ """Return rows as dicts, renaming old->new column names where mapped.
36
+
37
+ A mapping ``{"old_name": "new_name"}`` is applied to *old* rows so both
38
+ sides share the new namespace. If both ``old_name`` and ``new_name`` are
39
+ present in a row, the explicitly-new value wins (the rename target is not
40
+ clobbered).
41
+ """
42
+ if not column_map:
43
+ return [dict(r) for r in rows]
44
+ out: list[dict[str, Any]] = []
45
+ for r in rows:
46
+ rd = dict(r)
47
+ for old_name, new_name in column_map.items():
48
+ if old_name in rd:
49
+ value = rd.pop(old_name)
50
+ rd.setdefault(new_name, value)
51
+ out.append(rd)
52
+ return out
53
+
54
+
55
+ def _resolve_columns(
56
+ old_rows: list[dict[str, Any]],
57
+ new_rows: list[dict[str, Any]],
58
+ key_columns: tuple[str, ...],
59
+ columns: Sequence[str] | None,
60
+ ignore: Sequence[str] | None,
61
+ ) -> tuple[str, ...]:
62
+ """Decide which columns to compare.
63
+
64
+ Default is the union of all columns seen on either side, in first-seen
65
+ order, minus key columns and minus the ignore list. An explicit ``columns``
66
+ list overrides the union (still minus key and ignore).
67
+ """
68
+ ignore_set = set(ignore or ())
69
+ key_set = set(key_columns)
70
+ if columns is not None:
71
+ ordered = list(columns)
72
+ else:
73
+ ordered = []
74
+ seen: set[str] = set()
75
+ for row in (*old_rows, *new_rows):
76
+ for col in row:
77
+ if col not in seen:
78
+ seen.add(col)
79
+ ordered.append(col)
80
+ return tuple(c for c in ordered if c not in key_set and c not in ignore_set)
81
+
82
+
83
def compare(
    old: Iterable[Row],
    new: Iterable[Row],
    *,
    key: str | Sequence[str],
    columns: Sequence[str] | None = None,
    ignore: Sequence[str] | None = None,
    column_map: Mapping[str, str] | None = None,
    rel_tol: float = 1e-9,
    abs_tol: float = 0.0,
    normalize_whitespace: bool = True,
    null_equivalent: bool = True,
    type_aware: bool = True,
    date_aware: bool = True,
    locale: str | None = None,
    null_values: Iterable[str] | None = None,
    include_unchanged: bool = False,
    on_duplicate: str = "warn",
    on_missing_key: str = "warn",
) -> DiffResult:
    """Diff two tabular datasets by key and bucket the outcome.

    Rows from ``old`` and ``new`` are paired on ``key``; each pair is compared
    column by column and filed as changed or unchanged, while unpaired rows
    are filed as added (only in ``new``) or removed (only in ``old``).

    Args:
        old: Baseline rows (an iterable of mappings). Consumed exactly once.
        new: Current rows to compare against the baseline. Consumed once,
            after ``old``.
        key: Identity column name, or a sequence of names for a composite
            key. Names refer to the *new* namespace (after ``column_map``).
        columns: Restrict comparison to these columns. Default: every non-key
            column seen on either side.
        ignore: Columns to leave out of the comparison (timestamps, audit
            fields, row numbers). Applied after ``columns``.
        column_map: ``{old_name: new_name}`` renames applied to ``old`` rows
            so a renamed column is not reported as removed+added.
        rel_tol: Relative floating-point tolerance for numeric columns
            (``math.isclose``). Default ``1e-9``.
        abs_tol: Absolute floating-point tolerance for numeric columns.
            Default ``0.0`` — raise it to compare values near zero.
        normalize_whitespace: Collapse/strip whitespace and drop invisible
            characters before comparing strings (via ``cleanmonkey``).
            Default True.
        null_equivalent: Treat every null spelling (``None``, ``""``,
            whitespace-only, ``"na"``, …) as the same value, so two different
            nulls are equal and null↔value is a change. Default True.
        type_aware: Infer each column's type and compare accordingly (numbers
            by value, dates by calendar date, booleans by truth). When False
            every column is compared as a normalised string. Default True.
        date_aware: Within type-aware mode, compare date columns by parsed
            date rather than as strings. Default True.
        locale: Number/date locale hint (``"us"``/``"eu"``). Default None
            (auto-detect).
        null_values: Override the null vocabulary (an iterable of spellings).
            Default None (use typemonkey's default set).
        include_unchanged: Retain unchanged rows in
            :attr:`DiffResult.unchanged`. Default False (they are counted in
            the summary but not kept).
        on_duplicate: Duplicate-key policy: ``"warn"`` (keep first + warn),
            ``"first"``, ``"last"``, or ``"error"``. Any other value raises
            :class:`ValueError`.
        on_missing_key: Missing-key policy: ``"warn"`` (keep + warn),
            ``"skip"``, or ``"error"``. Any other value raises
            :class:`ValueError`.

    Returns:
        A :class:`DiffResult` holding the added/removed/changed/unchanged
        buckets, the accumulated warnings and a :class:`DiffSummary`.

    Raises:
        ValueError: if ``key`` is empty, or ``on_duplicate`` /
            ``on_missing_key`` is not one of its documented values.
        TypeError: if a ``key`` column name is not a string.
    """
    key_columns = _coerce_key(key)
    # Configuration is checked before `old`/`new` are iterated: they may be
    # expensive, side-effecting or endless generators, and a misspelled policy
    # has to surface as ValueError without pulling a single row from them.
    validate_policies(on_duplicate, on_missing_key)
    settings = CompareSettings(
        rel_tol=rel_tol,
        abs_tol=abs_tol,
        normalize_whitespace=normalize_whitespace,
        null_equivalent=null_equivalent,
        type_aware=type_aware,
        date_aware=date_aware,
        locale=locale,
        null_values=None if null_values is None else frozenset(null_values),
    )

    # Materialise both sides; only the old side goes through the rename map.
    old_rows = _apply_column_map(old, column_map)
    new_rows = [dict(row) for row in new]
    compared = _resolve_columns(old_rows, new_rows, key_columns, columns, ignore)

    # Both sides are indexed under the same policies and share one warning list.
    warnings: list[str] = []
    policy = {
        "on_duplicate": on_duplicate,
        "on_missing_key": on_missing_key,
        "warnings": warnings,
    }
    old_index, total_old, dup_old = index_rows(
        old_rows, key_columns, side="old", **policy
    )
    new_index, total_new, dup_new = index_rows(
        new_rows, key_columns, side="new", **policy
    )

    # One comparator per compared column, built from the indexed values of
    # both sides.
    comparators = {
        col: make_comparator(
            col,
            [row.get(col) for row in old_index.values()],
            [row.get(col) for row in new_index.values()],
            settings,
        )
        for col in compared
    }

    added: list[RowDiff] = []
    changed: list[RowDiff] = []
    unchanged: list[RowDiff] = []

    # Walk the new side in its own order: a key without a partner is an
    # addition, anything else is compared field by field.
    for row_key, new_row in new_index.items():
        old_row = old_index.get(row_key)
        if old_row is None:
            added.append(RowDiff(key=row_key, new=new_row))
            continue
        diffs: list[FieldChange] = []
        for col in compared:
            before, after = old_row.get(col), new_row.get(col)
            if comparators[col].equal(before, after):
                continue
            diffs.append(FieldChange(column=col, old=before, new=after))
        if diffs:
            changed.append(
                RowDiff(key=row_key, old=old_row, new=new_row, changes=tuple(diffs))
            )
        elif include_unchanged:
            unchanged.append(RowDiff(key=row_key, old=old_row, new=new_row))

    # Whatever the new side no longer has was removed; old-dataset order.
    removed: list[RowDiff] = [
        RowDiff(key=row_key, old=old_row)
        for row_key, old_row in old_index.items()
        if row_key not in new_index
    ]

    # Unchanged rows are derived from the counts because the `unchanged`
    # bucket stays empty unless `include_unchanged` is set.
    matched = len(old_index.keys() & new_index.keys())
    summary = DiffSummary(
        total_old=total_old,
        total_new=total_new,
        matched=matched,
        added=len(added),
        removed=len(removed),
        changed=len(changed),
        unchanged=matched - len(changed),
        key_columns=key_columns,
        compared_columns=compared,
        duplicate_keys=dup_old + dup_new,
    )

    return DiffResult(
        added=added,
        removed=removed,
        changed=changed,
        unchanged=unchanged,
        summary=summary,
        key_columns=key_columns,
        compared_columns=compared,
        warnings=warnings,
    )
@@ -0,0 +1,13 @@
1
+ """Report renderers for a :class:`diffmonkey.models.DiffResult`.
2
+
3
+ Each submodule exports a single ``render(result, ...) -> str`` function. They
4
+ live behind :class:`DiffResult`'s ``to_markdown`` / ``to_html`` / ``to_csv``
5
+ methods so callers rarely import them directly, but they are importable for
6
+ golden-file testing and custom pipelines.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from . import csv_out, html, markdown
12
+
13
+ __all__ = ["markdown", "html", "csv_out"]
@@ -0,0 +1,55 @@
1
+ """CSV of changes.
2
+
3
+ This module exists to produce a flat, spreadsheet-friendly record of *every*
4
+ field-level change plus added/removed rows — one CSV row per change — so an
5
+ analyst can pivot/filter the diff. The schema is fixed:
6
+
7
+ ``change_type, key, column, old, new``
8
+
9
+ * ``change_type`` is ``changed`` / ``added`` / ``removed``.
10
+ * ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
11
+ composite keys).
12
+ * For ``added``/``removed`` rows there is no per-field breakdown, so one row is
13
+ emitted with ``column``, ``old`` and ``new`` all empty in both cases — the
14
+ presence of the row is the change.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import csv
20
+ import io
21
+ from typing import Any
22
+
23
+ from ..models import DiffResult, RowDiff
24
+
25
# Fixed column order of the CSV report; render() writes it as the first row.
HEADER = ["change_type", "key", "column", "old", "new"]
26
+
27
+
28
+ def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
29
+ return "; ".join(
30
+ f"{col}={'' if val is None else val}" for col, val in zip(key_columns, rd.key)
31
+ )
32
+
33
+
34
+ def _cell(value: Any) -> str:
35
+ """CSV cell text: ``None`` becomes empty string (csv has no null)."""
36
+ return "" if value is None else str(value)
37
+
38
+
39
def render(result: DiffResult) -> str:
    """Render ``result`` as CSV text (newline-terminated records, UTF-8 ready).

    The header row comes first, followed by one record per changed field,
    then one record per added row, then one per removed row. Added and
    removed records carry only the change type and the key.
    """
    kc = result.key_columns

    records: list[list[str]] = [HEADER]
    for rd in result.changed:
        label = _key_str(rd, kc)
        records.extend(
            ["changed", label, ch.column, _cell(ch.old), _cell(ch.new)]
            for ch in rd.changes
        )
    records.extend(["added", _key_str(rd, kc), "", "", ""] for rd in result.added)
    records.extend(["removed", _key_str(rd, kc), "", "", ""] for rd in result.removed)

    buf = io.StringIO()
    csv.writer(buf, lineterminator="\n").writerows(records)
    return buf.getvalue()
@@ -0,0 +1,93 @@
1
+ """Standalone HTML diff report.
2
+
3
+ This module exists to produce a single self-contained HTML document (inline
4
+ CSS, no external assets) suitable for emailing or attaching to a build. All
5
+ dynamic values are HTML-escaped via the stdlib ``html.escape`` so cell content
6
+ containing ``<``/``&``/quotes cannot break the markup or inject script.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from html import escape
12
+ from typing import Any
13
+
14
+ from ..models import DiffResult, RowDiff
15
+
16
# Inline stylesheet that render() embeds in the document's <style> element,
# so the report needs no external assets.
_STYLE = """\
body{font-family:-apple-system,Segoe UI,Roboto,sans-serif;margin:2rem;color:#1a1a1a}
h1{font-size:1.4rem}h2{font-size:1.1rem;border-bottom:1px solid #ddd;padding-bottom:.2rem}
.summary{font-weight:600;margin:.5rem 0 1rem}
table{border-collapse:collapse;width:100%;margin:.5rem 0}
th,td{border:1px solid #ddd;padding:.3rem .5rem;text-align:left;vertical-align:top}
th{background:#f4f4f4}
.old{background:#ffecec}.new{background:#eaffea}
.null{color:#999;font-style:italic}
.meta{color:#555;font-size:.9rem}
"""
27
+
28
+
29
+ def _cell(value: Any) -> str:
30
+ if value is None:
31
+ return '<span class="null">(null)</span>'
32
+ return escape(str(value))
33
+
34
+
35
+ def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
36
+ return escape(
37
+ ", ".join(
38
+ f"{col}={'' if val is None else val}"
39
+ for col, val in zip(key_columns, rd.key)
40
+ )
41
+ )
42
+
43
+
44
def render(result: DiffResult, *, title: str = "diffmonkey report") -> str:
    """Render ``result`` as a complete, self-contained HTML document string.

    The document contains, in order: the title and one-line summary, a meta
    line (key columns, compared columns, row counts and — when any were seen —
    the number of duplicate keys), then the Changed table, the Added and
    Removed key lists, and a Warnings list if there are warnings.

    The duplicate-key count is shown here because under the ``"first"`` /
    ``"last"`` duplicate policies no warning is recorded, so the meta line is
    the only place a reader of the HTML report can learn that duplicates
    were collapsed.

    Args:
        result: The diff to render.
        title: Text for both ``<title>`` and the ``<h1>`` heading.

    Returns:
        The HTML text, terminated by a single newline. Every dynamic string
        is passed through ``html.escape``.
    """
    s = result.summary
    key_columns = result.key_columns
    heading = escape(title)

    # Items of the meta line; joined with a middle dot below.
    meta = [
        "Key: " + escape(", ".join(key_columns)),
        "Compared: " + escape(", ".join(result.compared_columns) or "(none)"),
        f"{s.total_old} old &rarr; {s.total_new} new",
    ]
    if s.duplicate_keys:
        meta.append(f"Duplicate keys: {s.duplicate_keys}")

    parts: list[str] = [
        "<!DOCTYPE html>",
        '<html lang="en"><head><meta charset="utf-8">',
        f"<title>{heading}</title>",
        f"<style>{_STYLE}</style></head><body>",
        f"<h1>{heading}</h1>",
        f'<p class="summary">{escape(s.one_line())}</p>',
        '<p class="meta">' + " &middot; ".join(meta) + "</p>",
    ]

    parts.append(f"<h2>Changed ({s.changed})</h2>")
    if result.changed:
        parts.append("<table><tr><th>Key</th><th>Column</th><th>Old</th><th>New</th></tr>")
        for rd in result.changed:
            klabel = _key_label(rd, key_columns)
            for i, ch in enumerate(rd.changes):
                # Show the key only on the first row of each changed record.
                key_cell = klabel if i == 0 else ""
                parts.append(
                    f"<tr><td>{key_cell}</td><td>{escape(ch.column)}</td>"
                    f'<td class="old">{_cell(ch.old)}</td>'
                    f'<td class="new">{_cell(ch.new)}</td></tr>'
                )
        parts.append("</table>")

    parts.append(f"<h2>Added ({s.added})</h2>")
    if result.added:
        parts.append("<ul>")
        parts.extend(f"<li>{_key_label(rd, key_columns)}</li>" for rd in result.added)
        parts.append("</ul>")

    parts.append(f"<h2>Removed ({s.removed})</h2>")
    if result.removed:
        parts.append("<ul>")
        parts.extend(f"<li>{_key_label(rd, key_columns)}</li>" for rd in result.removed)
        parts.append("</ul>")

    if result.warnings:
        parts.append(f"<h2>Warnings ({len(result.warnings)})</h2><ul>")
        parts.extend(f"<li>{escape(w)}</li>" for w in result.warnings)
        parts.append("</ul>")

    parts.append("</body></html>")
    return "\n".join(parts) + "\n"
@@ -0,0 +1,94 @@
1
+ """Markdown diff report.
2
+
3
+ This module exists to turn a :class:`DiffResult` into a human-skimmable report
4
+ for PRs, emails and chat. The layout is deliberately stable (fixed section
5
+ order, fixed headline wording) so it can be pinned with golden-file tests and
6
+ downstream scripts can rely on its shape.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from ..models import DiffResult, RowDiff
14
+
15
+
16
+ def _fmt(value: Any) -> str:
17
+ """Render a cell value for display; ``None`` becomes ``(null)``."""
18
+ if value is None:
19
+ return "(null)"
20
+ return str(value)
21
+
22
+
23
+ def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
24
+ parts = [f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)]
25
+ return ", ".join(parts)
26
+
27
+
28
+ def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
29
+ """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``."""
30
+ if max_rows is None or len(rows) <= max_rows:
31
+ return rows, 0
32
+ return rows[:max_rows], len(rows) - max_rows
33
+
34
+
35
def render(result: DiffResult, *, max_rows: int | None = None) -> str:
    """Render ``result`` as a markdown report.

    Sections appear in a fixed order: headline, metadata bullets, Changed,
    Added, Removed and — only when there are any — Warnings.

    ``max_rows`` caps how many rows each of the Changed/Added/Removed
    sections lists; rows beyond the cap are summarised in a trailing
    "… and N more" line. ``None`` (the default) lists every row. Headline and
    section counts always reflect the full result, not the capped view.
    """
    s = result.summary
    keys = result.key_columns
    lines: list[str] = ["# diffmonkey report", "", f"**{s.one_line()}**", ""]

    # Metadata bullets.
    key_label = ", ".join([f"`{c}`" for c in keys])
    cols_label = ", ".join([f"`{c}`" for c in result.compared_columns])
    if not cols_label:
        cols_label = "_(none)_"
    lines += [
        f"- Key: {key_label}",
        f"- Compared columns: {cols_label}",
        f"- Rows: {s.total_old} old → {s.total_new} new",
    ]
    if s.duplicate_keys:
        lines.append(f"- Duplicate keys: {s.duplicate_keys}")
    lines.append("")

    # Changed: one sub-heading per row, one bullet per differing field.
    lines += [f"## Changed ({s.changed})", ""]
    visible, omitted = _limit(result.changed, max_rows)
    for rd in visible:
        lines.append(f"### {_key_label(rd, keys)}")
        lines.extend(
            f"- **{ch.column}**: `{_fmt(ch.old)}` → `{_fmt(ch.new)}`"
            for ch in rd.changes
        )
        lines.append("")
    if omitted:
        lines += [f"_… and {omitted} more changed row(s)._", ""]

    # Added: keys only.
    lines += [f"## Added ({s.added})", ""]
    visible, omitted = _limit(result.added, max_rows)
    lines.extend(f"- {_key_label(rd, keys)}" for rd in visible)
    if omitted:
        lines.append(f"_… and {omitted} more added row(s)._")
    lines.append("")

    # Removed: keys only.
    lines += [f"## Removed ({s.removed})", ""]
    visible, omitted = _limit(result.removed, max_rows)
    lines.extend(f"- {_key_label(rd, keys)}" for rd in visible)
    if omitted:
        lines.append(f"_… and {omitted} more removed row(s)._")
    lines.append("")

    if result.warnings:
        lines += [f"## Warnings ({len(result.warnings)})", ""]
        lines.extend(f"- {w}" for w in result.warnings)
        lines.append("")

    # Collapse trailing blank lines to exactly one final newline.
    return "\n".join(lines).rstrip("\n") + "\n"
diffmonkey/matching.py ADDED
@@ -0,0 +1,141 @@
1
+ """Key-based row matching.
2
+
3
+ This module exists to turn two row lists into a keyed lookup so ``compare()``
4
+ can find the partner (if any) of each row. It owns the messy parts of matching:
5
+ composite keys, duplicate key values, and rows missing a key column. Each of
6
+ those has a configurable policy because there is no single right answer — see
7
+ ``on_duplicate`` / ``on_missing_key`` in :func:`compare`.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Iterable, Mapping
13
+
14
+ from .models import DuplicateKeyError, MissingKeyError
15
+
16
+ _MISSING = object()
17
+
18
# Accepted values for the ``on_duplicate`` policy; enforced by validate_policies().
DUPLICATE_POLICIES = frozenset({"warn", "first", "last", "error"})
# Accepted values for the ``on_missing_key`` policy; enforced by validate_policies().
MISSING_KEY_POLICIES = frozenset({"warn", "skip", "error"})
20
+
21
+
22
def validate_policies(on_duplicate: str, on_missing_key: str) -> None:
    """Reject unknown duplicate/missing-key policy names with :class:`ValueError`.

    Both :func:`index_rows` and :func:`diffmonkey.compare.compare` go through
    this check, so a misspelled policy produces the same message from either
    entry point and the accepted set is defined in one place. Call it
    *before* consuming any input rows so configuration errors fail eagerly.
    ``on_duplicate`` is checked first.
    """
    if on_duplicate not in DUPLICATE_POLICIES:
        allowed = sorted(DUPLICATE_POLICIES)
        raise ValueError(
            f"on_duplicate must be one of {allowed}, got {on_duplicate!r}"
        )
    if on_missing_key not in MISSING_KEY_POLICIES:
        allowed = sorted(MISSING_KEY_POLICIES)
        raise ValueError(
            f"on_missing_key must be one of {allowed}, got {on_missing_key!r}"
        )
40
+
41
+
42
def make_key(row: Mapping[str, Any], key_columns: tuple[str, ...]) -> tuple[Any, ...]:
    """Build the key tuple for ``row``, one component per key column.

    A column the row does not have contributes ``None``, so an absent key
    column and an explicit ``None`` produce the same key; telling those rows
    apart is left to the missing-key policy.
    """
    return tuple(map(row.get, key_columns))
49
+
50
+
51
+ def _has_missing_component(
52
+ row: Mapping[str, Any], key_columns: tuple[str, ...]
53
+ ) -> bool:
54
+ for col in key_columns:
55
+ if row.get(col, _MISSING) in (_MISSING, None):
56
+ return True
57
+ return False
58
+
59
+
60
def index_rows(
    rows: Iterable[Mapping[str, Any]],
    key_columns: tuple[str, ...],
    *,
    side: str,
    on_duplicate: str = "warn",
    on_missing_key: str = "warn",
    warnings: list[str] | None = None,
) -> tuple[dict[tuple[Any, ...], dict[str, Any]], int, int]:
    """Build a key -> row lookup from ``rows``.

    Args:
        rows: Rows to index; each one is copied into a plain ``dict``.
        key_columns: Columns whose values form the key tuple.
        side: Label (``"old"``/``"new"``) used as a prefix in warning and
            error messages.
        on_duplicate: What to do when a key repeats. ``"warn"`` keeps the
            first row and records a warning per repeat; ``"first"`` and
            ``"last"`` silently keep that occurrence; ``"error"`` raises.
        on_missing_key: What to do when a key column is absent or ``None``.
            ``"warn"`` keeps the row (the component is ``None``) and records
            a warning; ``"skip"`` drops the row and records that it was
            skipped; ``"error"`` raises.
        warnings: List that messages are appended to. When ``None`` the
            messages go to a throwaway list.

    Returns:
        ``(index, total_rows, duplicate_keys)``: ``index`` maps each key
        tuple to a single row dict in first-seen key order, ``total_rows`` is
        the number of rows that ended up in the index, and ``duplicate_keys``
        counts the *distinct* keys that occurred more than once.

    Raises:
        ValueError: if a policy name is not recognised (checked before any
            row is read).
        MissingKeyError: for a row with a missing key under ``"error"``.
        DuplicateKeyError: for a repeated key under ``"error"``.
    """
    # Fail on a misspelled policy before touching any row: silently falling
    # back to a default would change which data gets compared.
    validate_policies(on_duplicate, on_missing_key)

    sink = [] if warnings is None else warnings
    index: dict[tuple[Any, ...], dict[str, Any]] = {}
    repeated: set[tuple[Any, ...]] = set()  # keys seen at least twice

    for position, row in enumerate(rows):
        record = dict(row)

        if _has_missing_component(record, key_columns):
            if on_missing_key == "error":
                raise MissingKeyError(
                    f"{side} row {position} is missing key column(s) "
                    f"{key_columns}: {record!r}"
                )
            if on_missing_key == "skip":
                sink.append(f"{side}: row {position} skipped (missing key column)")
                continue
            # "warn": the row stays, keyed with None for the missing part.
            sink.append(
                f"{side}: row {position} has a missing/None key component "
                f"(key={make_key(record, key_columns)!r})"
            )

        key = make_key(record, key_columns)
        if key not in index:
            # First sighting of this key.
            index[key] = record
            continue

        # Repeat of a key that is already indexed; a set counts it only once.
        repeated.add(key)
        if on_duplicate == "error":
            raise DuplicateKeyError(
                f"{side}: duplicate key {key!r} (first at an earlier row, "
                f"again at row {position})"
            )
        if on_duplicate == "warn":
            sink.append(
                f"{side}: duplicate key {key!r} at row {position}; "
                f"keeping first occurrence"
            )
        elif on_duplicate == "last":
            index[key] = record  # replaces the row in place; size unchanged
        # "first": nothing to do, the indexed row stays.

    # Every indexed key was counted exactly once on insertion, so the index
    # size is the row total.
    return index, len(index), len(repeated)