diffmonkey 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffmonkey/__init__.py +49 -0
- diffmonkey/cli.py +168 -0
- diffmonkey/comparators.py +259 -0
- diffmonkey/compare.py +253 -0
- diffmonkey/formatters/__init__.py +13 -0
- diffmonkey/formatters/csv_out.py +55 -0
- diffmonkey/formatters/html.py +93 -0
- diffmonkey/formatters/markdown.py +94 -0
- diffmonkey/matching.py +141 -0
- diffmonkey/models.py +185 -0
- diffmonkey/readers.py +117 -0
- diffmonkey-1.0.0.dist-info/METADATA +153 -0
- diffmonkey-1.0.0.dist-info/RECORD +17 -0
- diffmonkey-1.0.0.dist-info/WHEEL +5 -0
- diffmonkey-1.0.0.dist-info/entry_points.txt +2 -0
- diffmonkey-1.0.0.dist-info/licenses/LICENSE +21 -0
- diffmonkey-1.0.0.dist-info/top_level.txt +1 -0
diffmonkey/compare.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""The ``compare()`` entry point: end-to-end structural diff of two datasets.
|
|
2
|
+
|
|
3
|
+
This module exists to orchestrate the pipeline — column mapping, key indexing
|
|
4
|
+
(:mod:`diffmonkey.matching`), per-column type-aware comparison
|
|
5
|
+
(:mod:`diffmonkey.comparators`) and bucketing into a :class:`DiffResult`
|
|
6
|
+
(:mod:`diffmonkey.models`). It holds no comparison logic of its own; it decides
|
|
7
|
+
*what* to compare and *how to categorise* the outcome.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any, Iterable, Mapping, Sequence
|
|
13
|
+
|
|
14
|
+
from .comparators import CompareSettings, make_comparator
|
|
15
|
+
from .matching import index_rows, make_key, validate_policies
|
|
16
|
+
from .models import DiffResult, DiffSummary, FieldChange, RowDiff
|
|
17
|
+
|
|
18
|
+
Row = Mapping[str, Any]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _coerce_key(key: str | Sequence[str]) -> tuple[str, ...]:
|
|
22
|
+
if isinstance(key, str):
|
|
23
|
+
return (key,)
|
|
24
|
+
key_t = tuple(key)
|
|
25
|
+
if not key_t:
|
|
26
|
+
raise ValueError("`key` must name at least one column")
|
|
27
|
+
if not all(isinstance(k, str) for k in key_t):
|
|
28
|
+
raise TypeError("`key` columns must be strings")
|
|
29
|
+
return key_t
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _apply_column_map(
|
|
33
|
+
rows: Iterable[Row], column_map: Mapping[str, str] | None
|
|
34
|
+
) -> list[dict[str, Any]]:
|
|
35
|
+
"""Return rows as dicts, renaming old->new column names where mapped.
|
|
36
|
+
|
|
37
|
+
A mapping ``{"old_name": "new_name"}`` is applied to *old* rows so both
|
|
38
|
+
sides share the new namespace. If both ``old_name`` and ``new_name`` are
|
|
39
|
+
present in a row, the explicitly-new value wins (the rename target is not
|
|
40
|
+
clobbered).
|
|
41
|
+
"""
|
|
42
|
+
if not column_map:
|
|
43
|
+
return [dict(r) for r in rows]
|
|
44
|
+
out: list[dict[str, Any]] = []
|
|
45
|
+
for r in rows:
|
|
46
|
+
rd = dict(r)
|
|
47
|
+
for old_name, new_name in column_map.items():
|
|
48
|
+
if old_name in rd:
|
|
49
|
+
value = rd.pop(old_name)
|
|
50
|
+
rd.setdefault(new_name, value)
|
|
51
|
+
out.append(rd)
|
|
52
|
+
return out
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _resolve_columns(
|
|
56
|
+
old_rows: list[dict[str, Any]],
|
|
57
|
+
new_rows: list[dict[str, Any]],
|
|
58
|
+
key_columns: tuple[str, ...],
|
|
59
|
+
columns: Sequence[str] | None,
|
|
60
|
+
ignore: Sequence[str] | None,
|
|
61
|
+
) -> tuple[str, ...]:
|
|
62
|
+
"""Decide which columns to compare.
|
|
63
|
+
|
|
64
|
+
Default is the union of all columns seen on either side, in first-seen
|
|
65
|
+
order, minus key columns and minus the ignore list. An explicit ``columns``
|
|
66
|
+
list overrides the union (still minus key and ignore).
|
|
67
|
+
"""
|
|
68
|
+
ignore_set = set(ignore or ())
|
|
69
|
+
key_set = set(key_columns)
|
|
70
|
+
if columns is not None:
|
|
71
|
+
ordered = list(columns)
|
|
72
|
+
else:
|
|
73
|
+
ordered = []
|
|
74
|
+
seen: set[str] = set()
|
|
75
|
+
for row in (*old_rows, *new_rows):
|
|
76
|
+
for col in row:
|
|
77
|
+
if col not in seen:
|
|
78
|
+
seen.add(col)
|
|
79
|
+
ordered.append(col)
|
|
80
|
+
return tuple(c for c in ordered if c not in key_set and c not in ignore_set)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def compare(
    old: Iterable[Row],
    new: Iterable[Row],
    *,
    key: str | Sequence[str],
    columns: Sequence[str] | None = None,
    ignore: Sequence[str] | None = None,
    column_map: Mapping[str, str] | None = None,
    rel_tol: float = 1e-9,
    abs_tol: float = 0.0,
    normalize_whitespace: bool = True,
    null_equivalent: bool = True,
    type_aware: bool = True,
    date_aware: bool = True,
    locale: str | None = None,
    null_values: Iterable[str] | None = None,
    include_unchanged: bool = False,
    on_duplicate: str = "warn",
    on_missing_key: str = "warn",
) -> DiffResult:
    """Diff two tabular datasets by key and bucket the outcome.

    Pipeline: normalise ``key`` and validate the policies, apply
    ``column_map`` to the old rows, pick the columns to compare, index both
    sides by key, build one comparator per compared column, then walk the new
    index (matched + added rows) and the old index (removed rows).

    Args:
        old: Baseline rows (an iterable of mappings / dicts).
        new: Current rows, compared against the baseline.
        key: Identity column name, or a sequence of names for a composite
            key. Names refer to the *new* namespace (after ``column_map``).
        columns: Restrict comparison to these columns. Default: every non-key
            column seen on either side.
        ignore: Columns to exclude from comparison (timestamps, audit fields,
            row numbers). Applied after ``columns``.
        column_map: ``{old_name: new_name}`` renames applied to ``old`` rows
            so renamed columns are not reported as removed+added.
        rel_tol: Relative floating-point tolerance for numeric columns
            (``math.isclose``). Default ``1e-9``.
        abs_tol: Absolute floating-point tolerance for numeric columns.
            Default ``0.0``; set it to compare values near zero.
        normalize_whitespace: Collapse/strip whitespace and drop invisible
            characters before comparing strings (via ``cleanmonkey``).
            Default True.
        null_equivalent: Treat all null spellings (``None``, ``""``,
            whitespace-only, ``"na"``, ...) as one value, so two different
            nulls are equal and null<->value is a change. Default True.
        type_aware: Infer each column's type and compare accordingly (numbers
            by value, dates by calendar date, booleans by truth). When False,
            every column is compared as a normalised string. Default True.
        date_aware: Within type-aware mode, compare date columns by parsed
            date rather than as strings. Default True.
        locale: Number/date locale hint (``"us"``/``"eu"``). Default None
            (auto-detect).
        null_values: Override the null vocabulary (an iterable of spellings).
            Default None (use typemonkey's default set).
        include_unchanged: Populate :attr:`DiffResult.unchanged`. Default
            False (unchanged rows are counted but not retained).
        on_duplicate: Duplicate-key policy: ``"warn"`` (keep first + warn),
            ``"first"``, ``"last"``, or ``"error"``.
        on_missing_key: Missing-key policy: ``"warn"`` (keep + warn),
            ``"skip"``, or ``"error"``.

    Returns:
        A :class:`DiffResult` holding the added/removed/changed/unchanged
        buckets, the collected warnings and a :class:`DiffSummary`.

    Raises:
        ValueError: if ``key`` is empty, or ``on_duplicate`` /
            ``on_missing_key`` is not one of its documented values.
        TypeError: if a ``key`` column name is not a string.
    """
    key_columns = _coerce_key(key)
    # Configuration is checked before `old`/`new` are iterated: they may be
    # expensive, side-effecting or endless generators, and a misspelled policy
    # must raise ValueError without consuming them.
    validate_policies(on_duplicate, on_missing_key)
    settings = CompareSettings(
        rel_tol=rel_tol,
        abs_tol=abs_tol,
        normalize_whitespace=normalize_whitespace,
        null_equivalent=null_equivalent,
        type_aware=type_aware,
        date_aware=date_aware,
        locale=locale,
        null_values=None if null_values is None else frozenset(null_values),
    )

    old_rows = _apply_column_map(old, column_map)
    new_rows = [dict(row) for row in new]
    compared = _resolve_columns(old_rows, new_rows, key_columns, columns, ignore)

    # Both sides share the same policies and append to one warnings list.
    warnings: list[str] = []
    policies = {
        "on_duplicate": on_duplicate,
        "on_missing_key": on_missing_key,
        "warnings": warnings,
    }
    old_index, total_old, dup_old = index_rows(
        old_rows, key_columns, side="old", **policies
    )
    new_index, total_new, dup_new = index_rows(
        new_rows, key_columns, side="new", **policies
    )

    # One comparator per compared column, built from the indexed values of
    # both sides.
    comparators = {
        col: make_comparator(
            col,
            [row.get(col) for row in old_index.values()],
            [row.get(col) for row in new_index.values()],
            settings,
        )
        for col in compared
    }

    added: list[RowDiff] = []
    changed: list[RowDiff] = []
    unchanged: list[RowDiff] = []
    matched = 0

    # Walk the new side: every key is either added or matched (new order).
    for row_key, new_row in new_index.items():
        old_row = old_index.get(row_key)
        if old_row is None:
            added.append(RowDiff(key=row_key, new=new_row))
            continue
        matched += 1
        deltas: list[FieldChange] = []
        for col in compared:
            before = old_row.get(col)
            after = new_row.get(col)
            if comparators[col].equal(before, after):
                continue
            deltas.append(FieldChange(column=col, old=before, new=after))
        if deltas:
            changed.append(
                RowDiff(key=row_key, old=old_row, new=new_row, changes=tuple(deltas))
            )
        elif include_unchanged:
            unchanged.append(RowDiff(key=row_key, old=old_row, new=new_row))

    # Keys present only on the old side were removed (old order).
    removed: list[RowDiff] = [
        RowDiff(key=row_key, old=old_row)
        for row_key, old_row in old_index.items()
        if row_key not in new_index
    ]

    summary = DiffSummary(
        total_old=total_old,
        total_new=total_new,
        matched=matched,
        added=len(added),
        removed=len(removed),
        changed=len(changed),
        # Counted even when the unchanged rows themselves are not retained.
        unchanged=matched - len(changed),
        key_columns=key_columns,
        compared_columns=compared,
        duplicate_keys=dup_old + dup_new,
    )

    return DiffResult(
        added=added,
        removed=removed,
        changed=changed,
        unchanged=unchanged,
        summary=summary,
        key_columns=key_columns,
        compared_columns=compared,
        warnings=warnings,
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Report renderers for a :class:`diffmonkey.models.DiffResult`.
|
|
2
|
+
|
|
3
|
+
Each submodule exports a single ``render(result, ...) -> str`` function. They
|
|
4
|
+
live behind :class:`DiffResult`'s ``to_markdown`` / ``to_html`` / ``to_csv``
|
|
5
|
+
methods so callers rarely import them directly, but they are importable for
|
|
6
|
+
golden-file testing and custom pipelines.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from . import csv_out, html, markdown
|
|
12
|
+
|
|
13
|
+
__all__ = ["markdown", "html", "csv_out"]
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""CSV of changes.
|
|
2
|
+
|
|
3
|
+
This module exists to produce a flat, spreadsheet-friendly record of *every*
|
|
4
|
+
field-level change plus added/removed rows — one CSV row per change — so an
|
|
5
|
+
analyst can pivot/filter the diff. The schema is fixed:
|
|
6
|
+
|
|
7
|
+
``change_type, key, column, old, new``
|
|
8
|
+
|
|
9
|
+
* ``change_type`` is ``changed`` / ``added`` / ``removed``.
|
|
10
|
+
* ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
|
|
11
|
+
composite keys).
|
|
12
|
+
* For ``added``/``removed`` rows there is no per-field breakdown, so one row is
|
|
13
|
+
emitted with empty ``column``/``old``/``new`` for ``added`` and ``removed``
|
|
14
|
+
rows alike — the presence of the row is the change.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import csv
|
|
20
|
+
import io
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from ..models import DiffResult, RowDiff
|
|
24
|
+
|
|
25
|
+
HEADER = ["change_type", "key", "column", "old", "new"]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
29
|
+
return "; ".join(
|
|
30
|
+
f"{col}={'' if val is None else val}" for col, val in zip(key_columns, rd.key)
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _cell(value: Any) -> str:
|
|
35
|
+
"""CSV cell text: ``None`` becomes empty string (csv has no null)."""
|
|
36
|
+
return "" if value is None else str(value)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def render(result: DiffResult) -> str:
    """Serialise ``result`` to CSV text with ``\\n`` line terminators.

    Emits the header row, then one row per field change of every changed row,
    then one row per added row and one per removed row (key only; the
    ``column``/``old``/``new`` cells are left empty).
    """
    buf = io.StringIO()
    emit = csv.writer(buf, lineterminator="\n").writerow
    emit(HEADER)
    key_columns = result.key_columns

    for row_diff in result.changed:
        label = _key_str(row_diff, key_columns)
        for change in row_diff.changes:
            emit(
                ["changed", label, change.column, _cell(change.old), _cell(change.new)]
            )

    # Added and removed rows have no per-field breakdown.
    for change_type, bucket in (("added", result.added), ("removed", result.removed)):
        for row_diff in bucket:
            emit([change_type, _key_str(row_diff, key_columns), "", "", ""])

    return buf.getvalue()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Standalone HTML diff report.
|
|
2
|
+
|
|
3
|
+
This module exists to produce a single self-contained HTML document (inline
|
|
4
|
+
CSS, no external assets) suitable for emailing or attaching to a build. All
|
|
5
|
+
dynamic values are HTML-escaped via the stdlib ``html.escape`` so cell content
|
|
6
|
+
containing ``<``/``&``/quotes cannot break the markup or inject script.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from html import escape
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from ..models import DiffResult, RowDiff
|
|
15
|
+
|
|
16
|
+
_STYLE = """\
|
|
17
|
+
body{font-family:-apple-system,Segoe UI,Roboto,sans-serif;margin:2rem;color:#1a1a1a}
|
|
18
|
+
h1{font-size:1.4rem}h2{font-size:1.1rem;border-bottom:1px solid #ddd;padding-bottom:.2rem}
|
|
19
|
+
.summary{font-weight:600;margin:.5rem 0 1rem}
|
|
20
|
+
table{border-collapse:collapse;width:100%;margin:.5rem 0}
|
|
21
|
+
th,td{border:1px solid #ddd;padding:.3rem .5rem;text-align:left;vertical-align:top}
|
|
22
|
+
th{background:#f4f4f4}
|
|
23
|
+
.old{background:#ffecec}.new{background:#eaffea}
|
|
24
|
+
.null{color:#999;font-style:italic}
|
|
25
|
+
.meta{color:#555;font-size:.9rem}
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _cell(value: Any) -> str:
|
|
30
|
+
if value is None:
|
|
31
|
+
return '<span class="null">(null)</span>'
|
|
32
|
+
return escape(str(value))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
36
|
+
return escape(
|
|
37
|
+
", ".join(
|
|
38
|
+
f"{col}={'' if val is None else val}"
|
|
39
|
+
for col, val in zip(key_columns, rd.key)
|
|
40
|
+
)
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def render(result: DiffResult, *, title: str = "diffmonkey report") -> str:
    """Build the full HTML report for ``result`` and return it as a string.

    The document contains the title, the summary line, a key/compared-columns
    line, a table of changed fields, lists of added and removed row keys and,
    when present, the warnings. Every dynamic value is passed through
    ``html.escape`` (directly or via ``_cell`` / ``_key_label``).

    Args:
        result: The diff to render.
        title: Text used for both ``<title>`` and the ``<h1>`` heading.
    """
    s = result.summary
    key_columns = result.key_columns
    safe_title = escape(title)
    key_text = escape(", ".join(key_columns))
    compared_text = escape(", ".join(result.compared_columns) or "(none)")

    parts: list[str] = [
        "<!DOCTYPE html>",
        '<html lang="en"><head><meta charset="utf-8">',
        f"<title>{safe_title}</title>",
        f"<style>{_STYLE}</style></head><body>",
        f"<h1>{safe_title}</h1>",
        f'<p class="summary">{escape(s.one_line())}</p>',
        f'<p class="meta">Key: {key_text} · Compared: {compared_text}'
        f" · {s.total_old} old → {s.total_new} new</p>",
    ]

    parts.append(f"<h2>Changed ({s.changed})</h2>")
    if result.changed:
        parts.append("<table><tr><th>Key</th><th>Column</th><th>Old</th><th>New</th></tr>")
        for rd in result.changed:
            # The key is printed only on the first change row of each record.
            label = _key_label(rd, key_columns)
            for ch in rd.changes:
                parts.append(
                    f"<tr><td>{label}</td><td>{escape(ch.column)}</td>"
                    f'<td class="old">{_cell(ch.old)}</td>'
                    f'<td class="new">{_cell(ch.new)}</td></tr>'
                )
                label = ""
        parts.append("</table>")

    # Added and removed share one layout: a heading plus a list of keys.
    sections = (
        ("Added", s.added, result.added),
        ("Removed", s.removed, result.removed),
    )
    for heading, count, bucket in sections:
        parts.append(f"<h2>{heading} ({count})</h2>")
        if not bucket:
            continue
        parts.append("<ul>")
        for rd in bucket:
            parts.append(f"<li>{_key_label(rd, key_columns)}</li>")
        parts.append("</ul>")

    if result.warnings:
        parts.append(f"<h2>Warnings ({len(result.warnings)})</h2><ul>")
        for message in result.warnings:
            parts.append(f"<li>{escape(message)}</li>")
        parts.append("</ul>")

    parts.append("</body></html>")
    return "\n".join(parts) + "\n"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Markdown diff report.
|
|
2
|
+
|
|
3
|
+
This module exists to turn a :class:`DiffResult` into a human-skimmable report
|
|
4
|
+
for PRs, emails and chat. The layout is deliberately stable (fixed section
|
|
5
|
+
order, fixed headline wording) so it can be pinned with golden-file tests and
|
|
6
|
+
downstream scripts can rely on its shape.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ..models import DiffResult, RowDiff
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _fmt(value: Any) -> str:
|
|
17
|
+
"""Render a cell value for display; ``None`` becomes ``(null)``."""
|
|
18
|
+
if value is None:
|
|
19
|
+
return "(null)"
|
|
20
|
+
return str(value)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
    """Return the ``col=value, col=value`` label for a row's key.

    Key columns are paired positionally with ``rd.key``; each value is
    formatted with ``_fmt``.
    """
    return ", ".join(
        f"{name}={_fmt(component)}" for name, component in zip(key_columns, rd.key)
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
|
|
29
|
+
"""Return ``(shown_rows, hidden_count)`` honouring ``max_rows``."""
|
|
30
|
+
if max_rows is None or len(rows) <= max_rows:
|
|
31
|
+
return rows, 0
|
|
32
|
+
return rows[:max_rows], len(rows) - max_rows
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def render(result: DiffResult, *, max_rows: int | None = None) -> str:
    """Build the markdown report for ``result``.

    Sections appear in a fixed order: headline and metadata, Changed, Added,
    Removed, then Warnings (only when there are any).

    Args:
        result: The diff to render.
        max_rows: Cap on the rows listed *per section* (changed, added,
            removed). Rows beyond the cap are summarised by an ellipsis line.
            ``None`` (the default) lists every row. Headline and section
            counts always come from the summary, not from the capped lists.

    Returns:
        The report text, ending in exactly one newline.
    """
    s = result.summary
    key_columns = result.key_columns
    lines: list[str] = ["# diffmonkey report", "", f"**{s.one_line()}**", ""]

    key_label = ", ".join(f"`{c}`" for c in key_columns)
    cols_label = ", ".join(f"`{c}`" for c in result.compared_columns) or "_(none)_"
    lines.extend(
        [
            f"- Key: {key_label}",
            f"- Compared columns: {cols_label}",
            f"- Rows: {s.total_old} old → {s.total_new} new",
        ]
    )
    if s.duplicate_keys:
        lines.append(f"- Duplicate keys: {s.duplicate_keys}")
    lines.append("")

    # Changed: one sub-heading per row followed by its field changes.
    lines.extend([f"## Changed ({s.changed})", ""])
    shown, hidden = _limit(result.changed, max_rows)
    for rd in shown:
        lines.append(f"### {_key_label(rd, key_columns)}")
        lines.extend(
            f"- **{ch.column}**: `{_fmt(ch.old)}` → `{_fmt(ch.new)}`"
            for ch in rd.changes
        )
        lines.append("")
    if hidden:
        lines.extend([f"_… and {hidden} more changed row(s)._", ""])

    # Added / Removed: identical layout, a flat bullet list of keys.
    sections = (
        ("Added", "added", s.added, result.added),
        ("Removed", "removed", s.removed, result.removed),
    )
    for heading, noun, count, bucket in sections:
        lines.extend([f"## {heading} ({count})", ""])
        shown, hidden = _limit(bucket, max_rows)
        lines.extend(f"- {_key_label(rd, key_columns)}" for rd in shown)
        if hidden:
            lines.append(f"_… and {hidden} more {noun} row(s)._")
        lines.append("")

    if result.warnings:
        lines.extend([f"## Warnings ({len(result.warnings)})", ""])
        lines.extend(f"- {w}" for w in result.warnings)
        lines.append("")

    # Collapse any trailing blank lines into a single final newline.
    return "\n".join(lines).rstrip("\n") + "\n"
|
diffmonkey/matching.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Key-based row matching.
|
|
2
|
+
|
|
3
|
+
This module exists to turn two row lists into a keyed lookup so ``compare()``
|
|
4
|
+
can find the partner (if any) of each row. It owns the messy parts of matching:
|
|
5
|
+
composite keys, duplicate key values, and rows missing a key column. Each of
|
|
6
|
+
those has a configurable policy because there is no single right answer — see
|
|
7
|
+
``on_duplicate`` / ``on_missing_key`` in :func:`compare`.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any, Iterable, Mapping
|
|
13
|
+
|
|
14
|
+
from .models import DuplicateKeyError, MissingKeyError
|
|
15
|
+
|
|
16
|
+
_MISSING = object()
|
|
17
|
+
|
|
18
|
+
DUPLICATE_POLICIES = frozenset({"warn", "first", "last", "error"})
|
|
19
|
+
MISSING_KEY_POLICIES = frozenset({"warn", "skip", "error"})
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_policies(on_duplicate: str, on_missing_key: str) -> None:
    """Check both policy names against their supported sets.

    ``on_duplicate`` is checked first, then ``on_missing_key``; the first
    unsupported value raises. Intended to be called before any input rows are
    consumed so that configuration mistakes fail eagerly.

    Raises:
        ValueError: if ``on_duplicate`` is not in ``DUPLICATE_POLICIES`` or
            ``on_missing_key`` is not in ``MISSING_KEY_POLICIES``.
    """
    checks = (
        ("on_duplicate", on_duplicate, DUPLICATE_POLICIES),
        ("on_missing_key", on_missing_key, MISSING_KEY_POLICIES),
    )
    for param, value, allowed in checks:
        if value in allowed:
            continue
        raise ValueError(f"{param} must be one of {sorted(allowed)}, got {value!r}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def make_key(row: Mapping[str, Any], key_columns: tuple[str, ...]) -> tuple[Any, ...]:
    """Build the key tuple for ``row`` from ``key_columns``, in order.

    A key column that is absent from the row contributes ``None``, so an
    absent column and an explicit ``None`` produce the same key.
    """
    components = [row.get(name) for name in key_columns]
    return tuple(components)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _has_missing_component(
|
|
52
|
+
row: Mapping[str, Any], key_columns: tuple[str, ...]
|
|
53
|
+
) -> bool:
|
|
54
|
+
for col in key_columns:
|
|
55
|
+
if row.get(col, _MISSING) in (_MISSING, None):
|
|
56
|
+
return True
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def index_rows(
    rows: Iterable[Mapping[str, Any]],
    key_columns: tuple[str, ...],
    *,
    side: str,
    on_duplicate: str = "warn",
    on_missing_key: str = "warn",
    warnings: list[str] | None = None,
) -> tuple[dict[tuple[Any, ...], dict[str, Any]], int, int]:
    """Build a key -> row lookup for one side of the comparison.

    Args:
        rows: The rows to index; each is copied into a fresh ``dict``.
        key_columns: Columns forming the (possibly composite) key.
        side: Label (e.g. ``"old"``/``"new"``) used in warning and error text.
        on_duplicate: ``"warn"`` keeps the first occurrence and records a
            warning; ``"first"`` keeps the first silently; ``"last"`` keeps
            the latest silently; ``"error"`` raises.
        on_missing_key: Applies to rows where a key column is absent or
            ``None``. ``"warn"`` keeps the row (missing components become
            ``None``) and records a warning; ``"skip"`` drops the row and
            records a warning; ``"error"`` raises.
        warnings: List that warning messages are appended to. When ``None``
            a private list is used and the messages are discarded.

    Returns:
        ``(index, total_rows, duplicate_keys)``: the mapping from key tuple
        to a single row dict, the number of rows that made it into the index,
        and the number of *distinct* keys that occurred more than once.

    Raises:
        ValueError: if either policy name is unsupported (checked before any
            row is read).
        MissingKeyError: under ``on_missing_key="error"``.
        DuplicateKeyError: under ``on_duplicate="error"``.
    """
    # Fail on a misspelled policy before touching any row, rather than
    # silently falling through to a default that changes what is compared.
    validate_policies(on_duplicate, on_missing_key)

    sink: list[str] = [] if warnings is None else warnings
    index: dict[tuple[Any, ...], dict[str, Any]] = {}
    repeated: set[tuple[Any, ...]] = set()  # distinct keys seen more than once
    kept = 0

    for position, row in enumerate(rows):
        record = dict(row)

        if _has_missing_component(record, key_columns):
            if on_missing_key == "error":
                raise MissingKeyError(
                    f"{side} row {position} is missing key column(s) "
                    f"{key_columns}: {record!r}"
                )
            if on_missing_key == "skip":
                sink.append(f"{side}: row {position} skipped (missing key column)")
                continue
            # "warn": the row stays, keyed with None for the missing component.
            sink.append(
                f"{side}: row {position} has a missing/None key component "
                f"(key={make_key(record, key_columns)!r})"
            )

        key = make_key(record, key_columns)

        if key not in index:
            # First occurrence of this key.
            index[key] = record
            kept += 1
            continue

        # Duplicate: each key is counted once however often it repeats.
        repeated.add(key)
        if on_duplicate == "error":
            raise DuplicateKeyError(
                f"{side}: duplicate key {key!r} (first at an earlier row, "
                f"again at row {position})"
            )
        if on_duplicate == "last":
            # Replace the stored row; the kept-row count does not change.
            index[key] = record
        elif on_duplicate == "warn":
            sink.append(
                f"{side}: duplicate key {key!r} at row {position}; "
                f"keeping first occurrence"
            )
        # "first" (and "warn") leave the stored row untouched.

    return index, kept, len(repeated)
|