diffmonkey 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffmonkey-1.0.0/src/diffmonkey.egg-info → diffmonkey-1.1.0}/PKG-INFO +11 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/README.md +8 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/pyproject.toml +16 -2
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/__init__.py +1 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/cli.py +14 -5
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/comparators.py +83 -15
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/compare.py +21 -8
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/csv_out.py +8 -3
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/html.py +1 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/markdown.py +25 -3
- diffmonkey-1.1.0/src/diffmonkey/readers.py +192 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0/src/diffmonkey.egg-info}/PKG-INFO +11 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey.egg-info/requires.txt +2 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_cli.py +3 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_comparators.py +5 -4
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_compare.py +0 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_compare_properties.py +0 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_formatters.py +0 -1
- diffmonkey-1.1.0/tests/test_review_fixes.py +526 -0
- diffmonkey-1.0.0/src/diffmonkey/readers.py +0 -117
- diffmonkey-1.0.0/tests/test_review_fixes.py +0 -146
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/LICENSE +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/setup.cfg +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/__init__.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/matching.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/models.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey.egg-info/SOURCES.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey.egg-info/dependency_links.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey.egg-info/entry_points.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey.egg-info/top_level.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_matching.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_models.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.0}/tests/test_readers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffmonkey
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
|
|
5
5
|
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
6
|
License: MIT License
|
|
@@ -51,6 +51,8 @@ Provides-Extra: dev
|
|
|
51
51
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
52
52
|
Requires-Dist: pytest-cov; extra == "dev"
|
|
53
53
|
Requires-Dist: hypothesis>=6.0; extra == "dev"
|
|
54
|
+
Requires-Dist: ruff; extra == "dev"
|
|
55
|
+
Requires-Dist: mypy; extra == "dev"
|
|
54
56
|
Dynamic: license-file
|
|
55
57
|
|
|
56
58
|
# diffmonkey
|
|
@@ -144,6 +146,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
|
|
|
144
146
|
deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
|
|
145
147
|
handling) so behaviour that looks surprising is not mistaken for a bug.
|
|
146
148
|
|
|
149
|
+
## Contributing & quality
|
|
150
|
+
|
|
151
|
+
diffmonkey is tested and reviewed against an explicit quality contract. See
|
|
152
|
+
[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
|
|
153
|
+
competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
|
|
154
|
+
for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
|
|
155
|
+
for the release rubric (`python scripts/readiness.py`).
|
|
156
|
+
|
|
147
157
|
## License
|
|
148
158
|
|
|
149
159
|
MIT — see [`LICENSE`](./LICENSE).
|
|
@@ -89,6 +89,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
|
|
|
89
89
|
deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
|
|
90
90
|
handling) so behaviour that looks surprising is not mistaken for a bug.
|
|
91
91
|
|
|
92
|
+
## Contributing & quality
|
|
93
|
+
|
|
94
|
+
diffmonkey is tested and reviewed against an explicit quality contract. See
|
|
95
|
+
[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
|
|
96
|
+
competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
|
|
97
|
+
for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
|
|
98
|
+
for the release rubric (`python scripts/readiness.py`).
|
|
99
|
+
|
|
92
100
|
## License
|
|
93
101
|
|
|
94
102
|
MIT — see [`LICENSE`](./LICENSE).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diffmonkey"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.1.0"
|
|
8
8
|
description = "Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -31,7 +31,7 @@ dependencies = [
|
|
|
31
31
|
[project.optional-dependencies]
|
|
32
32
|
excel = ["openpyxl>=3.0"]
|
|
33
33
|
dsv = ["dsvmonkey"]
|
|
34
|
-
dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
|
|
34
|
+
dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0", "ruff", "mypy"]
|
|
35
35
|
|
|
36
36
|
[project.scripts]
|
|
37
37
|
diffmonkey = "diffmonkey.cli:main"
|
|
@@ -46,3 +46,17 @@ where = ["src"]
|
|
|
46
46
|
[tool.pytest.ini_options]
|
|
47
47
|
testpaths = ["tests"]
|
|
48
48
|
pythonpath = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
src = ["src", "tests"]
|
|
52
|
+
|
|
53
|
+
[tool.mypy]
|
|
54
|
+
files = ["src"]
|
|
55
|
+
python_version = "3.11"
|
|
56
|
+
warn_unused_ignores = true
|
|
57
|
+
|
|
58
|
+
# The rexbytes monkey libraries and openpyxl ship without type stubs; diffmonkey
|
|
59
|
+
# pins their runtime behaviour with tests, not types.
|
|
60
|
+
[[tool.mypy.overrides]]
|
|
61
|
+
module = ["cleanmonkey", "datemonkey", "typemonkey", "openpyxl", "openpyxl.*"]
|
|
62
|
+
ignore_missing_imports = true
|
|
@@ -23,10 +23,19 @@ EXIT_DIFF = 1
|
|
|
23
23
|
EXIT_ERROR = 2
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
def _split_csv(value: str) -> list[str]:
|
|
27
|
+
return [part.strip() for part in value.split(",") if part.strip()]
|
|
28
|
+
|
|
29
|
+
|
|
26
30
|
def _split_csv_opt(value: str | None) -> list[str] | None:
|
|
31
|
+
# An empty / all-blank option (e.g. --columns "" or a shell variable that
|
|
32
|
+
# expanded to nothing) means "unset", not "an explicit empty list". The
|
|
33
|
+
# latter would make _resolve_columns compare ZERO columns and exit 0,
|
|
34
|
+
# silently masking real differences from a CI gate.
|
|
27
35
|
if value is None:
|
|
28
36
|
return None
|
|
29
|
-
|
|
37
|
+
parts = _split_csv(value)
|
|
38
|
+
return parts or None
|
|
30
39
|
|
|
31
40
|
|
|
32
41
|
def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
|
|
@@ -83,7 +92,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
83
92
|
)
|
|
84
93
|
cmp.add_argument(
|
|
85
94
|
"--include-unchanged", action="store_true",
|
|
86
|
-
help="
|
|
95
|
+
help="Retain unchanged rows (listed in json output; counted elsewhere).",
|
|
87
96
|
)
|
|
88
97
|
cmp.add_argument(
|
|
89
98
|
"--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
|
|
@@ -97,7 +106,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
97
106
|
return parser
|
|
98
107
|
|
|
99
108
|
|
|
100
|
-
def _render(result, fmt: str
|
|
109
|
+
def _render(result, fmt: str) -> str:
|
|
101
110
|
if fmt == "summary":
|
|
102
111
|
text = result.summary.one_line()
|
|
103
112
|
if result.warnings:
|
|
@@ -131,7 +140,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
131
140
|
result = compare(
|
|
132
141
|
old_rows,
|
|
133
142
|
new_rows,
|
|
134
|
-
key=
|
|
143
|
+
key=_split_csv(args.key),
|
|
135
144
|
columns=_split_csv_opt(args.columns),
|
|
136
145
|
ignore=_split_csv_opt(args.ignore),
|
|
137
146
|
column_map=_parse_column_map(args.map),
|
|
@@ -148,7 +157,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
148
157
|
print(f"diffmonkey: {exc}", file=sys.stderr)
|
|
149
158
|
return EXIT_ERROR
|
|
150
159
|
|
|
151
|
-
text = _render(result, args.format
|
|
160
|
+
text = _render(result, args.format)
|
|
152
161
|
if args.output:
|
|
153
162
|
try:
|
|
154
163
|
with open(args.output, "w", encoding="utf-8", newline="") as fh:
|
|
@@ -95,15 +95,31 @@ class ColumnComparator:
|
|
|
95
95
|
non-null and not already string-equal.
|
|
96
96
|
"""
|
|
97
97
|
|
|
98
|
-
def __init__(
|
|
98
|
+
def __init__(
|
|
99
|
+
self,
|
|
100
|
+
column: str,
|
|
101
|
+
kind: str,
|
|
102
|
+
settings: CompareSettings,
|
|
103
|
+
*,
|
|
104
|
+
old_locale: str = "us",
|
|
105
|
+
new_locale: str = "us",
|
|
106
|
+
) -> None:
|
|
99
107
|
self.column = column
|
|
100
108
|
self.kind = kind # "numeric" | "date" | "boolean" | "string"
|
|
101
109
|
self.settings = settings
|
|
110
|
+
# Number locale resolved per side (see _side_locale): when the caller
|
|
111
|
+
# left ``locale=None`` we honour typemonkey's auto-detection instead of
|
|
112
|
+
# forcing US, and each side keeps its own — so an ``old`` file in EU
|
|
113
|
+
# format and a ``new`` file in US format are each parsed correctly.
|
|
114
|
+
# (The date path is NOT per-side: it uses the single ``settings.locale``
|
|
115
|
+
# hint, so disambiguating DD/MM vs MM/DD across files still needs an
|
|
116
|
+
# explicit ``locale=`` — see LIMITATIONS.md and ``_as_date``.)
|
|
117
|
+
self.old_locale = old_locale
|
|
118
|
+
self.new_locale = new_locale
|
|
102
119
|
|
|
103
120
|
# -- per-kind normalisation -------------------------------------------
|
|
104
121
|
|
|
105
|
-
def _as_number(self, value: Any) -> float | _Unparseable:
|
|
106
|
-
locale = self.settings.locale or "us"
|
|
122
|
+
def _as_number(self, value: Any, locale: str) -> float | _Unparseable:
|
|
107
123
|
try:
|
|
108
124
|
return float(typemonkey.parse_number(value, locale=locale).value)
|
|
109
125
|
except (ValueError, TypeError):
|
|
@@ -121,6 +137,22 @@ class ColumnComparator:
|
|
|
121
137
|
# whole point of a diff is that old and new differ), and per-value
|
|
122
138
|
# parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
|
|
123
139
|
# from the majority and failing the minority. See LIMITATIONS.md.
|
|
140
|
+
#
|
|
141
|
+
# A bare number like ``"2024"`` is NOT a date a user writes; datemonkey
|
|
142
|
+
# would read it as an Excel serial day-number (``1`` -> 1900-01-01), so a
|
|
143
|
+
# numeric column compared against a date column would silently match the
|
|
144
|
+
# wrong day. Refuse SHORT bare numbers so the pair falls back to string
|
|
145
|
+
# comparison instead of accepting a serial-date interpretation. This
|
|
146
|
+
# covers integer (``"45000"``) AND float-form (``"45000.0"``, or the
|
|
147
|
+
# native float openpyxl returns for a number-formatted cell) serials.
|
|
148
|
+
# The length gate is on the INTEGER part: Excel serials top out at 7
|
|
149
|
+
# digits (``2958465`` is year 9999), so ``< 8`` integer digits blocks
|
|
150
|
+
# every serial while letting the 8-digit compact ISO ``YYYYMMDD`` parse.
|
|
151
|
+
text = str(value).strip()
|
|
152
|
+
core = text.lstrip("+-")
|
|
153
|
+
int_part, _, frac = core.partition(".")
|
|
154
|
+
if int_part.isdigit() and len(int_part) < 8 and (frac == "" or frac.isdigit()):
|
|
155
|
+
return UNPARSEABLE
|
|
124
156
|
try:
|
|
125
157
|
batch = datemonkey.parse_dates(
|
|
126
158
|
[value], locale_preference=self.settings.locale
|
|
@@ -152,9 +184,9 @@ class ColumnComparator:
|
|
|
152
184
|
return True
|
|
153
185
|
|
|
154
186
|
if self.kind == "numeric":
|
|
155
|
-
a = self._as_number(old)
|
|
156
|
-
b = self._as_number(new)
|
|
157
|
-
if a
|
|
187
|
+
a = self._as_number(old, self.old_locale)
|
|
188
|
+
b = self._as_number(new, self.new_locale)
|
|
189
|
+
if isinstance(a, _Unparseable) or isinstance(b, _Unparseable):
|
|
158
190
|
return _strings_equal(old, new, s)
|
|
159
191
|
return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
|
|
160
192
|
|
|
@@ -181,7 +213,9 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
|
181
213
|
|
|
182
214
|
Returns ``"empty"`` when the side has no values to judge (so the other
|
|
183
215
|
side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
|
|
184
|
-
phone numbers) report ``"
|
|
216
|
+
phone numbers) report ``"preserve"`` — a *dominant* string kind that
|
|
217
|
+
:func:`_reconcile` will not let the other side out-vote, so they are never
|
|
218
|
+
numeric-compared even when the opposite side parses as a number.
|
|
185
219
|
"""
|
|
186
220
|
values = list(values)
|
|
187
221
|
if not values:
|
|
@@ -190,7 +224,7 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
|
190
224
|
values, null_values=settings.null_values, locale=settings.locale
|
|
191
225
|
)
|
|
192
226
|
if profile.preserve_as_string:
|
|
193
|
-
return "
|
|
227
|
+
return "preserve"
|
|
194
228
|
t = profile.type
|
|
195
229
|
if t in NUMERIC_TYPES:
|
|
196
230
|
return "numeric"
|
|
@@ -211,13 +245,17 @@ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
|
|
|
211
245
|
is exactly what we must not treat as a change. So we infer per side and
|
|
212
246
|
pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
|
|
213
247
|
fall back to string comparison for any individual pair that cannot be
|
|
214
|
-
parsed under that kind. Precedence: **
|
|
215
|
-
|
|
216
|
-
|
|
248
|
+
parsed under that kind. Precedence: **preserve** (leading-zero IDs etc.,
|
|
249
|
+
dominant — never numeric-compared, even against a numeric side) > **date**
|
|
250
|
+
(if ``date_aware``) > numeric. Boolean requires *both* sides to look
|
|
251
|
+
boolean, so an integer column versus a ``true``/``false`` column is not
|
|
252
|
+
silently equated (``1`` == ``true``).
|
|
217
253
|
"""
|
|
218
254
|
kinds = {old_kind, new_kind} - {"empty"}
|
|
219
255
|
if not kinds:
|
|
220
256
|
return "string"
|
|
257
|
+
if "preserve" in kinds:
|
|
258
|
+
return "string"
|
|
221
259
|
if "date" in kinds and settings.date_aware:
|
|
222
260
|
return "date"
|
|
223
261
|
if "numeric" in kinds:
|
|
@@ -238,6 +276,27 @@ def _infer_kind(
|
|
|
238
276
|
)
|
|
239
277
|
|
|
240
278
|
|
|
279
|
+
def _side_locale(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
280
|
+
"""Resolve the number-parsing locale for one side.
|
|
281
|
+
|
|
282
|
+
Honours an explicit ``settings.locale`` when given; otherwise auto-detects
|
|
283
|
+
it from the values via ``typemonkey.infer_type`` (whose ``profile.locale``
|
|
284
|
+
distinguishes ``"1.234,56"`` EU from ``"1,234.56"`` US), falling back to
|
|
285
|
+
``"us"`` when there is nothing to detect. This is what makes the documented
|
|
286
|
+
``locale=None`` "auto-detect" promise true for numbers — the comparator must
|
|
287
|
+
not silently force ``"us"`` and mis-compare EU-formatted data.
|
|
288
|
+
"""
|
|
289
|
+
if settings.locale:
|
|
290
|
+
return settings.locale
|
|
291
|
+
values = list(values)
|
|
292
|
+
if not values:
|
|
293
|
+
return "us"
|
|
294
|
+
profile = typemonkey.infer_type(
|
|
295
|
+
values, null_values=settings.null_values, locale=None
|
|
296
|
+
)
|
|
297
|
+
return getattr(profile, "locale", None) or "us"
|
|
298
|
+
|
|
299
|
+
|
|
241
300
|
def make_comparator(
|
|
242
301
|
column: str,
|
|
243
302
|
old_values: Sequence[Any],
|
|
@@ -248,12 +307,21 @@ def make_comparator(
|
|
|
248
307
|
|
|
249
308
|
When ``settings.type_aware`` is false the comparator is a pure
|
|
250
309
|
string/null comparator. Otherwise the column's kind is inferred from both
|
|
251
|
-
sides combined; date columns
|
|
252
|
-
|
|
253
|
-
|
|
310
|
+
sides combined; date columns compare by parsed calendar date, each value
|
|
311
|
+
parsed independently with the ``locale`` hint. An ``old`` column in an
|
|
312
|
+
unambiguous format and a ``new`` column in another (e.g. ISO) are not false
|
|
313
|
+
diffs; an *ambiguous* mix such as ``DD/MM/YYYY`` vs ISO needs an explicit
|
|
314
|
+
``locale=`` to disambiguate (see LIMITATIONS.md) — unlike numbers, the date
|
|
315
|
+
path does not auto-detect a per-side locale.
|
|
254
316
|
"""
|
|
255
317
|
if not settings.type_aware:
|
|
256
318
|
return ColumnComparator(column, "string", settings)
|
|
257
319
|
|
|
258
320
|
kind = _infer_kind(old_values, new_values, settings)
|
|
259
|
-
return ColumnComparator(
|
|
321
|
+
return ColumnComparator(
|
|
322
|
+
column,
|
|
323
|
+
kind,
|
|
324
|
+
settings,
|
|
325
|
+
old_locale=_side_locale(old_values, settings),
|
|
326
|
+
new_locale=_side_locale(new_values, settings),
|
|
327
|
+
)
|
|
@@ -12,7 +12,7 @@ from __future__ import annotations
|
|
|
12
12
|
from typing import Any, Iterable, Mapping, Sequence
|
|
13
13
|
|
|
14
14
|
from .comparators import CompareSettings, make_comparator
|
|
15
|
-
from .matching import index_rows,
|
|
15
|
+
from .matching import index_rows, validate_policies
|
|
16
16
|
from .models import DiffResult, DiffSummary, FieldChange, RowDiff
|
|
17
17
|
|
|
18
18
|
Row = Mapping[str, Any]
|
|
@@ -38,17 +38,27 @@ def _apply_column_map(
|
|
|
38
38
|
sides share the new namespace. If both ``old_name`` and ``new_name`` are
|
|
39
39
|
present in a row, the explicitly-new value wins (the rename target is not
|
|
40
40
|
clobbered).
|
|
41
|
+
|
|
42
|
+
Renames are resolved against each row's *original* column names in a single
|
|
43
|
+
pass, so they never chain: ``{"a": "b", "b": "c"}`` renames the original
|
|
44
|
+
``a``→``b`` and the original ``b``→``c`` independently — it does not rename a
|
|
45
|
+
freshly-produced ``b`` on to ``c``.
|
|
41
46
|
"""
|
|
42
47
|
if not column_map:
|
|
43
48
|
return [dict(r) for r in rows]
|
|
44
49
|
out: list[dict[str, Any]] = []
|
|
45
50
|
for r in rows:
|
|
46
51
|
rd = dict(r)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
+
# Columns whose names are taken literally (not rename sources); an
|
|
53
|
+
# explicit value already under a rename target lives here and must win.
|
|
54
|
+
literal_targets = {c for c in rd if c not in column_map}
|
|
55
|
+
renamed: dict[str, Any] = {}
|
|
56
|
+
for col, value in rd.items():
|
|
57
|
+
target = column_map.get(col, col)
|
|
58
|
+
if col in column_map and target in literal_targets:
|
|
59
|
+
continue # explicit-new value present elsewhere wins
|
|
60
|
+
renamed.setdefault(target, value)
|
|
61
|
+
out.append(renamed)
|
|
52
62
|
return out
|
|
53
63
|
|
|
54
64
|
|
|
@@ -108,9 +118,12 @@ def compare(
|
|
|
108
118
|
key: Identity column name, or a sequence of names for a composite key.
|
|
109
119
|
Names refer to the *new* namespace (i.e. after ``column_map``).
|
|
110
120
|
columns: Restrict comparison to these columns. Default: every non-key
|
|
111
|
-
column seen on either side.
|
|
121
|
+
column seen on either side. Like ``key``, names are in the *new*
|
|
122
|
+
namespace (i.e. after ``column_map``) — pass the renamed name, not
|
|
123
|
+
the original, or the column will match nothing and its change be
|
|
124
|
+
missed.
|
|
112
125
|
ignore: Columns to exclude from comparison (timestamps, audit fields,
|
|
113
|
-
row numbers). Applied after ``columns``.
|
|
126
|
+
row numbers), also in the *new* namespace. Applied after ``columns``.
|
|
114
127
|
column_map: ``{old_name: new_name}`` renames applied to ``old`` rows so
|
|
115
128
|
renamed columns are not reported as removed+added.
|
|
116
129
|
rel_tol: Relative floating-point tolerance for numeric columns
|
|
@@ -10,8 +10,8 @@ analyst can pivot/filter the diff. The schema is fixed:
|
|
|
10
10
|
* ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
|
|
11
11
|
composite keys).
|
|
12
12
|
* For ``added``/``removed`` rows there is no per-field breakdown, so one row is
|
|
13
|
-
emitted with
|
|
14
|
-
|
|
13
|
+
emitted with ``column``/``old``/``new`` all empty — the row's presence under
|
|
14
|
+
``change_type`` (and its key) is the change.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from __future__ import annotations
|
|
@@ -26,8 +26,13 @@ HEADER = ["change_type", "key", "column", "old", "new"]
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
29
|
+
# Render a None key component as the (null) sentinel, like the markdown and
|
|
30
|
+
# HTML reports. A None key arises from on_missing_key="warn" (the default);
|
|
31
|
+
# leaving it "" would make it indistinguishable from a genuine empty-string
|
|
32
|
+
# key in the CSV.
|
|
29
33
|
return "; ".join(
|
|
30
|
-
f"{col}={'' if val is None else val}"
|
|
34
|
+
f"{col}={'(null)' if val is None else val}"
|
|
35
|
+
for col, val in zip(key_columns, rd.key)
|
|
31
36
|
)
|
|
32
37
|
|
|
33
38
|
|
|
@@ -35,7 +35,7 @@ def _cell(value: Any) -> str:
|
|
|
35
35
|
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
36
36
|
return escape(
|
|
37
37
|
", ".join(
|
|
38
|
-
f"{col}={'' if val is None else val}"
|
|
38
|
+
f"{col}={'(null)' if val is None else val}"
|
|
39
39
|
for col, val in zip(key_columns, rd.key)
|
|
40
40
|
)
|
|
41
41
|
)
|
|
@@ -8,6 +8,7 @@ downstream scripts can rely on its shape.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import re
|
|
11
12
|
from typing import Any
|
|
12
13
|
|
|
13
14
|
from ..models import DiffResult, RowDiff
|
|
@@ -20,14 +21,35 @@ def _fmt(value: Any) -> str:
|
|
|
20
21
|
return str(value)
|
|
21
22
|
|
|
22
23
|
|
|
24
|
+
def _code(value: Any) -> str:
    """Render ``value`` as a Markdown code span, even if it contains backticks.

    Text without backticks is wrapped in a single pair of backticks. When the
    text does contain backticks, the fence is made one backtick longer than
    the longest backtick run in the text and the content is padded with one
    space on each side, so the span cannot close early and no character of
    the value is substituted.
    """
    text = _fmt(value)
    runs = re.findall(r"`+", text)
    if not runs:
        return f"`{text}`"
    fence = "`" * (len(max(runs, key=len)) + 1)
    return f"{fence} {text} {fence}"
|
|
39
|
+
|
|
40
|
+
|
|
23
41
|
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
    """Format a row's key as ``col=value`` pairs joined by ``", "``.

    Key column names are paired positionally with ``rd.key``; each value is
    rendered through ``_fmt``.
    """
    return ", ".join(
        f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)
    )
|
|
26
44
|
|
|
27
45
|
|
|
28
46
|
def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
|
|
29
|
-
"""Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
|
|
30
|
-
|
|
47
|
+
"""Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
|
|
48
|
+
|
|
49
|
+
A negative ``max_rows`` means "no limit" (same as ``None``); it never
|
|
50
|
+
produces a phantom "N more" notice from slicing with a negative bound.
|
|
51
|
+
"""
|
|
52
|
+
if max_rows is None or max_rows < 0 or len(rows) <= max_rows:
|
|
31
53
|
return rows, 0
|
|
32
54
|
return rows[:max_rows], len(rows) - max_rows
|
|
33
55
|
|
|
@@ -58,7 +80,7 @@ def render(result: DiffResult, *, max_rows: int | None = None) -> str:
|
|
|
58
80
|
for rd in shown:
|
|
59
81
|
lines.append(f"### {_key_label(rd, result.key_columns)}")
|
|
60
82
|
for ch in rd.changes:
|
|
61
|
-
lines.append(f"- **{ch.column}**:
|
|
83
|
+
lines.append(f"- **{ch.column}**: {_code(ch.old)} → {_code(ch.new)}")
|
|
62
84
|
lines.append("")
|
|
63
85
|
if hidden:
|
|
64
86
|
lines.append(f"_… and {hidden} more changed row(s)._")
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Read tabular inputs into lists of dicts for :func:`diffmonkey.compare`.
|
|
2
|
+
|
|
3
|
+
This module exists so the CLI (and callers who start from files) can hand
|
|
4
|
+
``compare()`` the list-of-dicts it expects, regardless of source format. DSV
|
|
5
|
+
files (CSV/TSV/pipe/…) are read with the stdlib ``csv`` module — robust and
|
|
6
|
+
dependency-free. Excel is *optional*: if ``openpyxl`` is installed we use it,
|
|
7
|
+
otherwise :func:`read_excel` raises a clear, actionable error rather than
|
|
8
|
+
failing obscurely.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import csv
|
|
14
|
+
import io
|
|
15
|
+
import os
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
# Default field delimiter implied by a file extension (lower-cased, with dot).
DELIMITER_BY_EXT = {
    ".csv": ",",
    ".tsv": "\t",
    ".tab": "\t",
    ".psv": "|",
    ".pipe": "|",
}


def read_csv(
    path: str,
    *,
    delimiter: str | None = None,
    encoding: str = "utf-8-sig",
) -> list[dict[str, Any]]:
    """Load a delimited text file as a list of row dicts via stdlib ``csv``.

    Args:
        path: File to read.
        delimiter: Field separator. When ``None`` it is looked up from the
            file extension in ``DELIMITER_BY_EXT``, falling back to ``","``
            for extensions that are not listed.
        encoding: Text encoding; the ``utf-8-sig`` default strips a leading
            BOM so it does not end up in the first header name.

    Returns:
        One dict per data row, keyed by header name.

    Raises:
        ValueError: Propagated from ``_read_dictreader`` when a row has more
            fields than the header.

    Note: the file is parsed with ``csv.DictReader``, so repeated header names
    share a single dict key.
    """
    sep = delimiter
    if sep is None:
        _, suffix = os.path.splitext(path)
        sep = DELIMITER_BY_EXT.get(suffix.lower(), ",")
    # newline="" lets the csv module handle embedded newlines itself.
    with open(path, "r", encoding=encoding, newline="") as handle:
        reader = csv.DictReader(handle, delimiter=sep)
        return _read_dictreader(reader, path)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
_OVERFLOW_KEY = "__diffmonkey_overflow__"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _read_dictreader(reader: "csv.DictReader[str]", source: str) -> list[dict[str, Any]]:
|
|
59
|
+
"""Materialise a ``DictReader``, rejecting rows wider than the header.
|
|
60
|
+
|
|
61
|
+
``csv.DictReader`` would otherwise stash extra fields in a list under the
|
|
62
|
+
``None`` key; that yields a non-string key and a list value that flow
|
|
63
|
+
straight into :func:`compare`, contradicting the "all cell values are
|
|
64
|
+
strings" contract.
|
|
65
|
+
|
|
66
|
+
Overflow is detected by the bucket being a *list* (DictReader only ever puts
|
|
67
|
+
a list under ``restkey``), not merely by the key's presence — so a file that
|
|
68
|
+
legitimately has a column literally named ``__diffmonkey_overflow__`` is not
|
|
69
|
+
mistaken for a ragged row.
|
|
70
|
+
"""
|
|
71
|
+
reader.restkey = _OVERFLOW_KEY
|
|
72
|
+
rows: list[dict[str, Any]] = []
|
|
73
|
+
for n, r in enumerate(reader, start=2): # row 1 is the header
|
|
74
|
+
if isinstance(r.get(_OVERFLOW_KEY), list):
|
|
75
|
+
extra = r[_OVERFLOW_KEY]
|
|
76
|
+
raise ValueError(
|
|
77
|
+
f"{source}: row {n} has more fields than the header "
|
|
78
|
+
f"({len(reader.fieldnames or ())} columns); extra values {extra!r}"
|
|
79
|
+
)
|
|
80
|
+
rows.append(dict(r))
|
|
81
|
+
return rows
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _unique_headers(names: list[str]) -> list[str]:
|
|
85
|
+
"""Make header names unique so no two columns share a dict key and one
|
|
86
|
+
silently overwrites the other.
|
|
87
|
+
|
|
88
|
+
A repeated name gets the first free ``_2``/``_3``/… suffix, in order of
|
|
89
|
+
appearance. This covers both an injected ``column_N`` (for a blank header)
|
|
90
|
+
clashing with a real column literally named ``column_N`` and two equal
|
|
91
|
+
literal headers; the first occurrence keeps the name, later ones are
|
|
92
|
+
suffixed, so every column's data survives under a distinct key.
|
|
93
|
+
"""
|
|
94
|
+
assigned: set[str] = set()
|
|
95
|
+
out: list[str] = []
|
|
96
|
+
for name in names:
|
|
97
|
+
if name not in assigned:
|
|
98
|
+
assigned.add(name)
|
|
99
|
+
out.append(name)
|
|
100
|
+
continue
|
|
101
|
+
n = 2
|
|
102
|
+
while f"{name}_{n}" in assigned:
|
|
103
|
+
n += 1
|
|
104
|
+
candidate = f"{name}_{n}"
|
|
105
|
+
assigned.add(candidate)
|
|
106
|
+
out.append(candidate)
|
|
107
|
+
return out
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def read_excel(path: str, *, sheet: str | int | None = None) -> list[dict[str, Any]]:
    """Read the first (or selected) worksheet into a list of dicts.

    Requires the ``excel`` extra (``pip install diffmonkey[excel]``). The
    first row is treated as the header.

    Args:
        path: Workbook to open (read-only, with cached formula values).
        sheet: A worksheet name (``str``), a zero-based worksheet index
            (``int``), or ``None`` for the workbook's active sheet.

    Returns:
        One dict per data row, keyed by header name; ``[]`` when the sheet has
        no rows at all. A data row shorter than the header is padded with
        ``None``; cells beyond the header width are ignored.

    Raises:
        RuntimeError: With install guidance when ``openpyxl`` is unavailable.

    Empty header cells (``None`` or ``""``) are given positional names
    (``column_1``, ``column_2``, …) and all header names are then passed
    through ``_unique_headers``, so blank or repeated headers never collapse
    several data columns onto one key.

    The workbook is always closed before returning or raising — including the
    empty-sheet case and a failed sheet lookup — because a read-only workbook
    keeps the underlying file open until ``close()`` is called.
    """
    try:
        from openpyxl import load_workbook
    except ImportError as exc:  # pragma: no cover - exercised via monkeypatch
        raise RuntimeError(
            "Reading Excel files requires openpyxl. "
            "Install it with: pip install diffmonkey[excel]"
        ) from exc

    wb = load_workbook(path, read_only=True, data_only=True)
    try:
        if isinstance(sheet, str):
            ws = wb[sheet]
        elif isinstance(sheet, int):
            ws = wb.worksheets[sheet]
        else:
            ws = wb.active
        rows_iter = ws.iter_rows(values_only=True)
        try:
            header = next(rows_iter)
        except StopIteration:
            return []
        headers = _unique_headers(
            [
                f"column_{i + 1}" if h is None or str(h) == "" else str(h)
                for i, h in enumerate(header)
            ]
        )
        return [
            {
                name: (raw[i] if i < len(raw) else None)
                for i, name in enumerate(headers)
            }
            for raw in rows_iter
        ]
    finally:
        # Runs on every exit path so the file handle is never leaked.
        wb.close()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Extensions (lower-cased, with dot) that are routed to the Excel reader.
EXCEL_EXTENSIONS = (".xlsx", ".xlsm", ".xltx", ".xltm")


def read_table(
    path: str,
    *,
    delimiter: str | None = None,
    encoding: str = "utf-8-sig",
    sheet: str | int | None = None,
) -> list[dict[str, Any]]:
    """Read ``path`` with the reader that matches its file extension.

    Paths ending in one of ``EXCEL_EXTENSIONS`` (case-insensitive) are read
    with :func:`read_excel`; anything else is read as delimited text with
    :func:`read_csv`.

    Args:
        path: File to read.
        delimiter: Passed to :func:`read_csv`; not valid for Excel input.
        encoding: Passed to :func:`read_csv`; not used for Excel input.
        sheet: Passed to :func:`read_excel`; not valid for delimited input.

    Raises:
        ValueError: If ``delimiter`` is given for an Excel file, or ``sheet``
            is given for a delimited file.
    """
    suffix = os.path.splitext(path)[1].lower()
    is_excel = suffix in EXCEL_EXTENSIONS

    # Reject arguments that the selected reader cannot honour.
    if is_excel and delimiter is not None:
        raise ValueError(
            f"delimiter is not applicable to Excel input {path!r}"
        )
    if not is_excel and sheet is not None:
        raise ValueError(f"sheet is not applicable to delimited input {path!r}")

    if is_excel:
        return read_excel(path, sheet=sheet)
    return read_csv(path, delimiter=delimiter, encoding=encoding)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def read_csv_string(
    text: str, *, delimiter: str = ","
) -> list[dict[str, Any]]:
    """Parse delimited text held in a string into a list of row dicts.

    The text is wrapped in an in-memory stream and read with
    ``csv.DictReader``. Rows are materialised by ``_read_dictreader`` with
    ``"<string>"`` as the source label, so a row wider than the header raises
    :class:`ValueError` just as it does in :func:`read_csv`.
    """
    return _read_dictreader(
        csv.DictReader(io.StringIO(text), delimiter=delimiter), "<string>"
    )
|