diffmonkey 1.0.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffmonkey-1.0.0/src/diffmonkey.egg-info → diffmonkey-1.1.1}/PKG-INFO +14 -4
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/README.md +8 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/pyproject.toml +19 -5
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/__init__.py +1 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/cli.py +14 -5
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/comparators.py +85 -16
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/compare.py +26 -11
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/csv_out.py +8 -3
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/html.py +1 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/markdown.py +25 -3
- diffmonkey-1.1.1/src/diffmonkey/readers.py +192 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1/src/diffmonkey.egg-info}/PKG-INFO +14 -4
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/requires.txt +5 -3
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_cli.py +3 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_comparators.py +5 -4
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_compare.py +0 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_compare_properties.py +0 -1
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_formatters.py +0 -1
- diffmonkey-1.1.1/tests/test_review_fixes.py +564 -0
- diffmonkey-1.0.0/src/diffmonkey/readers.py +0 -117
- diffmonkey-1.0.0/tests/test_review_fixes.py +0 -146
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/LICENSE +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/setup.cfg +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/__init__.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/matching.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/models.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/SOURCES.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/dependency_links.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/entry_points.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/top_level.txt +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_matching.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_models.py +0 -0
- {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_readers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffmonkey
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
|
|
5
5
|
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
6
|
License: MIT License
|
|
@@ -40,9 +40,9 @@ Classifier: Topic :: Utilities
|
|
|
40
40
|
Requires-Python: >=3.11
|
|
41
41
|
Description-Content-Type: text/markdown
|
|
42
42
|
License-File: LICENSE
|
|
43
|
-
Requires-Dist: cleanmonkey
|
|
44
|
-
Requires-Dist: typemonkey
|
|
45
|
-
Requires-Dist: datemonkey
|
|
43
|
+
Requires-Dist: cleanmonkey>=0.2.0
|
|
44
|
+
Requires-Dist: typemonkey>=1.2.0
|
|
45
|
+
Requires-Dist: datemonkey>=0.2.0
|
|
46
46
|
Provides-Extra: excel
|
|
47
47
|
Requires-Dist: openpyxl>=3.0; extra == "excel"
|
|
48
48
|
Provides-Extra: dsv
|
|
@@ -51,6 +51,8 @@ Provides-Extra: dev
|
|
|
51
51
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
52
52
|
Requires-Dist: pytest-cov; extra == "dev"
|
|
53
53
|
Requires-Dist: hypothesis>=6.0; extra == "dev"
|
|
54
|
+
Requires-Dist: ruff; extra == "dev"
|
|
55
|
+
Requires-Dist: mypy; extra == "dev"
|
|
54
56
|
Dynamic: license-file
|
|
55
57
|
|
|
56
58
|
# diffmonkey
|
|
@@ -144,6 +146,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
|
|
|
144
146
|
deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
|
|
145
147
|
handling) so behaviour that looks surprising is not mistaken for a bug.
|
|
146
148
|
|
|
149
|
+
## Contributing & quality
|
|
150
|
+
|
|
151
|
+
diffmonkey is tested and reviewed against an explicit quality contract. See
|
|
152
|
+
[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
|
|
153
|
+
competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
|
|
154
|
+
for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
|
|
155
|
+
for the release rubric (`python scripts/readiness.py`).
|
|
156
|
+
|
|
147
157
|
## License
|
|
148
158
|
|
|
149
159
|
MIT — see [`LICENSE`](./LICENSE).
|
|
@@ -89,6 +89,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
|
|
|
89
89
|
deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
|
|
90
90
|
handling) so behaviour that looks surprising is not mistaken for a bug.
|
|
91
91
|
|
|
92
|
+
## Contributing & quality
|
|
93
|
+
|
|
94
|
+
diffmonkey is tested and reviewed against an explicit quality contract. See
|
|
95
|
+
[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
|
|
96
|
+
competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
|
|
97
|
+
for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
|
|
98
|
+
for the release rubric (`python scripts/readiness.py`).
|
|
99
|
+
|
|
92
100
|
## License
|
|
93
101
|
|
|
94
102
|
MIT — see [`LICENSE`](./LICENSE).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diffmonkey"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.1.1"
|
|
8
8
|
description = "Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -23,15 +23,15 @@ classifiers = [
|
|
|
23
23
|
"Topic :: Utilities",
|
|
24
24
|
]
|
|
25
25
|
dependencies = [
|
|
26
|
-
"cleanmonkey",
|
|
27
|
-
"typemonkey",
|
|
28
|
-
"datemonkey",
|
|
26
|
+
"cleanmonkey>=0.2.0",
|
|
27
|
+
"typemonkey>=1.2.0",
|
|
28
|
+
"datemonkey>=0.2.0",
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[project.optional-dependencies]
|
|
32
32
|
excel = ["openpyxl>=3.0"]
|
|
33
33
|
dsv = ["dsvmonkey"]
|
|
34
|
-
dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
|
|
34
|
+
dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0", "ruff", "mypy"]
|
|
35
35
|
|
|
36
36
|
[project.scripts]
|
|
37
37
|
diffmonkey = "diffmonkey.cli:main"
|
|
@@ -46,3 +46,17 @@ where = ["src"]
|
|
|
46
46
|
[tool.pytest.ini_options]
|
|
47
47
|
testpaths = ["tests"]
|
|
48
48
|
pythonpath = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
src = ["src", "tests"]
|
|
52
|
+
|
|
53
|
+
[tool.mypy]
|
|
54
|
+
files = ["src"]
|
|
55
|
+
python_version = "3.11"
|
|
56
|
+
warn_unused_ignores = true
|
|
57
|
+
|
|
58
|
+
# The rexbytes monkey libraries and openpyxl ship without type stubs; diffmonkey
|
|
59
|
+
# pins their runtime behaviour with tests, not types.
|
|
60
|
+
[[tool.mypy.overrides]]
|
|
61
|
+
module = ["cleanmonkey", "datemonkey", "typemonkey", "openpyxl", "openpyxl.*"]
|
|
62
|
+
ignore_missing_imports = true
|
|
@@ -23,10 +23,19 @@ EXIT_DIFF = 1
|
|
|
23
23
|
EXIT_ERROR = 2
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
def _split_csv(value: str) -> list[str]:
|
|
27
|
+
return [part.strip() for part in value.split(",") if part.strip()]
|
|
28
|
+
|
|
29
|
+
|
|
26
30
|
def _split_csv_opt(value: str | None) -> list[str] | None:
|
|
31
|
+
# An empty / all-blank option (e.g. --columns "" or a shell variable that
|
|
32
|
+
# expanded to nothing) means "unset", not "an explicit empty list". The
|
|
33
|
+
# latter would make _resolve_columns compare ZERO columns and exit 0,
|
|
34
|
+
# silently masking real differences from a CI gate.
|
|
27
35
|
if value is None:
|
|
28
36
|
return None
|
|
29
|
-
|
|
37
|
+
parts = _split_csv(value)
|
|
38
|
+
return parts or None
|
|
30
39
|
|
|
31
40
|
|
|
32
41
|
def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
|
|
@@ -83,7 +92,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
83
92
|
)
|
|
84
93
|
cmp.add_argument(
|
|
85
94
|
"--include-unchanged", action="store_true",
|
|
86
|
-
help="
|
|
95
|
+
help="Retain unchanged rows (listed in json output; counted elsewhere).",
|
|
87
96
|
)
|
|
88
97
|
cmp.add_argument(
|
|
89
98
|
"--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
|
|
@@ -97,7 +106,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
97
106
|
return parser
|
|
98
107
|
|
|
99
108
|
|
|
100
|
-
def _render(result, fmt: str
|
|
109
|
+
def _render(result, fmt: str) -> str:
|
|
101
110
|
if fmt == "summary":
|
|
102
111
|
text = result.summary.one_line()
|
|
103
112
|
if result.warnings:
|
|
@@ -131,7 +140,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
131
140
|
result = compare(
|
|
132
141
|
old_rows,
|
|
133
142
|
new_rows,
|
|
134
|
-
key=
|
|
143
|
+
key=_split_csv(args.key),
|
|
135
144
|
columns=_split_csv_opt(args.columns),
|
|
136
145
|
ignore=_split_csv_opt(args.ignore),
|
|
137
146
|
column_map=_parse_column_map(args.map),
|
|
@@ -148,7 +157,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
148
157
|
print(f"diffmonkey: {exc}", file=sys.stderr)
|
|
149
158
|
return EXIT_ERROR
|
|
150
159
|
|
|
151
|
-
text = _render(result, args.format
|
|
160
|
+
text = _render(result, args.format)
|
|
152
161
|
if args.output:
|
|
153
162
|
try:
|
|
154
163
|
with open(args.output, "w", encoding="utf-8", newline="") as fh:
|
|
@@ -76,7 +76,8 @@ def _clean(value: Any, settings: CompareSettings) -> str:
|
|
|
76
76
|
text = value if isinstance(value, str) else str(value)
|
|
77
77
|
if settings.normalize_whitespace:
|
|
78
78
|
# 'minimal' would skip whitespace collapsing; 'default' strips and
|
|
79
|
-
# collapses runs
|
|
79
|
+
# collapses runs, removes invisible characters, and folds typographic
|
|
80
|
+
# variants (smart quotes→straight, dashes→'-', '…'→'...') — exactly the
|
|
80
81
|
# false-diff sources we want gone. strip=True is safe for cell values.
|
|
81
82
|
return cleanmonkey.clean(text, profile="default")
|
|
82
83
|
return text
|
|
@@ -95,15 +96,31 @@ class ColumnComparator:
|
|
|
95
96
|
non-null and not already string-equal.
|
|
96
97
|
"""
|
|
97
98
|
|
|
98
|
-
def __init__(
|
|
99
|
+
def __init__(
|
|
100
|
+
self,
|
|
101
|
+
column: str,
|
|
102
|
+
kind: str,
|
|
103
|
+
settings: CompareSettings,
|
|
104
|
+
*,
|
|
105
|
+
old_locale: str = "us",
|
|
106
|
+
new_locale: str = "us",
|
|
107
|
+
) -> None:
|
|
99
108
|
self.column = column
|
|
100
109
|
self.kind = kind # "numeric" | "date" | "boolean" | "string"
|
|
101
110
|
self.settings = settings
|
|
111
|
+
# Number locale resolved per side (see _side_locale): when the caller
|
|
112
|
+
# left ``locale=None`` we honour typemonkey's auto-detection instead of
|
|
113
|
+
# forcing US, and each side keeps its own — so an ``old`` file in EU
|
|
114
|
+
# format and a ``new`` file in US format are each parsed correctly.
|
|
115
|
+
# (The date path is NOT per-side: it uses the single ``settings.locale``
|
|
116
|
+
# hint, so disambiguating DD/MM vs MM/DD across files still needs an
|
|
117
|
+
# explicit ``locale=`` — see LIMITATIONS.md and ``_as_date``.)
|
|
118
|
+
self.old_locale = old_locale
|
|
119
|
+
self.new_locale = new_locale
|
|
102
120
|
|
|
103
121
|
# -- per-kind normalisation -------------------------------------------
|
|
104
122
|
|
|
105
|
-
def _as_number(self, value: Any) -> float | _Unparseable:
|
|
106
|
-
locale = self.settings.locale or "us"
|
|
123
|
+
def _as_number(self, value: Any, locale: str) -> float | _Unparseable:
|
|
107
124
|
try:
|
|
108
125
|
return float(typemonkey.parse_number(value, locale=locale).value)
|
|
109
126
|
except (ValueError, TypeError):
|
|
@@ -121,6 +138,22 @@ class ColumnComparator:
|
|
|
121
138
|
# whole point of a diff is that old and new differ), and per-value
|
|
122
139
|
# parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
|
|
123
140
|
# from the majority and failing the minority. See LIMITATIONS.md.
|
|
141
|
+
#
|
|
142
|
+
# A bare number like ``"2024"`` is NOT a date a user writes; datemonkey
|
|
143
|
+
# would read it as an Excel serial day-number (``1`` -> 1900-01-01), so a
|
|
144
|
+
# numeric column compared against a date column would silently match the
|
|
145
|
+
# wrong day. Refuse SHORT bare numbers so the pair falls back to string
|
|
146
|
+
# comparison instead of accepting a serial-date interpretation. This
|
|
147
|
+
# covers integer (``"45000"``) AND float-form (``"45000.0"``, or the
|
|
148
|
+
# native float openpyxl returns for a number-formatted cell) serials.
|
|
149
|
+
# The length gate is on the INTEGER part: Excel serials top out at 7
|
|
150
|
+
# digits (``2958465`` is year 9999), so ``< 8`` integer digits blocks
|
|
151
|
+
# every serial while letting the 8-digit compact ISO ``YYYYMMDD`` parse.
|
|
152
|
+
text = str(value).strip()
|
|
153
|
+
core = text.lstrip("+-")
|
|
154
|
+
int_part, _, frac = core.partition(".")
|
|
155
|
+
if int_part.isdigit() and len(int_part) < 8 and (frac == "" or frac.isdigit()):
|
|
156
|
+
return UNPARSEABLE
|
|
124
157
|
try:
|
|
125
158
|
batch = datemonkey.parse_dates(
|
|
126
159
|
[value], locale_preference=self.settings.locale
|
|
@@ -152,9 +185,9 @@ class ColumnComparator:
|
|
|
152
185
|
return True
|
|
153
186
|
|
|
154
187
|
if self.kind == "numeric":
|
|
155
|
-
a = self._as_number(old)
|
|
156
|
-
b = self._as_number(new)
|
|
157
|
-
if a
|
|
188
|
+
a = self._as_number(old, self.old_locale)
|
|
189
|
+
b = self._as_number(new, self.new_locale)
|
|
190
|
+
if isinstance(a, _Unparseable) or isinstance(b, _Unparseable):
|
|
158
191
|
return _strings_equal(old, new, s)
|
|
159
192
|
return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
|
|
160
193
|
|
|
@@ -181,7 +214,9 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
|
181
214
|
|
|
182
215
|
Returns ``"empty"`` when the side has no values to judge (so the other
|
|
183
216
|
side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
|
|
184
|
-
phone numbers) report ``"
|
|
217
|
+
phone numbers) report ``"preserve"`` — a *dominant* string kind that
|
|
218
|
+
:func:`_reconcile` will not let the other side out-vote, so they are never
|
|
219
|
+
numeric-compared even when the opposite side parses as a number.
|
|
185
220
|
"""
|
|
186
221
|
values = list(values)
|
|
187
222
|
if not values:
|
|
@@ -190,7 +225,7 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
|
190
225
|
values, null_values=settings.null_values, locale=settings.locale
|
|
191
226
|
)
|
|
192
227
|
if profile.preserve_as_string:
|
|
193
|
-
return "
|
|
228
|
+
return "preserve"
|
|
194
229
|
t = profile.type
|
|
195
230
|
if t in NUMERIC_TYPES:
|
|
196
231
|
return "numeric"
|
|
@@ -211,13 +246,17 @@ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
|
|
|
211
246
|
is exactly what we must not treat as a change. So we infer per side and
|
|
212
247
|
pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
|
|
213
248
|
fall back to string comparison for any individual pair that cannot be
|
|
214
|
-
parsed under that kind. Precedence: **
|
|
215
|
-
|
|
216
|
-
|
|
249
|
+
parsed under that kind. Precedence: **preserve** (leading-zero IDs etc.,
|
|
250
|
+
dominant — never numeric-compared, even against a numeric side) > **date**
|
|
251
|
+
(if ``date_aware``) > numeric. Boolean requires *both* sides to look
|
|
252
|
+
boolean, so an integer column versus a ``true``/``false`` column is not
|
|
253
|
+
silently equated (``1`` == ``true``).
|
|
217
254
|
"""
|
|
218
255
|
kinds = {old_kind, new_kind} - {"empty"}
|
|
219
256
|
if not kinds:
|
|
220
257
|
return "string"
|
|
258
|
+
if "preserve" in kinds:
|
|
259
|
+
return "string"
|
|
221
260
|
if "date" in kinds and settings.date_aware:
|
|
222
261
|
return "date"
|
|
223
262
|
if "numeric" in kinds:
|
|
@@ -238,6 +277,27 @@ def _infer_kind(
|
|
|
238
277
|
)
|
|
239
278
|
|
|
240
279
|
|
|
280
|
+
def _side_locale(values: Sequence[Any], settings: CompareSettings) -> str:
|
|
281
|
+
"""Resolve the number-parsing locale for one side.
|
|
282
|
+
|
|
283
|
+
Honours an explicit ``settings.locale`` when given; otherwise auto-detects
|
|
284
|
+
it from the values via ``typemonkey.infer_type`` (whose ``profile.locale``
|
|
285
|
+
distinguishes ``"1.234,56"`` EU from ``"1,234.56"`` US), falling back to
|
|
286
|
+
``"us"`` when there is nothing to detect. This is what makes the documented
|
|
287
|
+
``locale=None`` "auto-detect" promise true for numbers — the comparator must
|
|
288
|
+
not silently force ``"us"`` and mis-compare EU-formatted data.
|
|
289
|
+
"""
|
|
290
|
+
if settings.locale:
|
|
291
|
+
return settings.locale
|
|
292
|
+
values = list(values)
|
|
293
|
+
if not values:
|
|
294
|
+
return "us"
|
|
295
|
+
profile = typemonkey.infer_type(
|
|
296
|
+
values, null_values=settings.null_values, locale=None
|
|
297
|
+
)
|
|
298
|
+
return getattr(profile, "locale", None) or "us"
|
|
299
|
+
|
|
300
|
+
|
|
241
301
|
def make_comparator(
|
|
242
302
|
column: str,
|
|
243
303
|
old_values: Sequence[Any],
|
|
@@ -248,12 +308,21 @@ def make_comparator(
|
|
|
248
308
|
|
|
249
309
|
When ``settings.type_aware`` is false the comparator is a pure
|
|
250
310
|
string/null comparator. Otherwise the column's kind is inferred from both
|
|
251
|
-
sides combined; date columns
|
|
252
|
-
|
|
253
|
-
|
|
311
|
+
sides combined; date columns compare by parsed calendar date, each value
|
|
312
|
+
parsed independently with the ``locale`` hint. An ``old`` column in an
|
|
313
|
+
unambiguous format and a ``new`` column in another (e.g. ISO) are not false
|
|
314
|
+
diffs; an *ambiguous* mix such as ``DD/MM/YYYY`` vs ISO needs an explicit
|
|
315
|
+
``locale=`` to disambiguate (see LIMITATIONS.md) — unlike numbers, the date
|
|
316
|
+
path does not auto-detect a per-side locale.
|
|
254
317
|
"""
|
|
255
318
|
if not settings.type_aware:
|
|
256
319
|
return ColumnComparator(column, "string", settings)
|
|
257
320
|
|
|
258
321
|
kind = _infer_kind(old_values, new_values, settings)
|
|
259
|
-
return ColumnComparator(
|
|
322
|
+
return ColumnComparator(
|
|
323
|
+
column,
|
|
324
|
+
kind,
|
|
325
|
+
settings,
|
|
326
|
+
old_locale=_side_locale(old_values, settings),
|
|
327
|
+
new_locale=_side_locale(new_values, settings),
|
|
328
|
+
)
|
|
@@ -12,7 +12,7 @@ from __future__ import annotations
|
|
|
12
12
|
from typing import Any, Iterable, Mapping, Sequence
|
|
13
13
|
|
|
14
14
|
from .comparators import CompareSettings, make_comparator
|
|
15
|
-
from .matching import index_rows,
|
|
15
|
+
from .matching import index_rows, validate_policies
|
|
16
16
|
from .models import DiffResult, DiffSummary, FieldChange, RowDiff
|
|
17
17
|
|
|
18
18
|
Row = Mapping[str, Any]
|
|
@@ -38,17 +38,27 @@ def _apply_column_map(
|
|
|
38
38
|
sides share the new namespace. If both ``old_name`` and ``new_name`` are
|
|
39
39
|
present in a row, the explicitly-new value wins (the rename target is not
|
|
40
40
|
clobbered).
|
|
41
|
+
|
|
42
|
+
Renames are resolved against each row's *original* column names in a single
|
|
43
|
+
pass, so they never chain: ``{"a": "b", "b": "c"}`` renames the original
|
|
44
|
+
``a``→``b`` and the original ``b``→``c`` independently — it does not rename a
|
|
45
|
+
freshly-produced ``b`` on to ``c``.
|
|
41
46
|
"""
|
|
42
47
|
if not column_map:
|
|
43
48
|
return [dict(r) for r in rows]
|
|
44
49
|
out: list[dict[str, Any]] = []
|
|
45
50
|
for r in rows:
|
|
46
51
|
rd = dict(r)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
+
# Columns whose names are taken literally (not rename sources); an
|
|
53
|
+
# explicit value already under a rename target lives here and must win.
|
|
54
|
+
literal_targets = {c for c in rd if c not in column_map}
|
|
55
|
+
renamed: dict[str, Any] = {}
|
|
56
|
+
for col, value in rd.items():
|
|
57
|
+
target = column_map.get(col, col)
|
|
58
|
+
if col in column_map and target in literal_targets:
|
|
59
|
+
continue # explicit-new value present elsewhere wins
|
|
60
|
+
renamed.setdefault(target, value)
|
|
61
|
+
out.append(renamed)
|
|
52
62
|
return out
|
|
53
63
|
|
|
54
64
|
|
|
@@ -108,18 +118,23 @@ def compare(
|
|
|
108
118
|
key: Identity column name, or a sequence of names for a composite key.
|
|
109
119
|
Names refer to the *new* namespace (i.e. after ``column_map``).
|
|
110
120
|
columns: Restrict comparison to these columns. Default: every non-key
|
|
111
|
-
column seen on either side.
|
|
121
|
+
column seen on either side. Like ``key``, names are in the *new*
|
|
122
|
+
namespace (i.e. after ``column_map``) — pass the renamed name, not
|
|
123
|
+
the original, or the column will match nothing and its change be
|
|
124
|
+
missed.
|
|
112
125
|
ignore: Columns to exclude from comparison (timestamps, audit fields,
|
|
113
|
-
row numbers). Applied after ``columns``.
|
|
126
|
+
row numbers), also in the *new* namespace. Applied after ``columns``.
|
|
114
127
|
column_map: ``{old_name: new_name}`` renames applied to ``old`` rows so
|
|
115
128
|
renamed columns are not reported as removed+added.
|
|
116
129
|
rel_tol: Relative floating-point tolerance for numeric columns
|
|
117
130
|
(``math.isclose``). Default ``1e-9``.
|
|
118
131
|
abs_tol: Absolute floating-point tolerance for numeric columns. Default
|
|
119
132
|
``0.0`` — set this to compare values near zero.
|
|
120
|
-
normalize_whitespace: Collapse/strip whitespace
|
|
121
|
-
characters
|
|
122
|
-
|
|
133
|
+
normalize_whitespace: Collapse/strip whitespace, drop invisible
|
|
134
|
+
characters, and fold typographic variants (smart quotes→straight,
|
|
135
|
+
dashes/minus→``-``, ``…``→``...``) before comparing strings (via
|
|
136
|
+
``cleanmonkey``'s ``default`` profile), so cosmetic punctuation
|
|
137
|
+
differences are not reported as changes. Default True.
|
|
123
138
|
null_equivalent: Treat all null spellings (``None``, ``""``,
|
|
124
139
|
whitespace-only, ``"na"``, …) as one value, so two different nulls
|
|
125
140
|
are equal and null↔value is a change. Default True.
|
|
@@ -10,8 +10,8 @@ analyst can pivot/filter the diff. The schema is fixed:
|
|
|
10
10
|
* ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
|
|
11
11
|
composite keys).
|
|
12
12
|
* For ``added``/``removed`` rows there is no per-field breakdown, so one row is
|
|
13
|
-
emitted with
|
|
14
|
-
|
|
13
|
+
emitted with ``column``/``old``/``new`` all empty — the row's presence under
|
|
14
|
+
``change_type`` (and its key) is the change.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from __future__ import annotations
|
|
@@ -26,8 +26,13 @@ HEADER = ["change_type", "key", "column", "old", "new"]
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
29
|
+
# Render a None key component as the (null) sentinel, like the markdown and
|
|
30
|
+
# HTML reports. A None key arises from on_missing_key="warn" (the default);
|
|
31
|
+
# leaving it "" would make it indistinguishable from a genuine empty-string
|
|
32
|
+
# key in the CSV.
|
|
29
33
|
return "; ".join(
|
|
30
|
-
f"{col}={'' if val is None else val}"
|
|
34
|
+
f"{col}={'(null)' if val is None else val}"
|
|
35
|
+
for col, val in zip(key_columns, rd.key)
|
|
31
36
|
)
|
|
32
37
|
|
|
33
38
|
|
|
@@ -35,7 +35,7 @@ def _cell(value: Any) -> str:
|
|
|
35
35
|
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
36
36
|
return escape(
|
|
37
37
|
", ".join(
|
|
38
|
-
f"{col}={'' if val is None else val}"
|
|
38
|
+
f"{col}={'(null)' if val is None else val}"
|
|
39
39
|
for col, val in zip(key_columns, rd.key)
|
|
40
40
|
)
|
|
41
41
|
)
|
|
@@ -8,6 +8,7 @@ downstream scripts can rely on its shape.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import re
|
|
11
12
|
from typing import Any
|
|
12
13
|
|
|
13
14
|
from ..models import DiffResult, RowDiff
|
|
@@ -20,14 +21,35 @@ def _fmt(value: Any) -> str:
|
|
|
20
21
|
return str(value)
|
|
21
22
|
|
|
22
23
|
|
|
24
|
+
def _code(value: Any) -> str:
|
|
25
|
+
"""Wrap a value in a backtick code span that survives embedded backticks.
|
|
26
|
+
|
|
27
|
+
A naive ```{value}``` breaks when the value itself contains a
|
|
28
|
+
backtick (the span closes early and the rest renders as garbled text). Per
|
|
29
|
+
CommonMark, a code span may be fenced by any number of backticks, so we pick
|
|
30
|
+
a fence one longer than the longest run inside the value and pad with spaces
|
|
31
|
+
when needed — lossless, no character substitution.
|
|
32
|
+
"""
|
|
33
|
+
text = _fmt(value)
|
|
34
|
+
if "`" not in text:
|
|
35
|
+
return f"`{text}`"
|
|
36
|
+
longest = max(len(run) for run in re.findall(r"`+", text))
|
|
37
|
+
fence = "`" * (longest + 1)
|
|
38
|
+
return f"{fence} {text} {fence}"
|
|
39
|
+
|
|
40
|
+
|
|
23
41
|
def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
|
|
24
42
|
parts = [f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)]
|
|
25
43
|
return ", ".join(parts)
|
|
26
44
|
|
|
27
45
|
|
|
28
46
|
def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
|
|
29
|
-
"""Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
|
|
30
|
-
|
|
47
|
+
"""Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
|
|
48
|
+
|
|
49
|
+
A negative ``max_rows`` means "no limit" (same as ``None``); it never
|
|
50
|
+
produces a phantom "N more" notice from slicing with a negative bound.
|
|
51
|
+
"""
|
|
52
|
+
if max_rows is None or max_rows < 0 or len(rows) <= max_rows:
|
|
31
53
|
return rows, 0
|
|
32
54
|
return rows[:max_rows], len(rows) - max_rows
|
|
33
55
|
|
|
@@ -58,7 +80,7 @@ def render(result: DiffResult, *, max_rows: int | None = None) -> str:
|
|
|
58
80
|
for rd in shown:
|
|
59
81
|
lines.append(f"### {_key_label(rd, result.key_columns)}")
|
|
60
82
|
for ch in rd.changes:
|
|
61
|
-
lines.append(f"- **{ch.column}**:
|
|
83
|
+
lines.append(f"- **{ch.column}**: {_code(ch.old)} → {_code(ch.new)}")
|
|
62
84
|
lines.append("")
|
|
63
85
|
if hidden:
|
|
64
86
|
lines.append(f"_… and {hidden} more changed row(s)._")
|