diffmonkey 1.0.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {diffmonkey-1.0.0/src/diffmonkey.egg-info → diffmonkey-1.1.1}/PKG-INFO +14 -4
  2. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/README.md +8 -0
  3. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/pyproject.toml +19 -5
  4. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/__init__.py +1 -1
  5. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/cli.py +14 -5
  6. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/comparators.py +85 -16
  7. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/compare.py +26 -11
  8. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/csv_out.py +8 -3
  9. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/html.py +1 -1
  10. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/markdown.py +25 -3
  11. diffmonkey-1.1.1/src/diffmonkey/readers.py +192 -0
  12. {diffmonkey-1.0.0 → diffmonkey-1.1.1/src/diffmonkey.egg-info}/PKG-INFO +14 -4
  13. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/requires.txt +5 -3
  14. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_cli.py +3 -1
  15. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_comparators.py +5 -4
  16. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_compare.py +0 -1
  17. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_compare_properties.py +0 -1
  18. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_formatters.py +0 -1
  19. diffmonkey-1.1.1/tests/test_review_fixes.py +564 -0
  20. diffmonkey-1.0.0/src/diffmonkey/readers.py +0 -117
  21. diffmonkey-1.0.0/tests/test_review_fixes.py +0 -146
  22. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/LICENSE +0 -0
  23. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/setup.cfg +0 -0
  24. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/__init__.py +0 -0
  25. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/matching.py +0 -0
  26. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/models.py +0 -0
  27. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/SOURCES.txt +0 -0
  28. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/dependency_links.txt +0 -0
  29. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/entry_points.txt +0 -0
  30. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey.egg-info/top_level.txt +0 -0
  31. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_matching.py +0 -0
  32. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_models.py +0 -0
  33. {diffmonkey-1.0.0 → diffmonkey-1.1.1}/tests/test_readers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffmonkey
3
- Version: 1.0.0
3
+ Version: 1.1.1
4
4
  Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
5
5
  Author-email: RexBytes <pythonic@rexbytes.com>
6
6
  License: MIT License
@@ -40,9 +40,9 @@ Classifier: Topic :: Utilities
40
40
  Requires-Python: >=3.11
41
41
  Description-Content-Type: text/markdown
42
42
  License-File: LICENSE
43
- Requires-Dist: cleanmonkey
44
- Requires-Dist: typemonkey
45
- Requires-Dist: datemonkey
43
+ Requires-Dist: cleanmonkey>=0.2.0
44
+ Requires-Dist: typemonkey>=1.2.0
45
+ Requires-Dist: datemonkey>=0.2.0
46
46
  Provides-Extra: excel
47
47
  Requires-Dist: openpyxl>=3.0; extra == "excel"
48
48
  Provides-Extra: dsv
@@ -51,6 +51,8 @@ Provides-Extra: dev
51
51
  Requires-Dist: pytest>=7.0; extra == "dev"
52
52
  Requires-Dist: pytest-cov; extra == "dev"
53
53
  Requires-Dist: hypothesis>=6.0; extra == "dev"
54
+ Requires-Dist: ruff; extra == "dev"
55
+ Requires-Dist: mypy; extra == "dev"
54
56
  Dynamic: license-file
55
57
 
56
58
  # diffmonkey
@@ -144,6 +146,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
144
146
  deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
145
147
  handling) so behaviour that looks surprising is not mistaken for a bug.
146
148
 
149
+ ## Contributing & quality
150
+
151
+ diffmonkey is tested and reviewed against an explicit quality contract. See
152
+ [`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
153
+ competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
154
+ for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
155
+ for the release rubric (`python scripts/readiness.py`).
156
+
147
157
  ## License
148
158
 
149
159
  MIT — see [`LICENSE`](./LICENSE).
@@ -89,6 +89,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
89
89
  deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
90
90
  handling) so behaviour that looks surprising is not mistaken for a bug.
91
91
 
92
+ ## Contributing & quality
93
+
94
+ diffmonkey is tested and reviewed against an explicit quality contract. See
95
+ [`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
96
+ competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
97
+ for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
98
+ for the release rubric (`python scripts/readiness.py`).
99
+
92
100
  ## License
93
101
 
94
102
  MIT — see [`LICENSE`](./LICENSE).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "diffmonkey"
7
- version = "1.0.0"
7
+ version = "1.1.1"
8
8
  description = "Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -23,15 +23,15 @@ classifiers = [
23
23
  "Topic :: Utilities",
24
24
  ]
25
25
  dependencies = [
26
- "cleanmonkey",
27
- "typemonkey",
28
- "datemonkey",
26
+ "cleanmonkey>=0.2.0",
27
+ "typemonkey>=1.2.0",
28
+ "datemonkey>=0.2.0",
29
29
  ]
30
30
 
31
31
  [project.optional-dependencies]
32
32
  excel = ["openpyxl>=3.0"]
33
33
  dsv = ["dsvmonkey"]
34
- dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
34
+ dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0", "ruff", "mypy"]
35
35
 
36
36
  [project.scripts]
37
37
  diffmonkey = "diffmonkey.cli:main"
@@ -46,3 +46,17 @@ where = ["src"]
46
46
  [tool.pytest.ini_options]
47
47
  testpaths = ["tests"]
48
48
  pythonpath = ["src"]
49
+
50
+ [tool.ruff]
51
+ src = ["src", "tests"]
52
+
53
+ [tool.mypy]
54
+ files = ["src"]
55
+ python_version = "3.11"
56
+ warn_unused_ignores = true
57
+
58
+ # The rexbytes monkey libraries and openpyxl ship without type stubs; diffmonkey
59
+ # pins their runtime behaviour with tests, not types.
60
+ [[tool.mypy.overrides]]
61
+ module = ["cleanmonkey", "datemonkey", "typemonkey", "openpyxl", "openpyxl.*"]
62
+ ignore_missing_imports = true
@@ -31,7 +31,7 @@ from .models import (
31
31
  )
32
32
  from .readers import read_csv, read_excel, read_table
33
33
 
34
- __version__ = "1.0.0"
34
+ __version__ = "1.1.1"
35
35
 
36
36
  __all__ = [
37
37
  "compare",
@@ -23,10 +23,19 @@ EXIT_DIFF = 1
23
23
  EXIT_ERROR = 2
24
24
 
25
25
 
26
+ def _split_csv(value: str) -> list[str]:
27
+ return [part.strip() for part in value.split(",") if part.strip()]
28
+
29
+
26
30
  def _split_csv_opt(value: str | None) -> list[str] | None:
31
+ # An empty / all-blank option (e.g. --columns "" or a shell variable that
32
+ # expanded to nothing) means "unset", not "an explicit empty list". The
33
+ # latter would make _resolve_columns compare ZERO columns and exit 0,
34
+ # silently masking real differences from a CI gate.
27
35
  if value is None:
28
36
  return None
29
- return [part.strip() for part in value.split(",") if part.strip()]
37
+ parts = _split_csv(value)
38
+ return parts or None
30
39
 
31
40
 
32
41
  def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
@@ -83,7 +92,7 @@ def build_parser() -> argparse.ArgumentParser:
83
92
  )
84
93
  cmp.add_argument(
85
94
  "--include-unchanged", action="store_true",
86
- help="Include unchanged rows in json/markdown output.",
95
+ help="Retain unchanged rows (listed in json output; counted elsewhere).",
87
96
  )
88
97
  cmp.add_argument(
89
98
  "--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
@@ -97,7 +106,7 @@ def build_parser() -> argparse.ArgumentParser:
97
106
  return parser
98
107
 
99
108
 
100
- def _render(result, fmt: str, *, include_unchanged: bool) -> str:
109
+ def _render(result, fmt: str) -> str:
101
110
  if fmt == "summary":
102
111
  text = result.summary.one_line()
103
112
  if result.warnings:
@@ -131,7 +140,7 @@ def main(argv: Sequence[str] | None = None) -> int:
131
140
  result = compare(
132
141
  old_rows,
133
142
  new_rows,
134
- key=_split_csv_opt(args.key),
143
+ key=_split_csv(args.key),
135
144
  columns=_split_csv_opt(args.columns),
136
145
  ignore=_split_csv_opt(args.ignore),
137
146
  column_map=_parse_column_map(args.map),
@@ -148,7 +157,7 @@ def main(argv: Sequence[str] | None = None) -> int:
148
157
  print(f"diffmonkey: {exc}", file=sys.stderr)
149
158
  return EXIT_ERROR
150
159
 
151
- text = _render(result, args.format, include_unchanged=args.include_unchanged)
160
+ text = _render(result, args.format)
152
161
  if args.output:
153
162
  try:
154
163
  with open(args.output, "w", encoding="utf-8", newline="") as fh:
@@ -76,7 +76,8 @@ def _clean(value: Any, settings: CompareSettings) -> str:
76
76
  text = value if isinstance(value, str) else str(value)
77
77
  if settings.normalize_whitespace:
78
78
  # 'minimal' would skip whitespace collapsing; 'default' strips and
79
- # collapses runs and removes invisible characters — exactly the
79
+ # collapses runs, removes invisible characters, and folds typographic
80
+ # variants (smart quotes→straight, dashes→'-', '…'→'...') — exactly the
80
81
  # false-diff sources we want gone. strip=True is safe for cell values.
81
82
  return cleanmonkey.clean(text, profile="default")
82
83
  return text
@@ -95,15 +96,31 @@ class ColumnComparator:
95
96
  non-null and not already string-equal.
96
97
  """
97
98
 
98
- def __init__(self, column: str, kind: str, settings: CompareSettings) -> None:
99
+ def __init__(
100
+ self,
101
+ column: str,
102
+ kind: str,
103
+ settings: CompareSettings,
104
+ *,
105
+ old_locale: str = "us",
106
+ new_locale: str = "us",
107
+ ) -> None:
99
108
  self.column = column
100
109
  self.kind = kind # "numeric" | "date" | "boolean" | "string"
101
110
  self.settings = settings
111
+ # Number locale resolved per side (see _side_locale): when the caller
112
+ # left ``locale=None`` we honour typemonkey's auto-detection instead of
113
+ # forcing US, and each side keeps its own — so an ``old`` file in EU
114
+ # format and a ``new`` file in US format are each parsed correctly.
115
+ # (The date path is NOT per-side: it uses the single ``settings.locale``
116
+ # hint, so disambiguating DD/MM vs MM/DD across files still needs an
117
+ # explicit ``locale=`` — see LIMITATIONS.md and ``_as_date``.)
118
+ self.old_locale = old_locale
119
+ self.new_locale = new_locale
102
120
 
103
121
  # -- per-kind normalisation -------------------------------------------
104
122
 
105
- def _as_number(self, value: Any) -> float | _Unparseable:
106
- locale = self.settings.locale or "us"
123
+ def _as_number(self, value: Any, locale: str) -> float | _Unparseable:
107
124
  try:
108
125
  return float(typemonkey.parse_number(value, locale=locale).value)
109
126
  except (ValueError, TypeError):
@@ -121,6 +138,22 @@ class ColumnComparator:
121
138
  # whole point of a diff is that old and new differ), and per-value
122
139
  # parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
123
140
  # from the majority and failing the minority. See LIMITATIONS.md.
141
+ #
142
+ # A bare number like ``"2024"`` is NOT a date a user writes; datemonkey
143
+ # would read it as an Excel serial day-number (``1`` -> 1900-01-01), so a
144
+ # numeric column compared against a date column would silently match the
145
+ # wrong day. Refuse SHORT bare numbers so the pair falls back to string
146
+ # comparison instead of accepting a serial-date interpretation. This
147
+ # covers integer (``"45000"``) AND float-form (``"45000.0"``, or the
148
+ # native float openpyxl returns for a number-formatted cell) serials.
149
+ # The length gate is on the INTEGER part: Excel serials top out at 7
150
+ # digits (``2958465`` is year 9999), so ``< 8`` integer digits blocks
151
+ # every serial while letting the 8-digit compact ISO ``YYYYMMDD`` parse.
152
+ text = str(value).strip()
153
+ core = text.lstrip("+-")
154
+ int_part, _, frac = core.partition(".")
155
+ if int_part.isdigit() and len(int_part) < 8 and (frac == "" or frac.isdigit()):
156
+ return UNPARSEABLE
124
157
  try:
125
158
  batch = datemonkey.parse_dates(
126
159
  [value], locale_preference=self.settings.locale
@@ -152,9 +185,9 @@ class ColumnComparator:
152
185
  return True
153
186
 
154
187
  if self.kind == "numeric":
155
- a = self._as_number(old)
156
- b = self._as_number(new)
157
- if a is UNPARSEABLE or b is UNPARSEABLE:
188
+ a = self._as_number(old, self.old_locale)
189
+ b = self._as_number(new, self.new_locale)
190
+ if isinstance(a, _Unparseable) or isinstance(b, _Unparseable):
158
191
  return _strings_equal(old, new, s)
159
192
  return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
160
193
 
@@ -181,7 +214,9 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
181
214
 
182
215
  Returns ``"empty"`` when the side has no values to judge (so the other
183
216
  side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
184
- phone numbers) report ``"string"`` so they are never numeric-compared.
217
+ phone numbers) report ``"preserve"`` a *dominant* string kind that
218
+ :func:`_reconcile` will not let the other side out-vote, so they are never
219
+ numeric-compared even when the opposite side parses as a number.
185
220
  """
186
221
  values = list(values)
187
222
  if not values:
@@ -190,7 +225,7 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
190
225
  values, null_values=settings.null_values, locale=settings.locale
191
226
  )
192
227
  if profile.preserve_as_string:
193
- return "string"
228
+ return "preserve"
194
229
  t = profile.type
195
230
  if t in NUMERIC_TYPES:
196
231
  return "numeric"
@@ -211,13 +246,17 @@ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
211
246
  is exactly what we must not treat as a change. So we infer per side and
212
247
  pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
213
248
  fall back to string comparison for any individual pair that cannot be
214
- parsed under that kind. Precedence: **date** (if ``date_aware``) > numeric.
215
- Boolean requires *both* sides to look boolean, so an integer column versus
216
- a ``true``/``false`` column is not silently equated (``1`` == ``true``).
249
+ parsed under that kind. Precedence: **preserve** (leading-zero IDs etc.,
250
+ dominant never numeric-compared, even against a numeric side) > **date**
251
+ (if ``date_aware``) > numeric. Boolean requires *both* sides to look
252
+ boolean, so an integer column versus a ``true``/``false`` column is not
253
+ silently equated (``1`` == ``true``).
217
254
  """
218
255
  kinds = {old_kind, new_kind} - {"empty"}
219
256
  if not kinds:
220
257
  return "string"
258
+ if "preserve" in kinds:
259
+ return "string"
221
260
  if "date" in kinds and settings.date_aware:
222
261
  return "date"
223
262
  if "numeric" in kinds:
@@ -238,6 +277,27 @@ def _infer_kind(
238
277
  )
239
278
 
240
279
 
280
+ def _side_locale(values: Sequence[Any], settings: CompareSettings) -> str:
281
+ """Resolve the number-parsing locale for one side.
282
+
283
+ Honours an explicit ``settings.locale`` when given; otherwise auto-detects
284
+ it from the values via ``typemonkey.infer_type`` (whose ``profile.locale``
285
+ distinguishes ``"1.234,56"`` EU from ``"1,234.56"`` US), falling back to
286
+ ``"us"`` when there is nothing to detect. This is what makes the documented
287
+ ``locale=None`` "auto-detect" promise true for numbers — the comparator must
288
+ not silently force ``"us"`` and mis-compare EU-formatted data.
289
+ """
290
+ if settings.locale:
291
+ return settings.locale
292
+ values = list(values)
293
+ if not values:
294
+ return "us"
295
+ profile = typemonkey.infer_type(
296
+ values, null_values=settings.null_values, locale=None
297
+ )
298
+ return getattr(profile, "locale", None) or "us"
299
+
300
+
241
301
  def make_comparator(
242
302
  column: str,
243
303
  old_values: Sequence[Any],
@@ -248,12 +308,21 @@ def make_comparator(
248
308
 
249
309
  When ``settings.type_aware`` is false the comparator is a pure
250
310
  string/null comparator. Otherwise the column's kind is inferred from both
251
- sides combined; date columns then compare by parsed calendar date (each
252
- value parsed independently with the ``locale`` hint) so that an ``old``
253
- column in ``DD/MM/YYYY`` and a ``new`` column in ISO are not false diffs.
311
+ sides combined; date columns compare by parsed calendar date, each value
312
+ parsed independently with the ``locale`` hint. An ``old`` column in an
313
+ unambiguous format and a ``new`` column in another (e.g. ISO) are not false
314
+ diffs; an *ambiguous* mix such as ``DD/MM/YYYY`` vs ISO needs an explicit
315
+ ``locale=`` to disambiguate (see LIMITATIONS.md) — unlike numbers, the date
316
+ path does not auto-detect a per-side locale.
254
317
  """
255
318
  if not settings.type_aware:
256
319
  return ColumnComparator(column, "string", settings)
257
320
 
258
321
  kind = _infer_kind(old_values, new_values, settings)
259
- return ColumnComparator(column, kind, settings)
322
+ return ColumnComparator(
323
+ column,
324
+ kind,
325
+ settings,
326
+ old_locale=_side_locale(old_values, settings),
327
+ new_locale=_side_locale(new_values, settings),
328
+ )
@@ -12,7 +12,7 @@ from __future__ import annotations
12
12
  from typing import Any, Iterable, Mapping, Sequence
13
13
 
14
14
  from .comparators import CompareSettings, make_comparator
15
- from .matching import index_rows, make_key, validate_policies
15
+ from .matching import index_rows, validate_policies
16
16
  from .models import DiffResult, DiffSummary, FieldChange, RowDiff
17
17
 
18
18
  Row = Mapping[str, Any]
@@ -38,17 +38,27 @@ def _apply_column_map(
38
38
  sides share the new namespace. If both ``old_name`` and ``new_name`` are
39
39
  present in a row, the explicitly-new value wins (the rename target is not
40
40
  clobbered).
41
+
42
+ Renames are resolved against each row's *original* column names in a single
43
+ pass, so they never chain: ``{"a": "b", "b": "c"}`` renames the original
44
+ ``a``→``b`` and the original ``b``→``c`` independently — it does not rename a
45
+ freshly-produced ``b`` on to ``c``.
41
46
  """
42
47
  if not column_map:
43
48
  return [dict(r) for r in rows]
44
49
  out: list[dict[str, Any]] = []
45
50
  for r in rows:
46
51
  rd = dict(r)
47
- for old_name, new_name in column_map.items():
48
- if old_name in rd:
49
- value = rd.pop(old_name)
50
- rd.setdefault(new_name, value)
51
- out.append(rd)
52
+ # Columns whose names are taken literally (not rename sources); an
53
+ # explicit value already under a rename target lives here and must win.
54
+ literal_targets = {c for c in rd if c not in column_map}
55
+ renamed: dict[str, Any] = {}
56
+ for col, value in rd.items():
57
+ target = column_map.get(col, col)
58
+ if col in column_map and target in literal_targets:
59
+ continue # explicit-new value present elsewhere wins
60
+ renamed.setdefault(target, value)
61
+ out.append(renamed)
52
62
  return out
53
63
 
54
64
 
@@ -108,18 +118,23 @@ def compare(
108
118
  key: Identity column name, or a sequence of names for a composite key.
109
119
  Names refer to the *new* namespace (i.e. after ``column_map``).
110
120
  columns: Restrict comparison to these columns. Default: every non-key
111
- column seen on either side.
121
+ column seen on either side. Like ``key``, names are in the *new*
122
+ namespace (i.e. after ``column_map``) — pass the renamed name, not
123
+ the original, or the column will match nothing and its change be
124
+ missed.
112
125
  ignore: Columns to exclude from comparison (timestamps, audit fields,
113
- row numbers). Applied after ``columns``.
126
+ row numbers), also in the *new* namespace. Applied after ``columns``.
114
127
  column_map: ``{old_name: new_name}`` renames applied to ``old`` rows so
115
128
  renamed columns are not reported as removed+added.
116
129
  rel_tol: Relative floating-point tolerance for numeric columns
117
130
  (``math.isclose``). Default ``1e-9``.
118
131
  abs_tol: Absolute floating-point tolerance for numeric columns. Default
119
132
  ``0.0`` — set this to compare values near zero.
120
- normalize_whitespace: Collapse/strip whitespace and drop invisible
121
- characters before comparing strings (via ``cleanmonkey``). Default
122
- True.
133
+ normalize_whitespace: Collapse/strip whitespace, drop invisible
134
+ characters, and fold typographic variants (smart quotes→straight,
135
+ dashes/minus→``-``, ``…``→``...``) before comparing strings (via
136
+ ``cleanmonkey``'s ``default`` profile), so cosmetic punctuation
137
+ differences are not reported as changes. Default True.
123
138
  null_equivalent: Treat all null spellings (``None``, ``""``,
124
139
  whitespace-only, ``"na"``, …) as one value, so two different nulls
125
140
  are equal and null↔value is a change. Default True.
@@ -10,8 +10,8 @@ analyst can pivot/filter the diff. The schema is fixed:
10
10
  * ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
11
11
  composite keys).
12
12
  * For ``added``/``removed`` rows there is no per-field breakdown, so one row is
13
- emitted with empty ``column``/``old``/``new`` (added) or ``column``/``new``
14
- empty (removed) — the presence of the row is the change.
13
+ emitted with ``column``/``old``/``new`` all empty — the row's presence under
14
+ ``change_type`` (and its key) is the change.
15
15
  """
16
16
 
17
17
  from __future__ import annotations
@@ -26,8 +26,13 @@ HEADER = ["change_type", "key", "column", "old", "new"]
26
26
 
27
27
 
28
28
  def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
29
+ # Render a None key component as the (null) sentinel, like the markdown and
30
+ # HTML reports. A None key arises from on_missing_key="warn" (the default);
31
+ # leaving it "" would make it indistinguishable from a genuine empty-string
32
+ # key in the CSV.
29
33
  return "; ".join(
30
- f"{col}={'' if val is None else val}" for col, val in zip(key_columns, rd.key)
34
+ f"{col}={'(null)' if val is None else val}"
35
+ for col, val in zip(key_columns, rd.key)
31
36
  )
32
37
 
33
38
 
@@ -35,7 +35,7 @@ def _cell(value: Any) -> str:
35
35
  def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
36
36
  return escape(
37
37
  ", ".join(
38
- f"{col}={'' if val is None else val}"
38
+ f"{col}={'(null)' if val is None else val}"
39
39
  for col, val in zip(key_columns, rd.key)
40
40
  )
41
41
  )
@@ -8,6 +8,7 @@ downstream scripts can rely on its shape.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import re
11
12
  from typing import Any
12
13
 
13
14
  from ..models import DiffResult, RowDiff
@@ -20,14 +21,35 @@ def _fmt(value: Any) -> str:
20
21
  return str(value)
21
22
 
22
23
 
24
+ def _code(value: Any) -> str:
25
+ """Wrap a value in a backtick code span that survives embedded backticks.
26
+
27
+ A naive ```{value}``` breaks when the value itself contains a
28
+ backtick (the span closes early and the rest renders as garbled text). Per
29
+ CommonMark, a code span may be fenced by any number of backticks, so we pick
30
+ a fence one longer than the longest run inside the value and pad with spaces
31
+ when needed — lossless, no character substitution.
32
+ """
33
+ text = _fmt(value)
34
+ if "`" not in text:
35
+ return f"`{text}`"
36
+ longest = max(len(run) for run in re.findall(r"`+", text))
37
+ fence = "`" * (longest + 1)
38
+ return f"{fence} {text} {fence}"
39
+
40
+
23
41
  def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
24
42
  parts = [f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)]
25
43
  return ", ".join(parts)
26
44
 
27
45
 
28
46
  def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
29
- """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``."""
30
- if max_rows is None or len(rows) <= max_rows:
47
+ """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
48
+
49
+ A negative ``max_rows`` means "no limit" (same as ``None``); it never
50
+ produces a phantom "N more" notice from slicing with a negative bound.
51
+ """
52
+ if max_rows is None or max_rows < 0 or len(rows) <= max_rows:
31
53
  return rows, 0
32
54
  return rows[:max_rows], len(rows) - max_rows
33
55
 
@@ -58,7 +80,7 @@ def render(result: DiffResult, *, max_rows: int | None = None) -> str:
58
80
  for rd in shown:
59
81
  lines.append(f"### {_key_label(rd, result.key_columns)}")
60
82
  for ch in rd.changes:
61
- lines.append(f"- **{ch.column}**: `{_fmt(ch.old)}``{_fmt(ch.new)}`")
83
+ lines.append(f"- **{ch.column}**: {_code(ch.old)} → {_code(ch.new)}")
62
84
  lines.append("")
63
85
  if hidden:
64
86
  lines.append(f"_… and {hidden} more changed row(s)._")