diffmonkey 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffmonkey/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """diffmonkey — type-aware, key-based structural diffing of tabular datasets.
2
+
3
+ Public API::
4
+
5
+ from diffmonkey import compare
6
+ result = compare(old_rows, new_rows, key="id")
7
+ print(result.summary.one_line())
8
+ print(result.to_markdown())
9
+
10
+ ``compare`` matches rows by a key column (or composite key), compares the
11
+ remaining columns with type awareness (numbers by value, dates by calendar
12
+ date, booleans by truth, strings whitespace-normalised, nulls unified), and
13
+ returns a :class:`DiffResult` bucketed into added / removed / changed /
14
+ unchanged with summary statistics and multiple report formats.
15
+
16
+ See ``LIMITATIONS.md`` for deliberate design tradeoffs and ``SKILL.md`` for
17
+ LLM-oriented usage guidance.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from .compare import compare
23
+ from .models import (
24
+ DiffMonkeyError,
25
+ DiffResult,
26
+ DiffSummary,
27
+ DuplicateKeyError,
28
+ FieldChange,
29
+ MissingKeyError,
30
+ RowDiff,
31
+ )
32
+ from .readers import read_csv, read_excel, read_table
33
+
34
# Version string of the package (also reported by the CLI's --version flag).
__version__ = "1.0.0"

# Explicit public API of the package.
__all__ = [
    # comparison entry point
    "compare",
    # result models
    "DiffResult",
    "DiffSummary",
    "RowDiff",
    "FieldChange",
    # exception hierarchy
    "DiffMonkeyError",
    "DuplicateKeyError",
    "MissingKeyError",
    # input readers
    "read_table",
    "read_csv",
    "read_excel",
    # metadata
    "__version__",
]
diffmonkey/cli.py ADDED
@@ -0,0 +1,168 @@
1
+ """Command-line interface — a thin wrapper around :func:`diffmonkey.compare`.
2
+
3
+ This module exists only to parse arguments, read input files
4
+ (:mod:`diffmonkey.readers`), call the library, render the chosen format and
5
+ return a process exit code. It contains no comparison logic. Exit codes:
6
+ ``0`` = no differences, ``1`` = differences found, ``2`` = usage/IO error.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import sys
14
+ from typing import Sequence
15
+
16
+ from . import __version__
17
+ from .compare import compare
18
+ from .models import DiffMonkeyError
19
+ from .readers import read_table
20
+
21
# Process exit codes returned by ``main``:
#   EXIT_NO_DIFF - no differences, EXIT_DIFF - differences found,
#   EXIT_ERROR   - usage or I/O error.
EXIT_NO_DIFF, EXIT_DIFF, EXIT_ERROR = 0, 1, 2
24
+
25
+
26
+ def _split_csv_opt(value: str | None) -> list[str] | None:
27
+ if value is None:
28
+ return None
29
+ return [part.strip() for part in value.split(",") if part.strip()]
30
+
31
+
32
+ def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
33
+ if not pairs:
34
+ return None
35
+ mapping: dict[str, str] = {}
36
+ for pair in pairs:
37
+ if "=" not in pair:
38
+ # ValueError (not argparse.ArgumentTypeError): this runs inside
39
+ # main(), not as an argparse type= callable, so it must be caught
40
+ # by main()'s handler and turned into exit code 2.
41
+ raise ValueError(f"--map expects OLD=NEW, got {pair!r}")
42
+ old, new = pair.split("=", 1)
43
+ mapping[old.strip()] = new.strip()
44
+ return mapping
45
+
46
+
47
def build_parser() -> argparse.ArgumentParser:
    """Create the ``diffmonkey`` argument parser.

    The parser has a ``--version`` flag and one required sub-command,
    ``compare``, which takes the two input files, the key column(s) and the
    options that are forwarded to the library by ``main``.
    """
    root = argparse.ArgumentParser(
        prog="diffmonkey",
        description="Type-aware, key-based structural diff of two tabular files.",
    )
    root.add_argument(
        "--version", action="version", version=f"diffmonkey {__version__}"
    )
    commands = root.add_subparsers(dest="command", required=True)

    compare_cmd = commands.add_parser(
        "compare", help="Compare two files by key column(s)."
    )
    add = compare_cmd.add_argument

    # Positional inputs and the identity column(s).
    add("old", help="Baseline file (CSV/TSV/Excel).")
    add("new", help="Current file to compare against the baseline.")
    add(
        "-k",
        "--key",
        required=True,
        help="Identity column, or comma-separated columns for a composite key.",
    )

    # Column selection and renaming.
    add("--columns", help="Comma-separated columns to compare (default: all).")
    add("--ignore", help="Comma-separated columns to exclude.")
    add(
        "--map",
        action="append",
        metavar="OLD=NEW",
        help="Rename an old column to a new name (repeatable).",
    )

    # Report format and destination.
    add(
        "--format",
        choices=["summary", "markdown", "html", "csv", "json"],
        default="summary",
        help="Output format (default: summary).",
    )
    add("-o", "--output", help="Write report to this file instead of stdout.")

    # Equivalence settings.
    add("--rel-tol", type=float, default=1e-9, help="Numeric relative tolerance.")
    add("--abs-tol", type=float, default=0.0, help="Numeric absolute tolerance.")
    add("--locale", choices=["us", "eu"], help="Number/date locale hint.")
    add(
        "--no-type-aware",
        action="store_true",
        help="Compare every column as a normalised string.",
    )
    add(
        "--no-null-equivalent",
        action="store_true",
        help="Do not treat different null spellings as equal.",
    )
    add(
        "--include-unchanged",
        action="store_true",
        help="Include unchanged rows in json/markdown output.",
    )

    # Policies for problematic keys.
    add(
        "--on-duplicate",
        choices=["warn", "first", "last", "error"],
        default="warn",
        help="Duplicate-key policy (default: warn).",
    )
    add(
        "--on-missing-key",
        choices=["warn", "skip", "error"],
        default="warn",
        help="Missing-key policy (default: warn).",
    )

    # Reader option.
    add("--delimiter", help="Force a delimiter for DSV inputs.")
    return root
98
+
99
+
100
+ def _render(result, fmt: str, *, include_unchanged: bool) -> str:
101
+ if fmt == "summary":
102
+ text = result.summary.one_line()
103
+ if result.warnings:
104
+ text += "\n" + "\n".join(f"warning: {w}" for w in result.warnings)
105
+ return text + "\n"
106
+ if fmt == "markdown":
107
+ return result.to_markdown()
108
+ if fmt == "html":
109
+ return result.to_html()
110
+ if fmt == "csv":
111
+ return result.to_csv()
112
+ if fmt == "json":
113
+ return json.dumps(result.to_dict(), indent=2, default=str) + "\n"
114
+ raise ValueError(f"unknown format {fmt!r}") # pragma: no cover
115
+
116
+
117
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface and return a process exit code.

    Parses ``argv``, reads both input tables, runs ``compare`` with the
    options taken from the command line, renders the report in the chosen
    format and writes it to ``--output`` or stdout.

    Returns:
        ``EXIT_NO_DIFF`` when the result has no changes, ``EXIT_DIFF`` when it
        does, and ``EXIT_ERROR`` when reading input, comparing, or writing
        output fails (the message is printed to stderr).
    """
    args = build_parser().parse_args(argv)

    if args.command != "compare":
        return EXIT_ERROR  # pragma: no cover - argparse enforces a command

    # Only forward a delimiter when one was actually given.
    reader_options = {}
    if args.delimiter:
        reader_options["delimiter"] = args.delimiter

    try:
        old_rows = read_table(args.old, **reader_options)
        new_rows = read_table(args.new, **reader_options)
    except (OSError, RuntimeError, ValueError, TypeError) as exc:
        print(f"diffmonkey: error reading input: {exc}", file=sys.stderr)
        return EXIT_ERROR

    try:
        # Option parsing stays inside the try: a malformed --map raises
        # ValueError, which must become exit code 2 like any other failure.
        options = {
            "key": _split_csv_opt(args.key),
            "columns": _split_csv_opt(args.columns),
            "ignore": _split_csv_opt(args.ignore),
            "column_map": _parse_column_map(args.map),
            "rel_tol": args.rel_tol,
            "abs_tol": args.abs_tol,
            "locale": args.locale,
            "type_aware": not args.no_type_aware,
            "null_equivalent": not args.no_null_equivalent,
            "include_unchanged": args.include_unchanged,
            "on_duplicate": args.on_duplicate,
            "on_missing_key": args.on_missing_key,
        }
        result = compare(old_rows, new_rows, **options)
    except (DiffMonkeyError, ValueError, TypeError) as exc:
        print(f"diffmonkey: {exc}", file=sys.stderr)
        return EXIT_ERROR

    report = _render(result, args.format, include_unchanged=args.include_unchanged)
    if not args.output:
        sys.stdout.write(report)
    else:
        try:
            # newline="" writes the report's line endings exactly as rendered.
            with open(args.output, "w", encoding="utf-8", newline="") as fh:
                fh.write(report)
        except OSError as exc:
            print(f"diffmonkey: error writing output: {exc}", file=sys.stderr)
            return EXIT_ERROR

    if result.has_changes():
        return EXIT_DIFF
    return EXIT_NO_DIFF


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
@@ -0,0 +1,259 @@
1
+ """Type-aware value equivalence for column comparison.
2
+
3
+ This module exists to answer one question well: *are these two cell values the
4
+ same?* — where "same" should ignore differences that are not real data changes.
5
+ ``"1,234"`` and ``"1234"`` are the same number; ``"01/02/2025"`` and
6
+ ``"2025-01-02"`` may be the same date; ``" foo "`` and ``"foo"`` are the same
7
+ string; ``""`` and ``None`` are both missing. We lean on the ecosystem packages
8
+ that already solve each sub-problem (``typemonkey`` for numbers/booleans/null
9
+ vocabulary and type inference, ``datemonkey`` for date parsing, ``cleanmonkey``
10
+ for whitespace/invisible-character normalisation) rather than re-deriving them.
11
+
12
+ The strategy is *column-level*: ``infer_type`` decides a column's kind once
13
+ per side (old and new are inferred independently, then reconciled), and a :class:`ColumnComparator` compares every
14
+ pair in that column under that kind, falling back to normalised-string equality
15
+ for any individual pair the kind cannot parse. This avoids the trap of trying
16
+ every strategy per pair, which would call ``"1"`` and ``"true"`` equal.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import math
22
+ from dataclasses import dataclass
23
+ from typing import Any, Sequence
24
+
25
+ import cleanmonkey
26
+ import datemonkey
27
+ import typemonkey
28
+ from typemonkey import TypeName
29
+
30
# typemonkey type names that map to the "numeric" comparison kind.
NUMERIC_TYPES = frozenset(
    (
        TypeName.INTEGER,
        TypeName.FLOAT,
        TypeName.CURRENCY,
        TypeName.PERCENTAGE,
    )
)
33
+
34
+
35
+ class _Unparseable:
36
+ """Sentinel: a value could not be normalised to a column's kind."""
37
+
38
+ __slots__ = ()
39
+
40
+ def __repr__(self) -> str: # pragma: no cover - debug aid
41
+ return "<unparseable>"
42
+
43
+
44
+ UNPARSEABLE = _Unparseable()
45
+
46
+
47
@dataclass
class CompareSettings:
    """Equivalence options shared by every column comparator in one compare run."""

    # Relative / absolute tolerances handed to math.isclose for numeric columns.
    rel_tol: float = 1e-9
    abs_tol: float = 0.0
    # When true, strings are passed through cleanmonkey.clean before comparing.
    normalize_whitespace: bool = True
    # When true, two null values are equal and null vs non-null is a change.
    null_equivalent: bool = True
    # When false, every column is compared as a string.
    type_aware: bool = True
    # When false, a date-looking side does not select the "date" kind.
    date_aware: bool = True
    # Locale hint forwarded to number parsing, date parsing and type inference.
    locale: str | None = None
    # Optional null vocabulary forwarded to typemonkey (None = its default).
    null_values: frozenset[str] | None = None
59
+
60
+
61
def is_null(value: Any, settings: CompareSettings) -> bool:
    """Return whether ``value`` counts as a missing value.

    ``None`` is null unconditionally. Any other value is judged by
    ``typemonkey.is_null`` using ``settings.null_values`` as the vocabulary.
    When that is ``None``, typemonkey's default vocabulary applies, which the
    original note here describes as covering ``""``, whitespace-only,
    ``"na"``, ``"null"``, ``"none"`` and similar.
    """
    # Short-circuit keeps typemonkey (and `settings`) out of the None case.
    return value is None or typemonkey.is_null(value, settings.null_values)
72
+
73
+
74
+ def _clean(value: Any, settings: CompareSettings) -> str:
75
+ """Normalise a value to its comparable string form."""
76
+ text = value if isinstance(value, str) else str(value)
77
+ if settings.normalize_whitespace:
78
+ # 'minimal' would skip whitespace collapsing; 'default' strips and
79
+ # collapses runs and removes invisible characters — exactly the
80
+ # false-diff sources we want gone. strip=True is safe for cell values.
81
+ return cleanmonkey.clean(text, profile="default")
82
+ return text
83
+
84
+
85
def _strings_equal(old: Any, new: Any, settings: CompareSettings) -> bool:
    """Return whether ``old`` and ``new`` have the same cleaned string form."""
    old_text = _clean(old, settings)
    new_text = _clean(new, settings)
    return old_text == new_text
87
+
88
+
89
class ColumnComparator:
    """Decides equality for the values of one column under one kind.

    Instances are normally built by :func:`make_comparator`. ``equal`` first
    applies the null rules and the cleaned-string fast path. The kind-specific
    parse is attempted only for pairs that are still unequal as strings, and a
    pair that cannot be parsed under the kind is compared as strings.
    """

    def __init__(self, column: str, kind: str, settings: CompareSettings) -> None:
        self.column = column
        self.kind = kind  # "numeric" | "date" | "boolean" | "string"
        self.settings = settings

    # -- per-kind normalisation -------------------------------------------

    def _as_number(self, value: Any) -> float | _Unparseable:
        """Parse ``value`` as a float via typemonkey, or return ``UNPARSEABLE``."""
        locale = self.settings.locale or "us"
        try:
            parsed = typemonkey.parse_number(value, locale=locale)
            return float(parsed.value)
        except (ValueError, TypeError):
            return UNPARSEABLE

    def _as_bool(self, value: Any) -> bool | _Unparseable:
        """Parse ``value`` as a boolean via typemonkey, or return ``UNPARSEABLE``."""
        try:
            parsed = typemonkey.parse_boolean(value)
            return parsed.value
        except (ValueError, TypeError):
            return UNPARSEABLE

    def _as_date(self, value: Any):
        """Parse ``value`` as a date via datemonkey, or return ``UNPARSEABLE``."""
        # Each value is parsed on its own, with the locale as a hint, instead
        # of fixing one format for the whole column: old and new may use
        # different formats, and the locale settles DD/MM versus MM/DD rather
        # than a majority vote that would fail the minority.
        # See LIMITATIONS.md.
        try:
            batch = datemonkey.parse_dates(
                [value], locale_preference=self.settings.locale
            )
        except Exception:  # datemonkey raises various errors on odd input
            return UNPARSEABLE
        if not batch.dates:
            return UNPARSEABLE
        first = batch.dates[0]
        return UNPARSEABLE if first is None else first

    # -- the public test ---------------------------------------------------

    def equal(self, old: Any, new: Any) -> bool:
        """Return whether ``old`` and ``new`` are equivalent for this column."""
        settings = self.settings
        old_missing = is_null(old, settings)
        new_missing = is_null(new, settings)

        if old_missing and new_missing:
            # With null equivalence every null spelling matches every other;
            # without it, nulls are ordinary strings.
            if settings.null_equivalent:
                return True
            return _strings_equal(old, new, settings)
        if settings.null_equivalent and old_missing != new_missing:
            return False

        # Fast path: identical text (after optional whitespace normalisation)
        # is equal under every kind, and is cheap.
        if _strings_equal(old, new, settings):
            return True

        if self.kind == "numeric":
            parse = self._as_number
        elif self.kind == "boolean":
            parse = self._as_bool
        elif self.kind == "date":
            parse = self._as_date
        else:
            # string / fallback
            return _strings_equal(old, new, settings)

        left = parse(old)
        right = parse(new)
        if left is UNPARSEABLE or right is UNPARSEABLE:
            return _strings_equal(old, new, settings)
        if self.kind == "numeric":
            return math.isclose(
                left, right, rel_tol=settings.rel_tol, abs_tol=settings.abs_tol
            )
        return left == right
177
+
178
+
179
+ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
180
+ """Infer one side's kind via ``typemonkey.infer_type``.
181
+
182
+ Returns ``"empty"`` when the side has no values to judge (so the other
183
+ side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
184
+ phone numbers) report ``"string"`` so they are never numeric-compared.
185
+ """
186
+ values = list(values)
187
+ if not values:
188
+ return "empty"
189
+ profile = typemonkey.infer_type(
190
+ values, null_values=settings.null_values, locale=settings.locale
191
+ )
192
+ if profile.preserve_as_string:
193
+ return "string"
194
+ t = profile.type
195
+ if t in NUMERIC_TYPES:
196
+ return "numeric"
197
+ if t == TypeName.BOOLEAN:
198
+ return "boolean"
199
+ if t == TypeName.DATE:
200
+ return "date"
201
+ if t == TypeName.NULL:
202
+ return "empty"
203
+ return "string"
204
+
205
+
206
+ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
207
+ """Combine the two sides' inferred kinds into one comparison kind.
208
+
209
+ Each *file* is usually internally consistent in format, but the two files
210
+ may differ (US dates vs ISO, ``"1,234"`` vs ``"1234"``) — that difference
211
+ is exactly what we must not treat as a change. So we infer per side and
212
+ pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
213
+ fall back to string comparison for any individual pair that cannot be
214
+ parsed under that kind. Precedence: **date** (if ``date_aware``) > numeric.
215
+ Boolean requires *both* sides to look boolean, so an integer column versus
216
+ a ``true``/``false`` column is not silently equated (``1`` == ``true``).
217
+ """
218
+ kinds = {old_kind, new_kind} - {"empty"}
219
+ if not kinds:
220
+ return "string"
221
+ if "date" in kinds and settings.date_aware:
222
+ return "date"
223
+ if "numeric" in kinds:
224
+ return "numeric"
225
+ if kinds == {"boolean"}:
226
+ return "boolean"
227
+ return "string"
228
+
229
+
230
def _infer_kind(
    old_values: Sequence[Any], new_values: Sequence[Any], settings: CompareSettings
) -> str:
    """Return a column's comparison kind, inferring each side on its own first."""
    old_kind = _side_kind(old_values, settings)
    new_kind = _side_kind(new_values, settings)
    return _reconcile(old_kind, new_kind, settings)
239
+
240
+
241
def make_comparator(
    column: str,
    old_values: Sequence[Any],
    new_values: Sequence[Any],
    settings: CompareSettings,
) -> ColumnComparator:
    """Create the :class:`ColumnComparator` for ``column``.

    With ``settings.type_aware`` off, the comparator is a plain string/null
    comparator and no inference runs. Otherwise the kind is inferred for each
    side and the two results are reconciled. A date column then compares by
    parsed date, each value parsed on its own with the ``locale`` hint, so an
    ``old`` column in ``DD/MM/YYYY`` and a ``new`` column in ISO do not show
    up as false diffs.
    """
    kind = (
        _infer_kind(old_values, new_values, settings)
        if settings.type_aware
        else "string"
    )
    return ColumnComparator(column, kind, settings)