PyPI - diffmonkey - Versions diffs - 1.0.0__py3-none-any.whl - Mend

diffmonkey 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

diffmonkey/__init__.py +49 -0
diffmonkey/cli.py +168 -0
diffmonkey/comparators.py +259 -0
diffmonkey/compare.py +253 -0
diffmonkey/formatters/__init__.py +13 -0
diffmonkey/formatters/csv_out.py +55 -0
diffmonkey/formatters/html.py +93 -0
diffmonkey/formatters/markdown.py +94 -0
diffmonkey/matching.py +141 -0
diffmonkey/models.py +185 -0
diffmonkey/readers.py +117 -0
diffmonkey-1.0.0.dist-info/METADATA +153 -0
diffmonkey-1.0.0.dist-info/RECORD +17 -0
diffmonkey-1.0.0.dist-info/WHEEL +5 -0
diffmonkey-1.0.0.dist-info/entry_points.txt +2 -0
diffmonkey-1.0.0.dist-info/licenses/LICENSE +21 -0
diffmonkey-1.0.0.dist-info/top_level.txt +1 -0

diffmonkey/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""diffmonkey — type-aware, key-based structural diffing of tabular datasets.
+Public API::
+    from diffmonkey import compare
+    result = compare(old_rows, new_rows, key="id")
+    print(result.summary.one_line())
+    print(result.to_markdown())
+``compare`` matches rows by a key column (or composite key), compares the
+remaining columns with type awareness (numbers by value, dates by calendar
+date, booleans by truth, strings whitespace-normalised, nulls unified), and
+returns a :class:`DiffResult` bucketed into added / removed / changed /
+unchanged with summary statistics and multiple report formats.
+See ``LIMITATIONS.md`` for deliberate design tradeoffs and ``SKILL.md`` for
+LLM-oriented usage guidance.
+"""
+from __future__ import annotations
+from .compare import compare
+from .models import (
+    DiffMonkeyError,
+    DiffResult,
+    DiffSummary,
+    DuplicateKeyError,
+    FieldChange,
+    MissingKeyError,
+    RowDiff,
+)
+from .readers import read_csv, read_excel, read_table
+# Package version string; the CLI imports it for its ``--version`` output.
+__version__ = "1.0.0"
+# Public API: every name imported above, plus ``__version__``.
+__all__ = [
+    "compare",
+    "DiffResult",
+    "DiffSummary",
+    "RowDiff",
+    "FieldChange",
+    "DiffMonkeyError",
+    "DuplicateKeyError",
+    "MissingKeyError",
+    "read_table",
+    "read_csv",
+    "read_excel",
+    "__version__",
+]

diffmonkey/cli.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""Command-line interface — a thin wrapper around :func:`diffmonkey.compare`.
+This module exists only to parse arguments, read input files
+(:mod:`diffmonkey.readers`), call the library, render the chosen format and
+return a process exit code. It contains no comparison logic. Exit codes:
+``0`` = no differences, ``1`` = differences found, ``2`` = usage/IO error.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from typing import Sequence
+from . import __version__
+from .compare import compare
+from .models import DiffMonkeyError
+from .readers import read_table
+# Process exit codes returned by main().
+EXIT_NO_DIFF = 0  # comparison ran and found no differences
+EXIT_DIFF = 1  # comparison ran and found differences
+EXIT_ERROR = 2  # usage or I/O error
+def _split_csv_opt(value: str | None) -> list[str] | None:
+    if value is None:
+        return None
+    return [part.strip() for part in value.split(",") if part.strip()]
+def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
+    if not pairs:
+        return None
+    mapping: dict[str, str] = {}
+    for pair in pairs:
+        if "=" not in pair:
+            # ValueError (not argparse.ArgumentTypeError): this runs inside
+            # main(), not as an argparse type= callable, so it must be caught
+            # by main()'s handler and turned into exit code 2.
+            raise ValueError(f"--map expects OLD=NEW, got {pair!r}")
+        old, new = pair.split("=", 1)
+        mapping[old.strip()] = new.strip()
+    return mapping
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="diffmonkey",
+        description="Type-aware, key-based structural diff of two tabular files.",
+    )
+    parser.add_argument("--version", action="version", version=f"diffmonkey {__version__}")
+    sub = parser.add_subparsers(dest="command", required=True)
+    cmp = sub.add_parser("compare", help="Compare two files by key column(s).")
+    cmp.add_argument("old", help="Baseline file (CSV/TSV/Excel).")
+    cmp.add_argument("new", help="Current file to compare against the baseline.")
+    cmp.add_argument(
+        "-k", "--key", required=True,
+        help="Identity column, or comma-separated columns for a composite key.",
+    )
+    cmp.add_argument("--columns", help="Comma-separated columns to compare (default: all).")
+    cmp.add_argument("--ignore", help="Comma-separated columns to exclude.")
+    cmp.add_argument(
+        "--map", action="append", metavar="OLD=NEW",
+        help="Rename an old column to a new name (repeatable).",
+    )
+    cmp.add_argument(
+        "--format", choices=["summary", "markdown", "html", "csv", "json"],
+        default="summary", help="Output format (default: summary).",
+    )
+    cmp.add_argument("-o", "--output", help="Write report to this file instead of stdout.")
+    cmp.add_argument("--rel-tol", type=float, default=1e-9, help="Numeric relative tolerance.")
+    cmp.add_argument("--abs-tol", type=float, default=0.0, help="Numeric absolute tolerance.")
+    cmp.add_argument("--locale", choices=["us", "eu"], help="Number/date locale hint.")
+    cmp.add_argument(
+        "--no-type-aware", action="store_true",
+        help="Compare every column as a normalised string.",
+    )
+    cmp.add_argument(
+        "--no-null-equivalent", action="store_true",
+        help="Do not treat different null spellings as equal.",
+    )
+    cmp.add_argument(
+        "--include-unchanged", action="store_true",
+        help="Include unchanged rows in json/markdown output.",
+    )
+    cmp.add_argument(
+        "--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
+        help="Duplicate-key policy (default: warn).",
+    )
+    cmp.add_argument(
+        "--on-missing-key", choices=["warn", "skip", "error"], default="warn",
+        help="Missing-key policy (default: warn).",
+    )
+    cmp.add_argument("--delimiter", help="Force a delimiter for DSV inputs.")
+    return parser
+def _render(result, fmt: str, *, include_unchanged: bool) -> str:
+    if fmt == "summary":
+        text = result.summary.one_line()
+        if result.warnings:
+            text += "\n" + "\n".join(f"warning: {w}" for w in result.warnings)
+        return text + "\n"
+    if fmt == "markdown":
+        return result.to_markdown()
+    if fmt == "html":
+        return result.to_html()
+    if fmt == "csv":
+        return result.to_csv()
+    if fmt == "json":
+        return json.dumps(result.to_dict(), indent=2, default=str) + "\n"
+    raise ValueError(f"unknown format {fmt!r}")  # pragma: no cover
+def main(argv: Sequence[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.command == "compare":
+        try:
+            read_kwargs = {"delimiter": args.delimiter} if args.delimiter else {}
+            old_rows = read_table(args.old, **read_kwargs)
+            new_rows = read_table(args.new, **read_kwargs)
+        except (OSError, RuntimeError, ValueError, TypeError) as exc:
+            print(f"diffmonkey: error reading input: {exc}", file=sys.stderr)
+            return EXIT_ERROR
+        try:
+            result = compare(
+                old_rows,
+                new_rows,
+                key=_split_csv_opt(args.key),
+                columns=_split_csv_opt(args.columns),
+                ignore=_split_csv_opt(args.ignore),
+                column_map=_parse_column_map(args.map),
+                rel_tol=args.rel_tol,
+                abs_tol=args.abs_tol,
+                locale=args.locale,
+                type_aware=not args.no_type_aware,
+                null_equivalent=not args.no_null_equivalent,
+                include_unchanged=args.include_unchanged,
+                on_duplicate=args.on_duplicate,
+                on_missing_key=args.on_missing_key,
+            )
+        except (DiffMonkeyError, ValueError, TypeError) as exc:
+            print(f"diffmonkey: {exc}", file=sys.stderr)
+            return EXIT_ERROR
+        text = _render(result, args.format, include_unchanged=args.include_unchanged)
+        if args.output:
+            try:
+                with open(args.output, "w", encoding="utf-8", newline="") as fh:
+                    fh.write(text)
+            except OSError as exc:
+                print(f"diffmonkey: error writing output: {exc}", file=sys.stderr)
+                return EXIT_ERROR
+        else:
+            sys.stdout.write(text)
+        return EXIT_DIFF if result.has_changes() else EXIT_NO_DIFF
+    return EXIT_ERROR  # pragma: no cover - argparse enforces a command
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())

diffmonkey/comparators.py ADDED Viewed

@@ -0,0 +1,259 @@
+"""Type-aware value equivalence for column comparison.
+This module exists to answer one question well: *are these two cell values the
+same?* — where "same" should ignore differences that are not real data changes.
+``"1,234"`` and ``"1234"`` are the same number; ``"01/02/2025"`` and
+``"2025-01-02"`` may be the same date; ``" foo "`` and ``"foo"`` are the same
+string; ``""`` and ``None`` are both missing. We lean on the ecosystem packages
+that already solve each sub-problem (``typemonkey`` for numbers/booleans/null
+vocabulary and type inference, ``datemonkey`` for date parsing, ``cleanmonkey``
+for whitespace/invisible-character normalisation) rather than re-deriving them.
+The strategy is *column-level*: ``typemonkey.infer_type`` is run once per side
+of a column and the two results are reconciled into a single kind, then a
+:class:`ColumnComparator` compares every pair in that column under that kind,
+falling back to normalised-string equality for any individual pair the kind
+cannot parse. This avoids the trap of trying every strategy per pair, which
+would call ``"1"`` and ``"true"`` equal.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Any, Sequence
+import cleanmonkey
+import datemonkey
+import typemonkey
+from typemonkey import TypeName
+# typemonkey type names that _side_kind() reports as the "numeric" kind.
+NUMERIC_TYPES = frozenset(
+    {TypeName.INTEGER, TypeName.FLOAT, TypeName.CURRENCY, TypeName.PERCENTAGE}
+)
+class _Unparseable:
+    """Sentinel: a value could not be normalised to a column's kind."""
+    # No per-instance attributes are needed.
+    __slots__ = ()
+    def __repr__(self) -> str:  # pragma: no cover - debug aid
+        return "<unparseable>"
+# The shared sentinel instance; ColumnComparator tests for it with ``is``.
+UNPARSEABLE = _Unparseable()
+@dataclass
+class CompareSettings:
+    """Knobs that govern equivalence, shared across all columns of one compare."""
+    # Relative / absolute tolerance passed to math.isclose for numeric columns.
+    rel_tol: float = 1e-9
+    abs_tol: float = 0.0
+    # When true, values are passed through cleanmonkey.clean before being
+    # compared as strings; when false, their plain str() form is compared.
+    normalize_whitespace: bool = True
+    # When true, two null values are equal and null vs non-null is a change;
+    # when false, nulls are compared like ordinary values.
+    null_equivalent: bool = True
+    # When false, make_comparator always builds a "string" comparator.
+    type_aware: bool = True
+    # When false, _reconcile never selects the "date" kind.
+    date_aware: bool = True
+    # Locale hint forwarded to typemonkey and datemonkey; number parsing falls
+    # back to "us" when this is unset.
+    locale: str | None = None
+    # Optional null vocabulary forwarded to typemonkey.is_null / infer_type.
+    null_values: frozenset[str] | None = None
+def is_null(value: Any, settings: CompareSettings) -> bool:
+    """True when ``value`` should be treated as missing.
+    ``None`` is always null. Otherwise the decision is delegated to
+    ``typemonkey.is_null`` (whose default vocabulary includes ``""``,
+    whitespace-only, ``"na"``, ``"null"``, ``"none"`` and friends), optionally
+    restricted to a caller-supplied ``null_values`` set.
+    """
+    if value is None:
+        return True
+    return typemonkey.is_null(value, settings.null_values)
+def _clean(value: Any, settings: CompareSettings) -> str:
+    """Normalise a value to its comparable string form."""
+    text = value if isinstance(value, str) else str(value)
+    if settings.normalize_whitespace:
+        # 'minimal' would skip whitespace collapsing; 'default' strips and
+        # collapses runs and removes invisible characters — exactly the
+        # false-diff sources we want gone. strip=True is safe for cell values.
+        return cleanmonkey.clean(text, profile="default")
+    return text
+def _strings_equal(old: Any, new: Any, settings: CompareSettings) -> bool:
+    return _clean(old, settings) == _clean(new, settings)
+class ColumnComparator:
+    """Compares values of a single column under one inferred kind.
+    Construct via :func:`make_comparator`. ``equal(old, new)`` returns whether
+    the two raw values are equivalent. Null handling and exact-string fast path
+    are applied first; the kind-specific test runs only when both values are
+    non-null and not already string-equal.
+    """
+    def __init__(self, column: str, kind: str, settings: CompareSettings) -> None:
+        self.column = column
+        self.kind = kind  # "numeric" | "date" | "boolean" | "string"
+        self.settings = settings
+    # -- per-kind normalisation -------------------------------------------
+    def _as_number(self, value: Any) -> float | _Unparseable:
+        locale = self.settings.locale or "us"
+        try:
+            return float(typemonkey.parse_number(value, locale=locale).value)
+        except (ValueError, TypeError):
+            return UNPARSEABLE
+    def _as_bool(self, value: Any) -> bool | _Unparseable:
+        try:
+            return typemonkey.parse_boolean(value).value
+        except (ValueError, TypeError):
+            return UNPARSEABLE
+    def _as_date(self, value: Any):
+        # Parse each value independently with a locale hint rather than locking
+        # one format per column: a column may legitimately mix formats (the
+        # whole point of a diff is that old and new differ), and per-value
+        # parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
+        # from the majority and failing the minority. See LIMITATIONS.md.
+        try:
+            batch = datemonkey.parse_dates(
+                [value], locale_preference=self.settings.locale
+            )
+        except Exception:  # datemonkey raises various errors on odd input
+            return UNPARSEABLE
+        parsed = batch.dates[0] if batch.dates else None
+        return parsed if parsed is not None else UNPARSEABLE
+    # -- the public test ---------------------------------------------------
+    def equal(self, old: Any, new: Any) -> bool:
+        s = self.settings
+        old_null = is_null(old, s)
+        new_null = is_null(new, s)
+        if s.null_equivalent:
+            if old_null and new_null:
+                return True
+            if old_null != new_null:
+                return False
+        else:
+            # nulls are just ordinary strings; fall through to string/kind tests
+            if old_null and new_null:
+                return _strings_equal(old, new, s)
+        # Fast path: identical text (after optional whitespace normalisation)
+        # is equal under every kind, and is cheap.
+        if _strings_equal(old, new, s):
+            return True
+        if self.kind == "numeric":
+            a = self._as_number(old)
+            b = self._as_number(new)
+            if a is UNPARSEABLE or b is UNPARSEABLE:
+                return _strings_equal(old, new, s)
+            return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
+        if self.kind == "boolean":
+            a = self._as_bool(old)
+            b = self._as_bool(new)
+            if a is UNPARSEABLE or b is UNPARSEABLE:
+                return _strings_equal(old, new, s)
+            return a == b
+        if self.kind == "date":
+            a = self._as_date(old)
+            b = self._as_date(new)
+            if a is UNPARSEABLE or b is UNPARSEABLE:
+                return _strings_equal(old, new, s)
+            return a == b
+        # string / fallback
+        return _strings_equal(old, new, s)
+def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
+    """Infer one side's kind via ``typemonkey.infer_type``.
+    Returns ``"empty"`` when the side has no values to judge (so the other
+    side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
+    phone numbers) report ``"string"`` so they are never numeric-compared.
+    """
+    values = list(values)
+    if not values:
+        return "empty"
+    profile = typemonkey.infer_type(
+        values, null_values=settings.null_values, locale=settings.locale
+    )
+    if profile.preserve_as_string:
+        return "string"
+    t = profile.type
+    if t in NUMERIC_TYPES:
+        return "numeric"
+    if t == TypeName.BOOLEAN:
+        return "boolean"
+    if t == TypeName.DATE:
+        return "date"
+    if t == TypeName.NULL:
+        return "empty"
+    return "string"
+def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
+    """Combine the two sides' inferred kinds into one comparison kind.
+    Each *file* is usually internally consistent in format, but the two files
+    may differ (US dates vs ISO, ``"1,234"`` vs ``"1234"``) — that difference
+    is exactly what we must not treat as a change. So we infer per side and
+    pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
+    fall back to string comparison for any individual pair that cannot be
+    parsed under that kind. Precedence: **date** (if ``date_aware``) > numeric.
+    Boolean requires *both* sides to look boolean, so an integer column versus
+    a ``true``/``false`` column is not silently equated (``1`` == ``true``).
+    """
+    kinds = {old_kind, new_kind} - {"empty"}
+    if not kinds:
+        return "string"
+    if "date" in kinds and settings.date_aware:
+        return "date"
+    if "numeric" in kinds:
+        return "numeric"
+    if kinds == {"boolean"}:
+        return "boolean"
+    return "string"
+def _infer_kind(
+    old_values: Sequence[Any], new_values: Sequence[Any], settings: CompareSettings
+) -> str:
+    """Infer the comparison kind for a column from both sides independently."""
+    return _reconcile(
+        _side_kind(old_values, settings),
+        _side_kind(new_values, settings),
+        settings,
+    )
+def make_comparator(
+    column: str,
+    old_values: Sequence[Any],
+    new_values: Sequence[Any],
+    settings: CompareSettings,
+) -> ColumnComparator:
+    """Build a :class:`ColumnComparator` for ``column``.
+    When ``settings.type_aware`` is false the comparator is a pure
+    string/null comparator. Otherwise the column's kind is inferred from both
+    sides combined; date columns then compare by parsed calendar date (each
+    value parsed independently with the ``locale`` hint) so that an ``old``
+    column in ``DD/MM/YYYY`` and a ``new`` column in ISO are not false diffs.
+    """
+    if not settings.type_aware:
+        return ColumnComparator(column, "string", settings)
+    kind = _infer_kind(old_values, new_values, settings)
+    return ColumnComparator(column, kind, settings)