PyPI - diffmonkey - Versions diffs - 1.0.0__tar.gz → 1.1.1__tar.gz - Mend

diffmonkey 1.0.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{diffmonkey-1.0.0/src/diffmonkey.egg-info → diffmonkey-1.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffmonkey
-Version: 1.0.0
+Version: 1.1.1
 Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
 Author-email: RexBytes <pythonic@rexbytes.com>
 License: MIT License
@@ -40,9 +40,9 @@ Classifier: Topic :: Utilities
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: cleanmonkey
-Requires-Dist: typemonkey
-Requires-Dist: datemonkey
+Requires-Dist: cleanmonkey>=0.2.0
+Requires-Dist: typemonkey>=1.2.0
+Requires-Dist: datemonkey>=0.2.0
 Provides-Extra: excel
 Requires-Dist: openpyxl>=3.0; extra == "excel"
 Provides-Extra: dsv
@@ -51,6 +51,8 @@ Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: hypothesis>=6.0; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
 Dynamic: license-file
 # diffmonkey
@@ -144,6 +146,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
 deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
 handling) so behaviour that looks surprising is not mistaken for a bug.
+## Contributing & quality
+diffmonkey is tested and reviewed against an explicit quality contract. See
+[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
+competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
+for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
+for the release rubric (`python scripts/readiness.py`).
 ## License
 MIT — see [`LICENSE`](./LICENSE).

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/README.md RENAMED Viewed

@@ -89,6 +89,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
 deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
 handling) so behaviour that looks surprising is not mistaken for a bug.
+## Contributing & quality
+diffmonkey is tested and reviewed against an explicit quality contract. See
+[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
+competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
+for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
+for the release rubric (`python scripts/readiness.py`).
 ## License
 MIT — see [`LICENSE`](./LICENSE).

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "diffmonkey"
-version = "1.0.0"
+version = "1.1.1"
 description = "Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports."
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -23,15 +23,15 @@ classifiers = [
     "Topic :: Utilities",
 ]
 dependencies = [
-    "cleanmonkey",
-    "typemonkey",
-    "datemonkey",
+    "cleanmonkey>=0.2.0",
+    "typemonkey>=1.2.0",
+    "datemonkey>=0.2.0",
 ]
 [project.optional-dependencies]
 excel = ["openpyxl>=3.0"]
 dsv = ["dsvmonkey"]
-dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
+dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0", "ruff", "mypy"]
 [project.scripts]
 diffmonkey = "diffmonkey.cli:main"
@@ -46,3 +46,17 @@ where = ["src"]
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["src"]
+[tool.ruff]
+src = ["src", "tests"]
+[tool.mypy]
+files = ["src"]
+python_version = "3.11"
+warn_unused_ignores = true
+# The rexbytes monkey libraries and openpyxl ship without type stubs; diffmonkey
+# pins their runtime behaviour with tests, not types.
+[[tool.mypy.overrides]]
+module = ["cleanmonkey", "datemonkey", "typemonkey", "openpyxl", "openpyxl.*"]
+ignore_missing_imports = true

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/__init__.py RENAMED Viewed

@@ -31,7 +31,7 @@ from .models import (
 )
 from .readers import read_csv, read_excel, read_table
-__version__ = "1.0.0"
+__version__ = "1.1.1"
 __all__ = [
     "compare",

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/cli.py RENAMED Viewed

@@ -23,10 +23,19 @@ EXIT_DIFF = 1
 EXIT_ERROR = 2
+def _split_csv(value: str) -> list[str]:
+    return [part.strip() for part in value.split(",") if part.strip()]
 def _split_csv_opt(value: str | None) -> list[str] | None:
+    # An empty / all-blank option (e.g. --columns "" or a shell variable that
+    # expanded to nothing) means "unset", not "an explicit empty list". The
+    # latter would make _resolve_columns compare ZERO columns and exit 0,
+    # silently masking real differences from a CI gate.
     if value is None:
         return None
-    return [part.strip() for part in value.split(",") if part.strip()]
+    parts = _split_csv(value)
+    return parts or None
 def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
@@ -83,7 +92,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     cmp.add_argument(
         "--include-unchanged", action="store_true",
-        help="Include unchanged rows in json/markdown output.",
+        help="Retain unchanged rows (listed in json output; counted elsewhere).",
     )
     cmp.add_argument(
         "--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
@@ -97,7 +106,7 @@ def build_parser() -> argparse.ArgumentParser:
     return parser
-def _render(result, fmt: str, *, include_unchanged: bool) -> str:
+def _render(result, fmt: str) -> str:
     if fmt == "summary":
         text = result.summary.one_line()
         if result.warnings:
@@ -131,7 +140,7 @@ def main(argv: Sequence[str] | None = None) -> int:
             result = compare(
                 old_rows,
                 new_rows,
-                key=_split_csv_opt(args.key),
+                key=_split_csv(args.key),
                 columns=_split_csv_opt(args.columns),
                 ignore=_split_csv_opt(args.ignore),
                 column_map=_parse_column_map(args.map),
@@ -148,7 +157,7 @@ def main(argv: Sequence[str] | None = None) -> int:
             print(f"diffmonkey: {exc}", file=sys.stderr)
             return EXIT_ERROR
-        text = _render(result, args.format, include_unchanged=args.include_unchanged)
+        text = _render(result, args.format)
         if args.output:
             try:
                 with open(args.output, "w", encoding="utf-8", newline="") as fh:

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/comparators.py RENAMED Viewed

@@ -76,7 +76,8 @@ def _clean(value: Any, settings: CompareSettings) -> str:
     text = value if isinstance(value, str) else str(value)
     if settings.normalize_whitespace:
         # 'minimal' would skip whitespace collapsing; 'default' strips and
-        # collapses runs and removes invisible characters — exactly the
+        # collapses runs, removes invisible characters, and folds typographic
+        # variants (smart quotes→straight, dashes→'-', '…'→'...') — exactly the
         # false-diff sources we want gone. strip=True is safe for cell values.
         return cleanmonkey.clean(text, profile="default")
     return text
@@ -95,15 +96,31 @@ class ColumnComparator:
     non-null and not already string-equal.
     """
-    def __init__(self, column: str, kind: str, settings: CompareSettings) -> None:
+    def __init__(
+        self,
+        column: str,
+        kind: str,
+        settings: CompareSettings,
+        *,
+        old_locale: str = "us",
+        new_locale: str = "us",
+    ) -> None:
         self.column = column
         self.kind = kind  # "numeric" | "date" | "boolean" | "string"
         self.settings = settings
+        # Number locale resolved per side (see _side_locale): when the caller
+        # left ``locale=None`` we honour typemonkey's auto-detection instead of
+        # forcing US, and each side keeps its own — so an ``old`` file in EU
+        # format and a ``new`` file in US format are each parsed correctly.
+        # (The date path is NOT per-side: it uses the single ``settings.locale``
+        # hint, so disambiguating DD/MM vs MM/DD across files still needs an
+        # explicit ``locale=`` — see LIMITATIONS.md and ``_as_date``.)
+        self.old_locale = old_locale
+        self.new_locale = new_locale
     # -- per-kind normalisation -------------------------------------------
-    def _as_number(self, value: Any) -> float | _Unparseable:
-        locale = self.settings.locale or "us"
+    def _as_number(self, value: Any, locale: str) -> float | _Unparseable:
         try:
             return float(typemonkey.parse_number(value, locale=locale).value)
         except (ValueError, TypeError):
@@ -121,6 +138,22 @@ class ColumnComparator:
         # whole point of a diff is that old and new differ), and per-value
         # parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
         # from the majority and failing the minority. See LIMITATIONS.md.
+        #
+        # A bare number like ``"2024"`` is NOT a date a user writes; datemonkey
+        # would read it as an Excel serial day-number (``1`` -> 1900-01-01), so a
+        # numeric column compared against a date column would silently match the
+        # wrong day. Refuse SHORT bare numbers so the pair falls back to string
+        # comparison instead of accepting a serial-date interpretation. This
+        # covers integer (``"45000"``) AND float-form (``"45000.0"``, or the
+        # native float openpyxl returns for a number-formatted cell) serials.
+        # The length gate is on the INTEGER part: Excel serials top out at 7
+        # digits (``2958465`` is year 9999), so ``< 8`` integer digits blocks
+        # every serial while letting the 8-digit compact ISO ``YYYYMMDD`` parse.
+        text = str(value).strip()
+        core = text.lstrip("+-")
+        int_part, _, frac = core.partition(".")
+        if int_part.isdigit() and len(int_part) < 8 and (frac == "" or frac.isdigit()):
+            return UNPARSEABLE
         try:
             batch = datemonkey.parse_dates(
                 [value], locale_preference=self.settings.locale
@@ -152,9 +185,9 @@ class ColumnComparator:
             return True
         if self.kind == "numeric":
-            a = self._as_number(old)
-            b = self._as_number(new)
-            if a is UNPARSEABLE or b is UNPARSEABLE:
+            a = self._as_number(old, self.old_locale)
+            b = self._as_number(new, self.new_locale)
+            if isinstance(a, _Unparseable) or isinstance(b, _Unparseable):
                 return _strings_equal(old, new, s)
             return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
@@ -181,7 +214,9 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
     Returns ``"empty"`` when the side has no values to judge (so the other
     side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
-    phone numbers) report ``"string"`` so they are never numeric-compared.
+    phone numbers) report ``"preserve"`` — a *dominant* string kind that
+    :func:`_reconcile` will not let the other side out-vote, so they are never
+    numeric-compared even when the opposite side parses as a number.
     """
     values = list(values)
     if not values:
@@ -190,7 +225,7 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
         values, null_values=settings.null_values, locale=settings.locale
     )
     if profile.preserve_as_string:
-        return "string"
+        return "preserve"
     t = profile.type
     if t in NUMERIC_TYPES:
         return "numeric"
@@ -211,13 +246,17 @@ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
     is exactly what we must not treat as a change. So we infer per side and
     pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
     fall back to string comparison for any individual pair that cannot be
-    parsed under that kind. Precedence: **date** (if ``date_aware``) > numeric.
-    Boolean requires *both* sides to look boolean, so an integer column versus
-    a ``true``/``false`` column is not silently equated (``1`` == ``true``).
+    parsed under that kind. Precedence: **preserve** (leading-zero IDs etc.,
+    dominant — never numeric-compared, even against a numeric side) > **date**
+    (if ``date_aware``) > numeric. Boolean requires *both* sides to look
+    boolean, so an integer column versus a ``true``/``false`` column is not
+    silently equated (``1`` == ``true``).
     """
     kinds = {old_kind, new_kind} - {"empty"}
     if not kinds:
         return "string"
+    if "preserve" in kinds:
+        return "string"
     if "date" in kinds and settings.date_aware:
         return "date"
     if "numeric" in kinds:
@@ -238,6 +277,27 @@ def _infer_kind(
     )
+def _side_locale(values: Sequence[Any], settings: CompareSettings) -> str:
+    """Resolve the number-parsing locale for one side.
+    Honours an explicit ``settings.locale`` when given; otherwise auto-detects
+    it from the values via ``typemonkey.infer_type`` (whose ``profile.locale``
+    distinguishes ``"1.234,56"`` EU from ``"1,234.56"`` US), falling back to
+    ``"us"`` when there is nothing to detect. This is what makes the documented
+    ``locale=None`` "auto-detect" promise true for numbers — the comparator must
+    not silently force ``"us"`` and mis-compare EU-formatted data.
+    """
+    if settings.locale:
+        return settings.locale
+    values = list(values)
+    if not values:
+        return "us"
+    profile = typemonkey.infer_type(
+        values, null_values=settings.null_values, locale=None
+    )
+    return getattr(profile, "locale", None) or "us"
 def make_comparator(
     column: str,
     old_values: Sequence[Any],
@@ -248,12 +308,21 @@ def make_comparator(
     When ``settings.type_aware`` is false the comparator is a pure
     string/null comparator. Otherwise the column's kind is inferred from both
-    sides combined; date columns then compare by parsed calendar date (each
-    value parsed independently with the ``locale`` hint) so that an ``old``
-    column in ``DD/MM/YYYY`` and a ``new`` column in ISO are not false diffs.
+    sides combined; date columns compare by parsed calendar date, each value
+    parsed independently with the ``locale`` hint. An ``old`` column in an
+    unambiguous format and a ``new`` column in another (e.g. ISO) are not false
+    diffs; an *ambiguous* mix such as ``DD/MM/YYYY`` vs ISO needs an explicit
+    ``locale=`` to disambiguate (see LIMITATIONS.md) — unlike numbers, the date
+    path does not auto-detect a per-side locale.
     """
     if not settings.type_aware:
         return ColumnComparator(column, "string", settings)
     kind = _infer_kind(old_values, new_values, settings)
-    return ColumnComparator(column, kind, settings)
+    return ColumnComparator(
+        column,
+        kind,
+        settings,
+        old_locale=_side_locale(old_values, settings),
+        new_locale=_side_locale(new_values, settings),
+    )

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/compare.py RENAMED Viewed

@@ -12,7 +12,7 @@ from __future__ import annotations
 from typing import Any, Iterable, Mapping, Sequence
 from .comparators import CompareSettings, make_comparator
-from .matching import index_rows, make_key, validate_policies
+from .matching import index_rows, validate_policies
 from .models import DiffResult, DiffSummary, FieldChange, RowDiff
 Row = Mapping[str, Any]
@@ -38,17 +38,27 @@ def _apply_column_map(
     sides share the new namespace. If both ``old_name`` and ``new_name`` are
     present in a row, the explicitly-new value wins (the rename target is not
     clobbered).
+    Renames are resolved against each row's *original* column names in a single
+    pass, so they never chain: ``{"a": "b", "b": "c"}`` renames the original
+    ``a``→``b`` and the original ``b``→``c`` independently — it does not rename a
+    freshly-produced ``b`` on to ``c``.
     """
     if not column_map:
         return [dict(r) for r in rows]
     out: list[dict[str, Any]] = []
     for r in rows:
         rd = dict(r)
-        for old_name, new_name in column_map.items():
-            if old_name in rd:
-                value = rd.pop(old_name)
-                rd.setdefault(new_name, value)
-        out.append(rd)
+        # Columns whose names are taken literally (not rename sources); an
+        # explicit value already under a rename target lives here and must win.
+        literal_targets = {c for c in rd if c not in column_map}
+        renamed: dict[str, Any] = {}
+        for col, value in rd.items():
+            target = column_map.get(col, col)
+            if col in column_map and target in literal_targets:
+                continue  # explicit-new value present elsewhere wins
+            renamed.setdefault(target, value)
+        out.append(renamed)
     return out
@@ -108,18 +118,23 @@ def compare(
         key: Identity column name, or a sequence of names for a composite key.
             Names refer to the *new* namespace (i.e. after ``column_map``).
         columns: Restrict comparison to these columns. Default: every non-key
-            column seen on either side.
+            column seen on either side. Like ``key``, names are in the *new*
+            namespace (i.e. after ``column_map``) — pass the renamed name, not
+            the original, or the column will match nothing and its change be
+            missed.
         ignore: Columns to exclude from comparison (timestamps, audit fields,
-            row numbers). Applied after ``columns``.
+            row numbers), also in the *new* namespace. Applied after ``columns``.
         column_map: ``{old_name: new_name}`` renames applied to ``old`` rows so
             renamed columns are not reported as removed+added.
         rel_tol: Relative floating-point tolerance for numeric columns
             (``math.isclose``). Default ``1e-9``.
         abs_tol: Absolute floating-point tolerance for numeric columns. Default
             ``0.0`` — set this to compare values near zero.
-        normalize_whitespace: Collapse/strip whitespace and drop invisible
-            characters before comparing strings (via ``cleanmonkey``). Default
-            True.
+        normalize_whitespace: Collapse/strip whitespace, drop invisible
+            characters, and fold typographic variants (smart quotes→straight,
+            dashes/minus→``-``, ``…``→``...``) before comparing strings (via
+            ``cleanmonkey``'s ``default`` profile), so cosmetic punctuation
+            differences are not reported as changes. Default True.
         null_equivalent: Treat all null spellings (``None``, ``""``,
             whitespace-only, ``"na"``, …) as one value, so two different nulls
             are equal and null↔value is a change. Default True.

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/csv_out.py RENAMED Viewed

@@ -10,8 +10,8 @@ analyst can pivot/filter the diff. The schema is fixed:
 * ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
   composite keys).
 * For ``added``/``removed`` rows there is no per-field breakdown, so one row is
-  emitted with empty ``column``/``old``/``new`` (added) or ``column``/``new``
-  empty (removed) — the presence of the row is the change.
+  emitted with ``column``/``old``/``new`` all empty — the row's presence under
+  ``change_type`` (and its key) is the change.
 """
 from __future__ import annotations
@@ -26,8 +26,13 @@ HEADER = ["change_type", "key", "column", "old", "new"]
 def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
+    # Render a None key component as the (null) sentinel, like the markdown and
+    # HTML reports. A None key arises from on_missing_key="warn" (the default);
+    # leaving it "" would make it indistinguishable from a genuine empty-string
+    # key in the CSV.
     return "; ".join(
-        f"{col}={'' if val is None else val}" for col, val in zip(key_columns, rd.key)
+        f"{col}={'(null)' if val is None else val}"
+        for col, val in zip(key_columns, rd.key)
     )

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/html.py RENAMED Viewed

@@ -35,7 +35,7 @@ def _cell(value: Any) -> str:
 def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
     return escape(
         ", ".join(
-            f"{col}={'' if val is None else val}"
+            f"{col}={'(null)' if val is None else val}"
             for col, val in zip(key_columns, rd.key)
         )
     )

{diffmonkey-1.0.0 → diffmonkey-1.1.1}/src/diffmonkey/formatters/markdown.py RENAMED Viewed

@@ -8,6 +8,7 @@ downstream scripts can rely on its shape.
 from __future__ import annotations
+import re
 from typing import Any
 from ..models import DiffResult, RowDiff
@@ -20,14 +21,35 @@ def _fmt(value: Any) -> str:
     return str(value)
+def _code(value: Any) -> str:
+    """Wrap a value in a backtick code span that survives embedded backticks.
+    A naive ```{value}``` breaks when the value itself contains a
+    backtick (the span closes early and the rest renders as garbled text). Per
+    CommonMark, a code span may be fenced by any number of backticks, so we pick
+    a fence one longer than the longest run inside the value and pad with spaces
+    when needed — lossless, no character substitution.
+    """
+    text = _fmt(value)
+    if "`" not in text:
+        return f"`{text}`"
+    longest = max(len(run) for run in re.findall(r"`+", text))
+    fence = "`" * (longest + 1)
+    return f"{fence} {text} {fence}"
 def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
     parts = [f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)]
     return ", ".join(parts)
 def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
-    """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``."""
-    if max_rows is None or len(rows) <= max_rows:
+    """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
+    A negative ``max_rows`` means "no limit" (same as ``None``); it never
+    produces a phantom "N more" notice from slicing with a negative bound.
+    """
+    if max_rows is None or max_rows < 0 or len(rows) <= max_rows:
         return rows, 0
     return rows[:max_rows], len(rows) - max_rows
@@ -58,7 +80,7 @@ def render(result: DiffResult, *, max_rows: int | None = None) -> str:
     for rd in shown:
         lines.append(f"### {_key_label(rd, result.key_columns)}")
         for ch in rd.changes:
-            lines.append(f"- **{ch.column}**: `{_fmt(ch.old)}` → `{_fmt(ch.new)}`")
+            lines.append(f"- **{ch.column}**: {_code(ch.old)} → {_code(ch.new)}")
         lines.append("")
     if hidden:
         lines.append(f"_… and {hidden} more changed row(s)._")

diffmonkey 1.0.0__tar.gz → 1.1.1__tar.gz

diffmonkey 1.0.0__tar.gz → 1.1.1__tar.gz