PyPI - diffmonkey - Versions diffs - 1.0.0__py3-none-any.whl - Mend

diffmonkey 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

diffmonkey/__init__.py +49 -0
diffmonkey/cli.py +168 -0
diffmonkey/comparators.py +259 -0
diffmonkey/compare.py +253 -0
diffmonkey/formatters/__init__.py +13 -0
diffmonkey/formatters/csv_out.py +55 -0
diffmonkey/formatters/html.py +93 -0
diffmonkey/formatters/markdown.py +94 -0
diffmonkey/matching.py +141 -0
diffmonkey/models.py +185 -0
diffmonkey/readers.py +117 -0
diffmonkey-1.0.0.dist-info/METADATA +153 -0
diffmonkey-1.0.0.dist-info/RECORD +17 -0
diffmonkey-1.0.0.dist-info/WHEEL +5 -0
diffmonkey-1.0.0.dist-info/entry_points.txt +2 -0
diffmonkey-1.0.0.dist-info/licenses/LICENSE +21 -0
diffmonkey-1.0.0.dist-info/top_level.txt +1 -0

diffmonkey/models.py ADDED Viewed

@@ -0,0 +1,185 @@
+"""Result dataclasses for a diffmonkey comparison.
+This module exists to give every comparison a typed, introspectable shape
+instead of a bag of nested dicts. ``compare()`` returns a :class:`DiffResult`;
+downstream code reads ``.added`` / ``.removed`` / ``.changed`` / ``.unchanged``
+and ``.summary`` with autocomplete and type-checking, and renders reports via
+the ``to_*`` helpers. The row-level types (:class:`FieldChange`,
+:class:`RowDiff`) are frozen so a result is a stable record of one comparison.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+class DiffMonkeyError(Exception):
+    """Base class for all diffmonkey errors.
+    The key-policy errors defined in this module derive from it, so one
+    ``except DiffMonkeyError`` clause handles both of them.
+    """
+class DuplicateKeyError(DiffMonkeyError):
+    """Raised when a key value occurs more than once and ``on_duplicate='error'``.
+    NOTE(review): other ``on_duplicate`` policies presumably resolve the
+    duplicate instead of raising -- confirm against the matching code.
+    """
+class MissingKeyError(DiffMonkeyError):
+    """Raised when a row lacks a key column and ``on_missing_key='error'``.
+    NOTE(review): other ``on_missing_key`` policies presumably do not raise --
+    confirm against the matching code.
+    """
+@dataclass(frozen=True)
+class FieldChange:
+    """A single column whose value differs between matched rows.
+    ``old`` and ``new`` are the *original* (un-normalised) values, so a report
+    shows what the data actually said, not its comparison-normalised form.
+    The dataclass is frozen, so fields cannot be reassigned after construction.
+    """
+    # Name of the column whose value differs.
+    column: str
+    # Value taken from the old row, exactly as it appeared in the input.
+    old: Any
+    # Value taken from the new row, exactly as it appeared in the input.
+    new: Any
+@dataclass(frozen=True)
+class RowDiff:
+    """One row's place in the diff.
+    ``key`` is always a tuple (length 1 for a single key column, longer for a
+    composite key) so callers can treat single and composite keys uniformly.
+    ``old`` / ``new`` are the source row dicts: ``new`` is ``None`` for removed
+    rows, ``old`` is ``None`` for added rows, and both are present for changed
+    and unchanged rows. ``changes`` is non-empty only for changed rows.
+    """
+    key: tuple[Any, ...]
+    old: dict[str, Any] | None = None
+    new: dict[str, Any] | None = None
+    changes: tuple[FieldChange, ...] = ()
+    def key_dict(self, key_columns: tuple[str, ...]) -> dict[str, Any]:
+        """Return the key as a ``{column: value}`` mapping for display/export."""
+        return dict(zip(key_columns, self.key))
+@dataclass(frozen=True)
+class DiffSummary:
+    """Aggregate counts for a comparison.
+    ``matched`` is the number of keys present in *both* inputs (``changed +
+    unchanged``). ``total_old`` / ``total_new`` count input rows *after*
+    duplicate resolution, so ``added + removed + matched`` need not equal their
+    sum when duplicates were collapsed; ``duplicate_keys`` records how many
+    distinct keys were duplicated on either side.
+    """
+    total_old: int
+    total_new: int
+    matched: int
+    added: int
+    removed: int
+    changed: int
+    unchanged: int
+    key_columns: tuple[str, ...]
+    compared_columns: tuple[str, ...]
+    duplicate_keys: int = 0
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            "total_old": self.total_old,
+            "total_new": self.total_new,
+            "matched": self.matched,
+            "added": self.added,
+            "removed": self.removed,
+            "changed": self.changed,
+            "unchanged": self.unchanged,
+            "duplicate_keys": self.duplicate_keys,
+            "key_columns": list(self.key_columns),
+            "compared_columns": list(self.compared_columns),
+        }
+    def one_line(self) -> str:
+        """A single human sentence, e.g. the headline of a report."""
+        return (
+            f"{self.added} added, {self.removed} removed, "
+            f"{self.changed} changed (of {self.total_new} current), "
+            f"{self.unchanged} unchanged"
+        )
+@dataclass
+class DiffResult:
+    """The full outcome of comparing two tabular datasets.
+    Rows are bucketed into :attr:`added` (key only in *new*), :attr:`removed`
+    (key only in *old*), :attr:`changed` (matched, at least one compared field
+    differs) and :attr:`unchanged` (matched, no compared field differs).
+    :attr:`unchanged` is only populated when ``compare(..., include_unchanged=
+    True)``. :attr:`warnings` collects non-fatal issues (duplicate or missing
+    keys) surfaced during matching.
+    """
+    added: list[RowDiff]
+    removed: list[RowDiff]
+    changed: list[RowDiff]
+    unchanged: list[RowDiff]
+    summary: DiffSummary
+    key_columns: tuple[str, ...]
+    compared_columns: tuple[str, ...]
+    warnings: list[str] = field(default_factory=list)
+    # -- serialisation -----------------------------------------------------
+    def to_dict(self) -> dict[str, Any]:
+        """A JSON-serialisable dict (given JSON-serialisable cell values)."""
+        def row(rd: RowDiff, body: str) -> dict[str, Any]:
+            out: dict[str, Any] = {"key": rd.key_dict(self.key_columns)}
+            if body == "added":
+                out["row"] = rd.new
+            elif body == "removed":
+                out["row"] = rd.old
+            elif body == "unchanged":
+                out["row"] = rd.new
+            else:  # changed
+                out["changes"] = {
+                    c.column: {"old": c.old, "new": c.new} for c in rd.changes
+                }
+            return out
+        return {
+            "summary": self.summary.as_dict(),
+            "added": [row(r, "added") for r in self.added],
+            "removed": [row(r, "removed") for r in self.removed],
+            "changed": [row(r, "changed") for r in self.changed],
+            "unchanged": [row(r, "unchanged") for r in self.unchanged],
+            "warnings": list(self.warnings),
+        }
+    def to_markdown(self, *, max_rows: int | None = None) -> str:
+        """Human-readable markdown report. See :mod:`diffmonkey.formatters.markdown`."""
+        from .formatters import markdown
+        return markdown.render(self, max_rows=max_rows)
+    def to_html(self, *, title: str = "diffmonkey report") -> str:
+        """Standalone HTML diff report. See :mod:`diffmonkey.formatters.html`."""
+        from .formatters import html
+        return html.render(self, title=title)
+    def to_csv(self) -> str:
+        """CSV of every changed field (one row per field change). See
+        :mod:`diffmonkey.formatters.csv_out`."""
+        from .formatters import csv_out
+        return csv_out.render(self)
+    def write_csv(self, path: str) -> None:
+        """Write :meth:`to_csv` output to ``path`` (UTF-8, ``\\n`` line endings)."""
+        with open(path, "w", encoding="utf-8", newline="") as fh:
+            fh.write(self.to_csv())
+    # -- convenience -------------------------------------------------------
+    def has_changes(self) -> bool:
+        """True when anything was added, removed, or changed."""
+        return bool(self.added or self.removed or self.changed)

diffmonkey/readers.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Read tabular inputs into lists of dicts for :func:`diffmonkey.compare`.
+This module exists so the CLI (and callers who start from files) can hand
+``compare()`` the list-of-dicts it expects, regardless of source format. DSV
+files (CSV/TSV/pipe/…) are read with the stdlib ``csv`` module — robust and
+dependency-free. Excel and richer DSV repair are *optional*: if ``openpyxl`` or
+``dsvmonkey`` is installed we use it, otherwise reading those formats raises a
+clear, actionable error rather than failing obscurely.
+"""
+from __future__ import annotations
+import csv
+import io
+import os
+from typing import Any
+# Maps a lower-cased file extension to the delimiter read_csv() assumes when the
+# caller passes none; extensions missing from this table fall back to ",".
+DELIMITER_BY_EXT = {
+    ".csv": ",",
+    ".tsv": "\t",
+    ".tab": "\t",
+    ".psv": "|",
+    ".pipe": "|",
+}
+def read_csv(
+    path: str,
+    *,
+    delimiter: str | None = None,
+    encoding: str = "utf-8-sig",
+) -> list[dict[str, Any]]:
+    """Read a delimited file into a list of dicts using the stdlib ``csv``.
+    ``encoding`` defaults to ``utf-8-sig`` so a leading BOM is stripped from the
+    first header. ``delimiter`` defaults to the one implied by the file
+    extension (``,`` for unknown extensions). All cell values are strings; type
+    awareness happens later in :func:`compare`.
+    """
+    if delimiter is None:
+        ext = os.path.splitext(path)[1].lower()
+        delimiter = DELIMITER_BY_EXT.get(ext, ",")
+    with open(path, "r", encoding=encoding, newline="") as fh:
+        reader = csv.DictReader(fh, delimiter=delimiter)
+        rows = [dict(r) for r in reader]
+    return rows
+def read_excel(path: str, *, sheet: str | int | None = None) -> list[dict[str, Any]]:
+    """Read the first (or named) worksheet into a list of dicts.
+    Requires the ``excel`` extra (``pip install diffmonkey[excel]``). The first
+    row is treated as the header. Raises :class:`RuntimeError` with install
+    guidance if ``openpyxl`` is unavailable.
+    """
+    try:
+        from openpyxl import load_workbook
+    except ImportError as exc:  # pragma: no cover - exercised via monkeypatch
+        raise RuntimeError(
+            "Reading Excel files requires openpyxl. "
+            "Install it with: pip install diffmonkey[excel]"
+        ) from exc
+    wb = load_workbook(path, read_only=True, data_only=True)
+    ws = wb[sheet] if isinstance(sheet, str) else (
+        wb.worksheets[sheet] if isinstance(sheet, int) else wb.active
+    )
+    rows_iter = ws.iter_rows(values_only=True)
+    try:
+        header = next(rows_iter)
+    except StopIteration:
+        return []
+    headers = [("" if h is None else str(h)) for h in header]
+    out: list[dict[str, Any]] = []
+    for raw in rows_iter:
+        out.append({headers[i]: raw[i] if i < len(raw) else None for i in range(len(headers))})
+    wb.close()
+    return out
+# Lower-cased extensions that read_table() hands to read_excel(); any other
+# extension is read as delimited text.
+EXCEL_EXTENSIONS = (".xlsx", ".xlsm", ".xltx", ".xltm")
+def read_table(
+    path: str,
+    *,
+    delimiter: str | None = None,
+    encoding: str = "utf-8-sig",
+    sheet: str | int | None = None,
+) -> list[dict[str, Any]]:
+    """Read ``path`` by dispatching on its extension.
+    ``.xlsx``/``.xlsm``/``.xltx``/``.xltm`` go to :func:`read_excel`; everything
+    else is treated as delimited text via :func:`read_csv`. Arguments are routed
+    to the format that understands them: ``delimiter``/``encoding`` apply to
+    delimited text only, ``sheet`` to Excel only. Passing ``delimiter`` for an
+    Excel input raises :class:`ValueError` rather than failing obscurely deep in
+    the Excel reader.
+    """
+    ext = os.path.splitext(path)[1].lower()
+    if ext in EXCEL_EXTENSIONS:
+        if delimiter is not None:
+            raise ValueError(
+                f"delimiter is not applicable to Excel input {path!r}"
+            )
+        return read_excel(path, sheet=sheet)
+    if sheet is not None:
+        raise ValueError(f"sheet is not applicable to delimited input {path!r}")
+    return read_csv(path, delimiter=delimiter, encoding=encoding)
+def read_csv_string(
+    text: str, *, delimiter: str = ","
+) -> list[dict[str, Any]]:
+    """Parse a delimited *string* into a list of dicts (used in tests/pipes)."""
+    reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
+    return [dict(r) for r in reader]

diffmonkey-1.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,153 @@
+Metadata-Version: 2.4
+Name: diffmonkey
+Version: 1.0.0
+Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
+Author-email: RexBytes <pythonic@rexbytes.com>
+License: MIT License
+        Copyright (c) 2026 RexBytes
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/RexBytes/diffmonkey
+Project-URL: Issues, https://github.com/RexBytes/diffmonkey/issues
+Keywords: diff,csv,tabular,compare,dataset,changes,reconciliation
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Utilities
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: cleanmonkey
+Requires-Dist: typemonkey
+Requires-Dist: datemonkey
+Provides-Extra: excel
+Requires-Dist: openpyxl>=3.0; extra == "excel"
+Provides-Extra: dsv
+Requires-Dist: dsvmonkey; extra == "dsv"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: hypothesis>=6.0; extra == "dev"
+Dynamic: license-file
+# diffmonkey
+Type-aware, key-based structural diffing of tabular datasets — answer "what
+changed between last month's export and this month's?" in one call, with
+human- and machine-readable reports.
+diffmonkey matches rows by a key column (or composite key), compares the
+remaining columns *with type awareness* (numbers by value, dates by calendar
+date, booleans by truth, strings whitespace-normalised, nulls unified), and
+buckets the result into **added / removed / changed / unchanged** with summary
+statistics. It is built on the rexbytes ecosystem — [`typemonkey`] for type
+inference and number parsing, [`datemonkey`] for date parsing, [`cleanmonkey`]
+for whitespace and invisible-character normalisation — so it does not reinvent
+those wheels.
+In scope: structural comparison, change detection, change reporting. Out of
+scope: merge/reconciliation, text diffing, schema migration, version control.
+## Install
+```bash
+pip install diffmonkey            # CSV/TSV/pipe input built in
+pip install "diffmonkey[excel]"   # add .xlsx reading (openpyxl)
+```
+Requires Python 3.11+.
+## Quick start
+```python
+from diffmonkey import compare
+old = [{"id": "1", "name": "Widget", "price": "1,234"},
+       {"id": "2", "name": "Gadget", "price": "50"}]
+new = [{"id": "1", "name": "Widget", "price": "1234"},   # price reformatted, not changed
+       {"id": "3", "name": "Gizmo",  "price": "9"}]       # id 2 removed, id 3 added
+result = compare(old, new, key="id")
+print(result.summary.one_line())
+# 1 added, 1 removed, 0 changed (of 2 current), 1 unchanged
+print(result.to_markdown())   # human report
+result.to_dict()              # machine-readable
+result.write_csv("changes.csv")
+```
+`"1,234"` vs `"1234"` is **not** reported as a change — type-aware numeric
+comparison sees one number. The same applies to `"01/02/2025"` vs `"2025-01-02"`
+(dates, with a `locale` hint), `" foo "` vs `"foo"` (whitespace), and
+`None`/`""`/`"NA"` (nulls).
+## CLI
+```bash
+diffmonkey compare old.csv new.csv --key id
+diffmonkey compare old.csv new.csv --key region,sku --ignore updated_at --format markdown
+diffmonkey compare old.xlsx new.xlsx --key id --format json -o diff.json
+```
+Exit code is `0` when the datasets are identical and `1` when they differ —
+handy in CI and scripts.
+## Key options
+| Option | Purpose |
+|---|---|
+| `key` | Identity column, or list for a composite key |
+| `columns` / `ignore` | Restrict / exclude columns from comparison |
+| `column_map={"old":"new"}` | Handle renamed columns (avoids false add+remove) |
+| `rel_tol` / `abs_tol` | Floating-point tolerance for numeric columns |
+| `locale="us"` / `"eu"` | Disambiguate slash dates and number separators |
+| `null_equivalent` | Treat all null spellings as one value (default on) |
+| `type_aware` / `date_aware` | Toggle type/date-aware comparison |
+| `include_unchanged` | Retain unchanged rows in the result |
+| `on_duplicate` / `on_missing_key` | Policies for messy keys |
+## Output formats
+- `result.to_dict()` — JSON-serialisable structure
+- `result.to_markdown()` — report for PRs, chat, email
+- `result.to_html()` — standalone HTML diff report
+- `result.to_csv()` / `result.write_csv(path)` — one row per field change
+## Using with AI assistants
+See [`SKILL.md`](./SKILL.md) for LLM-oriented usage (decision tree, worked
+examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
+deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
+handling) so behaviour that looks surprising is not mistaken for a bug.
+## License
+MIT — see [`LICENSE`](./LICENSE).
+[`typemonkey`]: https://pypi.org/project/typemonkey/
+[`datemonkey`]: https://pypi.org/project/datemonkey/
+[`cleanmonkey`]: https://pypi.org/project/cleanmonkey/

diffmonkey-1.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+diffmonkey/__init__.py,sha256=24HiUU_WjZx9kJ5WCACQ8NVVtEeKDKrXgHQGIJ1wE-8,1259
+diffmonkey/cli.py,sha256=w4LP70cP0UeQpUxZSVQzBbMK0ZUpzFatelTjRPAj1VM,6508
+diffmonkey/comparators.py,sha256=P-7d9OJTCGFsb8ty3bJwq52gICnS3LNX11PWcOhYeK0,9722
+diffmonkey/compare.py,sha256=tZFIoKW6ZJRJbihVOQ50j4P2wQ3lpTnZjg56pto0xKw,9600
+diffmonkey/matching.py,sha256=zSkWf8tjUi-4NdFTdGk2Ofkpjw1t4y41xy_B4Ka-iK4,5294
+diffmonkey/models.py,sha256=Khdl5xuS6zZT7JrEm0V40SrUNfk-JyweIWH4fnZsZZg,6639
+diffmonkey/readers.py,sha256=ZdhCTUaiVWmlTFD_QSi1EUL9h_Ke4I9ltp8PW-IuxCw,4186
+diffmonkey/formatters/__init__.py,sha256=AOO53gkYHHwcXeIoRw_qg820cCRT5d0NVPWmmGSv5nc,461
+diffmonkey/formatters/csv_out.py,sha256=7bo7rZSGwUeg32RzgcwWbx2dGn5CktBbl6gZDKGSGdI,1807
+diffmonkey/formatters/html.py,sha256=lQGRSCjncNLp0AR9DEs0ki7JFLhvZRq0XYCY_q7ps-U,3427
+diffmonkey/formatters/markdown.py,sha256=SFnWZ0-nbh1lwmt_70Dfzv_qE5tE3N4vB4t2uEuBWFc,3271
+diffmonkey-1.0.0.dist-info/licenses/LICENSE,sha256=srNahN_Cxejm5SlFsCghF2Mml1gXgqlnuqWlDt7F1ck,1065
+diffmonkey-1.0.0.dist-info/METADATA,sha256=Vf1kiH8sR4u2jLL2WGo53Zyuvn6iam_sxWO2YPfI-eA,6320
+diffmonkey-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+diffmonkey-1.0.0.dist-info/entry_points.txt,sha256=CuC3tfsyD9w705dxxGpk41euAKTubnI3wOdL045BPcs,51
+diffmonkey-1.0.0.dist-info/top_level.txt,sha256=m7WLtVq4qOOi5_ygdJl9wsktjtIrd4F4sZPznl9W5yQ,11
+diffmonkey-1.0.0.dist-info/RECORD,,

diffmonkey-1.0.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

diffmonkey-1.0.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ diffmonkey = diffmonkey.cli:main

diffmonkey-1.0.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 RexBytes
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

diffmonkey-1.0.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ diffmonkey