PyPI - diffmonkey - Versions diffs - 1.0.0__tar.gz → 1.1.0__tar.gz - Mend

diffmonkey 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{diffmonkey-1.0.0/src/diffmonkey.egg-info → diffmonkey-1.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffmonkey
-Version: 1.0.0
+Version: 1.1.0
 Summary: Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports.
 Author-email: RexBytes <pythonic@rexbytes.com>
 License: MIT License
@@ -51,6 +51,8 @@ Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: hypothesis>=6.0; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
 Dynamic: license-file
 # diffmonkey
@@ -144,6 +146,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
 deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
 handling) so behaviour that looks surprising is not mistaken for a bug.
+## Contributing & quality
+diffmonkey is tested and reviewed against an explicit quality contract. See
+[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
+competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
+for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
+for the release rubric (`python scripts/readiness.py`).
 ## License
 MIT — see [`LICENSE`](./LICENSE).

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/README.md RENAMED Viewed

@@ -89,6 +89,14 @@ examples, anti-patterns). See [`LIMITATIONS.md`](./LIMITATIONS.md) for the
 deliberate design tradeoffs (date/locale ambiguity, null vocabulary, duplicate
 handling) so behaviour that looks surprising is not mistaken for a bug.
+## Contributing & quality
+diffmonkey is tested and reviewed against an explicit quality contract. See
+[`CONTRIBUTING.md`](./CONTRIBUTING.md) for the testing philosophy and the
+competitive multi-model review process, [`REVIEW_HISTORY.md`](./REVIEW_HISTORY.md)
+for the review trajectory, and [`RELEASE_READINESS.md`](./RELEASE_READINESS.md)
+for the release rubric (`python scripts/readiness.py`).
 ## License
 MIT — see [`LICENSE`](./LICENSE).

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "diffmonkey"
-version = "1.0.0"
+version = "1.1.0"
 description = "Type-aware, key-based structural diffing of tabular datasets with human- and machine-readable reports."
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -31,7 +31,7 @@ dependencies = [
 [project.optional-dependencies]
 excel = ["openpyxl>=3.0"]
 dsv = ["dsvmonkey"]
-dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
+dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0", "ruff", "mypy"]
 [project.scripts]
 diffmonkey = "diffmonkey.cli:main"
@@ -46,3 +46,17 @@ where = ["src"]
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["src"]
+[tool.ruff]
+src = ["src", "tests"]
+[tool.mypy]
+files = ["src"]
+python_version = "3.11"
+warn_unused_ignores = true
+# The rexbytes monkey libraries and openpyxl ship without type stubs; diffmonkey
+# pins their runtime behaviour with tests, not types.
+[[tool.mypy.overrides]]
+module = ["cleanmonkey", "datemonkey", "typemonkey", "openpyxl", "openpyxl.*"]
+ignore_missing_imports = true

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/__init__.py RENAMED Viewed

@@ -31,7 +31,7 @@ from .models import (
 )
 from .readers import read_csv, read_excel, read_table
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 __all__ = [
     "compare",

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/cli.py RENAMED Viewed

@@ -23,10 +23,19 @@ EXIT_DIFF = 1
 EXIT_ERROR = 2
+def _split_csv(value: str) -> list[str]:
+    return [part.strip() for part in value.split(",") if part.strip()]
 def _split_csv_opt(value: str | None) -> list[str] | None:
+    # An empty / all-blank option (e.g. --columns "" or a shell variable that
+    # expanded to nothing) means "unset", not "an explicit empty list". The
+    # latter would make _resolve_columns compare ZERO columns and exit 0,
+    # silently masking real differences from a CI gate.
     if value is None:
         return None
-    return [part.strip() for part in value.split(",") if part.strip()]
+    parts = _split_csv(value)
+    return parts or None
 def _parse_column_map(pairs: Sequence[str] | None) -> dict[str, str] | None:
@@ -83,7 +92,7 @@ def build_parser() -> argparse.ArgumentParser:
     )
     cmp.add_argument(
         "--include-unchanged", action="store_true",
-        help="Include unchanged rows in json/markdown output.",
+        help="Retain unchanged rows (listed in json output; counted elsewhere).",
     )
     cmp.add_argument(
         "--on-duplicate", choices=["warn", "first", "last", "error"], default="warn",
@@ -97,7 +106,7 @@ def build_parser() -> argparse.ArgumentParser:
     return parser
-def _render(result, fmt: str, *, include_unchanged: bool) -> str:
+def _render(result, fmt: str) -> str:
     if fmt == "summary":
         text = result.summary.one_line()
         if result.warnings:
@@ -131,7 +140,7 @@ def main(argv: Sequence[str] | None = None) -> int:
             result = compare(
                 old_rows,
                 new_rows,
-                key=_split_csv_opt(args.key),
+                key=_split_csv(args.key),
                 columns=_split_csv_opt(args.columns),
                 ignore=_split_csv_opt(args.ignore),
                 column_map=_parse_column_map(args.map),
@@ -148,7 +157,7 @@ def main(argv: Sequence[str] | None = None) -> int:
             print(f"diffmonkey: {exc}", file=sys.stderr)
             return EXIT_ERROR
-        text = _render(result, args.format, include_unchanged=args.include_unchanged)
+        text = _render(result, args.format)
         if args.output:
             try:
                 with open(args.output, "w", encoding="utf-8", newline="") as fh:

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/comparators.py RENAMED Viewed

@@ -95,15 +95,31 @@ class ColumnComparator:
     non-null and not already string-equal.
     """
-    def __init__(self, column: str, kind: str, settings: CompareSettings) -> None:
+    def __init__(
+        self,
+        column: str,
+        kind: str,
+        settings: CompareSettings,
+        *,
+        old_locale: str = "us",
+        new_locale: str = "us",
+    ) -> None:
         self.column = column
         self.kind = kind  # "numeric" | "date" | "boolean" | "string"
         self.settings = settings
+        # Number locale resolved per side (see _side_locale): when the caller
+        # left ``locale=None`` we honour typemonkey's auto-detection instead of
+        # forcing US, and each side keeps its own — so an ``old`` file in EU
+        # format and a ``new`` file in US format are each parsed correctly.
+        # (The date path is NOT per-side: it uses the single ``settings.locale``
+        # hint, so disambiguating DD/MM vs MM/DD across files still needs an
+        # explicit ``locale=`` — see LIMITATIONS.md and ``_as_date``.)
+        self.old_locale = old_locale
+        self.new_locale = new_locale
     # -- per-kind normalisation -------------------------------------------
-    def _as_number(self, value: Any) -> float | _Unparseable:
-        locale = self.settings.locale or "us"
+    def _as_number(self, value: Any, locale: str) -> float | _Unparseable:
         try:
             return float(typemonkey.parse_number(value, locale=locale).value)
         except (ValueError, TypeError):
@@ -121,6 +137,22 @@ class ColumnComparator:
         # whole point of a diff is that old and new differ), and per-value
         # parsing resolves DD/MM vs MM/DD via ``locale`` instead of guessing
         # from the majority and failing the minority. See LIMITATIONS.md.
+        #
+        # A bare number like ``"2024"`` is NOT a date a user writes; datemonkey
+        # would read it as an Excel serial day-number (``1`` -> 1900-01-01), so a
+        # numeric column compared against a date column would silently match the
+        # wrong day. Refuse SHORT bare numbers so the pair falls back to string
+        # comparison instead of accepting a serial-date interpretation. This
+        # covers integer (``"45000"``) AND float-form (``"45000.0"``, or the
+        # native float openpyxl returns for a number-formatted cell) serials.
+        # The length gate is on the INTEGER part: Excel serials top out at 7
+        # digits (``2958465`` is year 9999), so ``< 8`` integer digits blocks
+        # every serial while letting the 8-digit compact ISO ``YYYYMMDD`` parse.
+        text = str(value).strip()
+        core = text.lstrip("+-")
+        int_part, _, frac = core.partition(".")
+        if int_part.isdigit() and len(int_part) < 8 and (frac == "" or frac.isdigit()):
+            return UNPARSEABLE
         try:
             batch = datemonkey.parse_dates(
                 [value], locale_preference=self.settings.locale
@@ -152,9 +184,9 @@ class ColumnComparator:
             return True
         if self.kind == "numeric":
-            a = self._as_number(old)
-            b = self._as_number(new)
-            if a is UNPARSEABLE or b is UNPARSEABLE:
+            a = self._as_number(old, self.old_locale)
+            b = self._as_number(new, self.new_locale)
+            if isinstance(a, _Unparseable) or isinstance(b, _Unparseable):
                 return _strings_equal(old, new, s)
             return math.isclose(a, b, rel_tol=s.rel_tol, abs_tol=s.abs_tol)
@@ -181,7 +213,9 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
     Returns ``"empty"`` when the side has no values to judge (so the other
     side decides). ``preserve_as_string`` columns (leading-zero IDs, zips,
-    phone numbers) report ``"string"`` so they are never numeric-compared.
+    phone numbers) report ``"preserve"`` — a *dominant* string kind that
+    :func:`_reconcile` will not let the other side out-vote, so they are never
+    numeric-compared even when the opposite side parses as a number.
     """
     values = list(values)
     if not values:
@@ -190,7 +224,7 @@ def _side_kind(values: Sequence[Any], settings: CompareSettings) -> str:
         values, null_values=settings.null_values, locale=settings.locale
     )
     if profile.preserve_as_string:
-        return "string"
+        return "preserve"
     t = profile.type
     if t in NUMERIC_TYPES:
         return "numeric"
@@ -211,13 +245,17 @@ def _reconcile(old_kind: str, new_kind: str, settings: CompareSettings) -> str:
     is exactly what we must not treat as a change. So we infer per side and
     pick the more specific kind, trusting :meth:`ColumnComparator.equal` to
     fall back to string comparison for any individual pair that cannot be
-    parsed under that kind. Precedence: **date** (if ``date_aware``) > numeric.
-    Boolean requires *both* sides to look boolean, so an integer column versus
-    a ``true``/``false`` column is not silently equated (``1`` == ``true``).
+    parsed under that kind. Precedence: **preserve** (leading-zero IDs etc.,
+    dominant — never numeric-compared, even against a numeric side) > **date**
+    (if ``date_aware``) > numeric. Boolean requires *both* sides to look
+    boolean, so an integer column versus a ``true``/``false`` column is not
+    silently equated (``1`` == ``true``).
     """
     kinds = {old_kind, new_kind} - {"empty"}
     if not kinds:
         return "string"
+    if "preserve" in kinds:
+        return "string"
     if "date" in kinds and settings.date_aware:
         return "date"
     if "numeric" in kinds:
@@ -238,6 +276,27 @@ def _infer_kind(
     )
+def _side_locale(values: Sequence[Any], settings: CompareSettings) -> str:
+    """Resolve the number-parsing locale for one side.
+    Honours an explicit ``settings.locale`` when given; otherwise auto-detects
+    it from the values via ``typemonkey.infer_type`` (whose ``profile.locale``
+    distinguishes ``"1.234,56"`` EU from ``"1,234.56"`` US), falling back to
+    ``"us"`` when there is nothing to detect. This is what makes the documented
+    ``locale=None`` "auto-detect" promise true for numbers — the comparator must
+    not silently force ``"us"`` and mis-compare EU-formatted data.
+    """
+    if settings.locale:
+        return settings.locale
+    values = list(values)
+    if not values:
+        return "us"
+    profile = typemonkey.infer_type(
+        values, null_values=settings.null_values, locale=None
+    )
+    return getattr(profile, "locale", None) or "us"
 def make_comparator(
     column: str,
     old_values: Sequence[Any],
@@ -248,12 +307,21 @@ def make_comparator(
     When ``settings.type_aware`` is false the comparator is a pure
     string/null comparator. Otherwise the column's kind is inferred from both
-    sides combined; date columns then compare by parsed calendar date (each
-    value parsed independently with the ``locale`` hint) so that an ``old``
-    column in ``DD/MM/YYYY`` and a ``new`` column in ISO are not false diffs.
+    sides combined; date columns compare by parsed calendar date, each value
+    parsed independently with the ``locale`` hint. An ``old`` column in an
+    unambiguous format and a ``new`` column in another (e.g. ISO) are not false
+    diffs; an *ambiguous* mix such as ``DD/MM/YYYY`` vs ISO needs an explicit
+    ``locale=`` to disambiguate (see LIMITATIONS.md) — unlike numbers, the date
+    path does not auto-detect a per-side locale.
     """
     if not settings.type_aware:
         return ColumnComparator(column, "string", settings)
     kind = _infer_kind(old_values, new_values, settings)
-    return ColumnComparator(column, kind, settings)
+    return ColumnComparator(
+        column,
+        kind,
+        settings,
+        old_locale=_side_locale(old_values, settings),
+        new_locale=_side_locale(new_values, settings),
+    )

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/compare.py RENAMED Viewed

@@ -12,7 +12,7 @@ from __future__ import annotations
 from typing import Any, Iterable, Mapping, Sequence
 from .comparators import CompareSettings, make_comparator
-from .matching import index_rows, make_key, validate_policies
+from .matching import index_rows, validate_policies
 from .models import DiffResult, DiffSummary, FieldChange, RowDiff
 Row = Mapping[str, Any]
@@ -38,17 +38,27 @@ def _apply_column_map(
     sides share the new namespace. If both ``old_name`` and ``new_name`` are
     present in a row, the explicitly-new value wins (the rename target is not
     clobbered).
+    Renames are resolved against each row's *original* column names in a single
+    pass, so they never chain: ``{"a": "b", "b": "c"}`` renames the original
+    ``a``→``b`` and the original ``b``→``c`` independently — it does not rename a
+    freshly-produced ``b`` on to ``c``.
     """
     if not column_map:
         return [dict(r) for r in rows]
     out: list[dict[str, Any]] = []
     for r in rows:
         rd = dict(r)
-        for old_name, new_name in column_map.items():
-            if old_name in rd:
-                value = rd.pop(old_name)
-                rd.setdefault(new_name, value)
-        out.append(rd)
+        # Columns whose names are taken literally (not rename sources); an
+        # explicit value already under a rename target lives here and must win.
+        literal_targets = {c for c in rd if c not in column_map}
+        renamed: dict[str, Any] = {}
+        for col, value in rd.items():
+            target = column_map.get(col, col)
+            if col in column_map and target in literal_targets:
+                continue  # explicit-new value present elsewhere wins
+            renamed.setdefault(target, value)
+        out.append(renamed)
     return out
@@ -108,9 +118,12 @@ def compare(
         key: Identity column name, or a sequence of names for a composite key.
             Names refer to the *new* namespace (i.e. after ``column_map``).
         columns: Restrict comparison to these columns. Default: every non-key
-            column seen on either side.
+            column seen on either side. Like ``key``, names are in the *new*
+            namespace (i.e. after ``column_map``) — pass the renamed name, not
+            the original, or the column will match nothing and its change be
+            missed.
         ignore: Columns to exclude from comparison (timestamps, audit fields,
-            row numbers). Applied after ``columns``.
+            row numbers), also in the *new* namespace. Applied after ``columns``.
         column_map: ``{old_name: new_name}`` renames applied to ``old`` rows so
             renamed columns are not reported as removed+added.
         rel_tol: Relative floating-point tolerance for numeric columns

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/csv_out.py RENAMED Viewed

@@ -10,8 +10,8 @@ analyst can pivot/filter the diff. The schema is fixed:
 * ``key`` is the row key rendered as ``col=value`` (joined by ``; `` for
   composite keys).
 * For ``added``/``removed`` rows there is no per-field breakdown, so one row is
-  emitted with empty ``column``/``old``/``new`` (added) or ``column``/``new``
-  empty (removed) — the presence of the row is the change.
+  emitted with ``column``/``old``/``new`` all empty — the row's presence under
+  ``change_type`` (and its key) is the change.
 """
 from __future__ import annotations
@@ -26,8 +26,13 @@ HEADER = ["change_type", "key", "column", "old", "new"]
 def _key_str(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
+    # Render a None key component as the (null) sentinel, like the markdown and
+    # HTML reports. A None key arises from on_missing_key="warn" (the default);
+    # leaving it "" would make it indistinguishable from a genuine empty-string
+    # key in the CSV.
     return "; ".join(
-        f"{col}={'' if val is None else val}" for col, val in zip(key_columns, rd.key)
+        f"{col}={'(null)' if val is None else val}"
+        for col, val in zip(key_columns, rd.key)
     )

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/html.py RENAMED Viewed

@@ -35,7 +35,7 @@ def _cell(value: Any) -> str:
 def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
     return escape(
         ", ".join(
-            f"{col}={'' if val is None else val}"
+            f"{col}={'(null)' if val is None else val}"
             for col, val in zip(key_columns, rd.key)
         )
     )

{diffmonkey-1.0.0 → diffmonkey-1.1.0}/src/diffmonkey/formatters/markdown.py RENAMED Viewed

@@ -8,6 +8,7 @@ downstream scripts can rely on its shape.
 from __future__ import annotations
+import re
 from typing import Any
 from ..models import DiffResult, RowDiff
@@ -20,14 +21,35 @@ def _fmt(value: Any) -> str:
     return str(value)
+def _code(value: Any) -> str:
+    """Wrap a value in a backtick code span that survives embedded backticks.
+    A naive ```{value}``` breaks when the value itself contains a
+    backtick (the span closes early and the rest renders as garbled text). Per
+    CommonMark, a code span may be fenced by any number of backticks, so we pick
+    a fence one longer than the longest run inside the value and pad with spaces
+    when needed — lossless, no character substitution.
+    """
+    text = _fmt(value)
+    if "`" not in text:
+        return f"`{text}`"
+    longest = max(len(run) for run in re.findall(r"`+", text))
+    fence = "`" * (longest + 1)
+    return f"{fence} {text} {fence}"
 def _key_label(rd: RowDiff, key_columns: tuple[str, ...]) -> str:
     parts = [f"{col}={_fmt(val)}" for col, val in zip(key_columns, rd.key)]
     return ", ".join(parts)
 def _limit(rows: list[RowDiff], max_rows: int | None) -> tuple[list[RowDiff], int]:
-    """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``."""
-    if max_rows is None or len(rows) <= max_rows:
+    """Return ``(shown_rows, hidden_count)`` honouring ``max_rows``.
+    A negative ``max_rows`` means "no limit" (same as ``None``); it never
+    produces a phantom "N more" notice from slicing with a negative bound.
+    """
+    if max_rows is None or max_rows < 0 or len(rows) <= max_rows:
         return rows, 0
     return rows[:max_rows], len(rows) - max_rows
@@ -58,7 +80,7 @@ def render(result: DiffResult, *, max_rows: int | None = None) -> str:
     for rd in shown:
         lines.append(f"### {_key_label(rd, result.key_columns)}")
         for ch in rd.changes:
-            lines.append(f"- **{ch.column}**: `{_fmt(ch.old)}` → `{_fmt(ch.new)}`")
+            lines.append(f"- **{ch.column}**: {_code(ch.old)} → {_code(ch.new)}")
         lines.append("")
     if hidden:
         lines.append(f"_… and {hidden} more changed row(s)._")

diffmonkey-1.1.0/src/diffmonkey/readers.py ADDED Viewed

@@ -0,0 +1,192 @@
+"""Read tabular inputs into lists of dicts for :func:`diffmonkey.compare`.
+This module exists so the CLI (and callers who start from files) can hand
+``compare()`` the list-of-dicts it expects, regardless of source format. DSV
+files (CSV/TSV/pipe/…) are read with the stdlib ``csv`` module — robust and
+dependency-free. Excel is *optional*: if ``openpyxl`` is installed we use it,
+otherwise :func:`read_excel` raises a clear, actionable error rather than
+failing obscurely.
+"""
+from __future__ import annotations
+import csv
+import io
+import os
+from typing import Any
+DELIMITER_BY_EXT = {
+    ".csv": ",",
+    ".tsv": "\t",
+    ".tab": "\t",
+    ".psv": "|",
+    ".pipe": "|",
+}
+def read_csv(
+    path: str,
+    *,
+    delimiter: str | None = None,
+    encoding: str = "utf-8-sig",
+) -> list[dict[str, Any]]:
+    """Read a delimited file into a list of dicts using the stdlib ``csv``.
+    ``encoding`` defaults to ``utf-8-sig`` so a leading BOM is stripped from the
+    first header. ``delimiter`` defaults to the one implied by the file
+    extension (``,`` for unknown extensions). All cell values are strings; type
+    awareness happens later in :func:`compare`. A row with *more* fields than the
+    header raises :class:`ValueError` rather than silently bucketing the overflow
+    under a ``None`` key (which would smuggle a non-string, non-column cell into
+    the diff).
+    Note: *duplicate* header names collapse to one column (stdlib
+    ``csv.DictReader`` keeps the last) — unlike :func:`read_excel`, which
+    de-duplicates. A delimited file with repeated headers is malformed; fix it
+    upstream or read it with :func:`read_excel` if it came from a spreadsheet.
+    """
+    if delimiter is None:
+        ext = os.path.splitext(path)[1].lower()
+        delimiter = DELIMITER_BY_EXT.get(ext, ",")
+    with open(path, "r", encoding=encoding, newline="") as fh:
+        return _read_dictreader(csv.DictReader(fh, delimiter=delimiter), path)
+_OVERFLOW_KEY = "__diffmonkey_overflow__"
+def _read_dictreader(reader: "csv.DictReader[str]", source: str) -> list[dict[str, Any]]:
+    """Materialise a ``DictReader``, rejecting rows wider than the header.
+    ``csv.DictReader`` would otherwise stash extra fields in a list under the
+    ``None`` key; that yields a non-string key and a list value that flow
+    straight into :func:`compare`, contradicting the "all cell values are
+    strings" contract.
+    Overflow is detected by the bucket being a *list* (DictReader only ever puts
+    a list under ``restkey``), not merely by the key's presence — so a file that
+    legitimately has a column literally named ``__diffmonkey_overflow__`` is not
+    mistaken for a ragged row.
+    """
+    reader.restkey = _OVERFLOW_KEY
+    rows: list[dict[str, Any]] = []
+    for n, r in enumerate(reader, start=2):  # row 1 is the header
+        if isinstance(r.get(_OVERFLOW_KEY), list):
+            extra = r[_OVERFLOW_KEY]
+            raise ValueError(
+                f"{source}: row {n} has more fields than the header "
+                f"({len(reader.fieldnames or ())} columns); extra values {extra!r}"
+            )
+        rows.append(dict(r))
+    return rows
+def _unique_headers(names: list[str]) -> list[str]:
+    """Make header names unique so no two columns share a dict key and one
+    silently overwrites the other.
+    A repeated name gets the first free ``_2``/``_3``/… suffix, in order of
+    appearance. This covers both an injected ``column_N`` (for a blank header)
+    clashing with a real column literally named ``column_N`` and two equal
+    literal headers; the first occurrence keeps the name, later ones are
+    suffixed, so every column's data survives under a distinct key.
+    """
+    assigned: set[str] = set()
+    out: list[str] = []
+    for name in names:
+        if name not in assigned:
+            assigned.add(name)
+            out.append(name)
+            continue
+        n = 2
+        while f"{name}_{n}" in assigned:
+            n += 1
+        candidate = f"{name}_{n}"
+        assigned.add(candidate)
+        out.append(candidate)
+    return out
+def read_excel(path: str, *, sheet: str | int | None = None) -> list[dict[str, Any]]:
+    """Read the first (or named) worksheet into a list of dicts.
+    Requires the ``excel`` extra (``pip install diffmonkey[excel]``). The first
+    row is treated as the header. Raises :class:`RuntimeError` with install
+    guidance if ``openpyxl`` is unavailable.
+    Empty header cells are given stable positional names (``column_1``,
+    ``column_2``, …) rather than a shared ``""`` key, so a blank or ragged
+    header never silently collapses several data columns onto one key and loses
+    values. ``openpyxl``'s ``read_only`` mode pads every row to the sheet's
+    widest, so an over-wide data row turns the trailing header padding into such
+    a positional column instead of dropping the stray cell.
+    """
+    try:
+        from openpyxl import load_workbook
+    except ImportError as exc:  # pragma: no cover - exercised via monkeypatch
+        raise RuntimeError(
+            "Reading Excel files requires openpyxl. "
+            "Install it with: pip install diffmonkey[excel]"
+        ) from exc
+    wb = load_workbook(path, read_only=True, data_only=True)
+    ws = wb[sheet] if isinstance(sheet, str) else (
+        wb.worksheets[sheet] if isinstance(sheet, int) else wb.active
+    )
+    rows_iter = ws.iter_rows(values_only=True)
+    try:
+        header = next(rows_iter)
+    except StopIteration:
+        return []
+    headers = _unique_headers(
+        [(f"column_{i + 1}" if h is None or str(h) == "" else str(h))
+         for i, h in enumerate(header)]
+    )
+    out: list[dict[str, Any]] = []
+    for raw in rows_iter:
+        out.append({headers[i]: raw[i] if i < len(raw) else None for i in range(len(headers))})
+    wb.close()
+    return out
+EXCEL_EXTENSIONS = (".xlsx", ".xlsm", ".xltx", ".xltm")
+def read_table(
+    path: str,
+    *,
+    delimiter: str | None = None,
+    encoding: str = "utf-8-sig",
+    sheet: str | int | None = None,
+) -> list[dict[str, Any]]:
+    """Read ``path`` by dispatching on its extension.
+    ``.xlsx``/``.xlsm``/``.xltx``/``.xltm`` go to :func:`read_excel`; everything
+    else is treated as delimited text via :func:`read_csv`. Arguments are routed
+    to the format that understands them: ``delimiter``/``encoding`` apply to
+    delimited text only, ``sheet`` to Excel only. Passing ``delimiter`` for an
+    Excel input raises :class:`ValueError` rather than failing obscurely deep in
+    the Excel reader.
+    """
+    ext = os.path.splitext(path)[1].lower()
+    if ext in EXCEL_EXTENSIONS:
+        if delimiter is not None:
+            raise ValueError(
+                f"delimiter is not applicable to Excel input {path!r}"
+            )
+        return read_excel(path, sheet=sheet)
+    if sheet is not None:
+        raise ValueError(f"sheet is not applicable to delimited input {path!r}")
+    return read_csv(path, delimiter=delimiter, encoding=encoding)
+def read_csv_string(
+    text: str, *, delimiter: str = ","
+) -> list[dict[str, Any]]:
+    """Parse a delimited *string* into a list of dicts (used in tests/pipes).
+    Like :func:`read_csv`, a row with more fields than the header raises
+    :class:`ValueError` rather than producing a ``None``-keyed overflow cell.
+    """
+    reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
+    return _read_dictreader(reader, "<string>")

diffmonkey 1.0.0__tar.gz → 1.1.0__tar.gz

diffmonkey 1.0.0__tar.gz → 1.1.0__tar.gz