PyPI - proccompy - Versions diffs - 0.4.1__py3-none-any.whl - Mend

proccompy 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

proccompy/__init__.py +37 -0
proccompy/cli.py +334 -0
proccompy/compare.py +100 -0
proccompy/engine_duckdb.py +378 -0
proccompy/lst_report.py +372 -0
proccompy/report.py +115 -0
proccompy/tolerance.py +80 -0
proccompy/types.py +266 -0
proccompy-0.4.1.dist-info/METADATA +284 -0
proccompy-0.4.1.dist-info/RECORD +13 -0
proccompy-0.4.1.dist-info/WHEEL +4 -0
proccompy-0.4.1.dist-info/entry_points.txt +2 -0
proccompy-0.4.1.dist-info/licenses/LICENSE +202 -0

proccompy/__init__.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""
+proccompy: SAS PROC COMPARE-style DataFrame comparison for Python.
+Public API:
+    compare()        - top-level entry point
+    T                - tolerance factory (T.exact, T.absolute, T.percent)
+    CompareResult    - result object with summary(), unequal_rows(),
+                       diff_dataset(), report(), assert_matches()
+"""
+from .compare import compare
+from .tolerance import T, Tolerance, Exact, Absolute, Percent
+from .types import (
+    CompareResult,
+    ColumnSummary,
+    TYPE_BASE,
+    TYPE_COMPARE,
+    TYPE_DIF,
+    TYPE_PERCENT,
+)
+# Package version string; cli.py imports it for the --version option.
+__version__ = "0.4.1"
+# Names exported by `from proccompy import *`; one entry per name imported
+# above from .compare, .tolerance and .types.
+__all__ = [
+    "compare",
+    "T",
+    "Tolerance",
+    "Exact",
+    "Absolute",
+    "Percent",
+    "CompareResult",
+    "ColumnSummary",
+    "TYPE_BASE",
+    "TYPE_COMPARE",
+    "TYPE_DIF",
+    "TYPE_PERCENT",
+]

proccompy/cli.py ADDED Viewed

@@ -0,0 +1,334 @@
+"""
+Command-line interface for proccompy.
+Usage:
+    proccompy BASE COMPARE --id ID_COLS [options]
+Reads two tabular files (parquet, csv, tsv), runs compare(), prints the
+report to stdout, optionally writes structured outputs to disk, and exits
+0 if the datasets match or 1 if they differ. The exit code makes the CLI
+usable as a CI/cron validation gate:
+    proccompy expected.parquet actual.parquet --id account_id && deploy.sh
+Tolerances are specified per-column via repeated --tolerance flags using
+a compact string syntax: "column=method:value"
+    --tolerance "dollar_amt=abs:0.01"
+    --tolerance "rate=pct:0.001"
+    --tolerance "category=exact"
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import Optional
+import click
+import polars as pl
+from . import __version__
+from .compare import compare as _compare
+from .tolerance import T, Tolerance
+# ----------------------------------------------------------------------
+# Input loading: format autodetection by extension, --format override
+# ----------------------------------------------------------------------
+SUPPORTED_FORMATS = ("parquet", "csv", "tsv")
+def _infer_format(path: Path, fmt_override: Optional[str]) -> str:
+    """
+    Infer file format from extension or directory contents.
+    Directories are always treated as parquet datasets (Spark/Athena-style
+    multi-file output is the dominant directory pattern). CSV/TSV
+    directories aren't supported because the semantics get murky (headers
+    in every file? consistent schema? join order?). If anyone asks, we'll
+    add it.
+    """
+    if fmt_override:
+        if path.is_dir() and fmt_override != "parquet":
+            raise click.UsageError(
+                f"Directory input is only supported for parquet format, "
+                f"got --format {fmt_override!r}"
+            )
+        return fmt_override
+    if path.is_dir():
+        return "parquet"
+    ext = path.suffix.lower().lstrip(".")
+    if ext in SUPPORTED_FORMATS:
+        return ext
+    raise click.UsageError(
+        f"Cannot infer format from extension {path.suffix!r}. "
+        f"Use --format with one of: {', '.join(SUPPORTED_FORMATS)}"
+    )
+def _read_frame(path: Path, fmt: str, hive: bool = True) -> pl.DataFrame:
+    """
+    Load a tabular file (or directory of parquet files) as a polars DataFrame.
+    For parquet, both single files and directories are accepted. Directories
+    are read as multi-file datasets; if Hive-style subdirectories are
+    present (e.g. year=2024/month=01/), the partition columns are added
+    to the frame when hive=True (the default).
+    """
+    if fmt == "parquet":
+        if path.is_dir():
+            try:
+                return pl.read_parquet(path, hive_partitioning=hive)
+            except Exception as e:
+                # Most likely cause: schema mismatch across files in the dir.
+                # Surface as a clean CLI message instead of a polars traceback.
+                raise click.UsageError(
+                    f"Failed to read parquet directory {path}: {e}"
+                )
+        return pl.read_parquet(path)
+    if fmt == "csv":
+        return pl.read_csv(path)
+    if fmt == "tsv":
+        return pl.read_csv(path, separator="\t")
+    raise click.UsageError(f"Unsupported format: {fmt!r}")
+# ----------------------------------------------------------------------
+# Tolerance string parser
+# ----------------------------------------------------------------------
+def _parse_tolerance(spec: str) -> tuple[str, Tolerance]:
+    """
+    Parse a tolerance spec like:
+        "column=method:value"
+        "column=exact"
+    Returns (column_name, Tolerance instance).
+    """
+    if "=" not in spec:
+        raise click.BadParameter(
+            f"Tolerance spec {spec!r} must contain '='. "
+            "Format: column=method:value (e.g. amount=abs:0.01) "
+            "or column=exact"
+        )
+    col, rhs = spec.split("=", 1)
+    col = col.strip()
+    rhs = rhs.strip()
+    if not col:
+        raise click.BadParameter(f"Empty column name in {spec!r}")
+    # exact has no value
+    if rhs.lower() == "exact":
+        return col, T.exact()
+    if ":" not in rhs:
+        raise click.BadParameter(
+            f"Tolerance spec {spec!r}: method needs a value "
+            "(e.g. abs:0.01 or pct:0.001), or use 'exact'"
+        )
+    method, value_str = rhs.split(":", 1)
+    method = method.strip().lower()
+    try:
+        value = float(value_str.strip())
+    except ValueError:
+        raise click.BadParameter(
+            f"Tolerance value {value_str!r} for column {col!r} is not numeric"
+        )
+    if method in ("abs", "absolute"):
+        return col, T.absolute(value)
+    if method in ("pct", "percent"):
+        return col, T.percent(value)
+    raise click.BadParameter(
+        f"Unknown tolerance method {method!r} in {spec!r}. "
+        "Use 'abs', 'pct', or 'exact'."
+    )
+# ----------------------------------------------------------------------
+# CLI command
+# ----------------------------------------------------------------------
+@click.command(
+    name="proccompy",
+    context_settings={"help_option_names": ["-h", "--help"]},
+)
+@click.argument(
+    "base",
+    type=click.Path(exists=True, path_type=Path),
+)
+@click.argument(
+    "compare_path",                            # 'compare' shadows our import
+    type=click.Path(exists=True, path_type=Path),
+    metavar="COMPARE",
+)
+@click.option(
+    "--id", "id_columns",
+    required=True,
+    metavar="COLS",
+    help="Comma-separated ID column(s). Example: --id account_id or --id sector,naics,year",
+)
+@click.option(
+    "--tolerance", "tolerances",
+    multiple=True,
+    metavar="SPEC",
+    help=(
+        "Per-column tolerance. Repeatable. Format: column=method:value, "
+        "where method is 'abs' (absolute), 'pct' (percent), or 'exact' "
+        "(no value). Examples: amount=abs:0.01, rate=pct:0.001, category=exact."
+    ),
+)
+@click.option(
+    "--format", "fmt",
+    type=click.Choice(SUPPORTED_FORMATS, case_sensitive=False),
+    default=None,
+    help="Override input format. Default: infer from file extension, "
+         "or 'parquet' for directories.",
+)
+@click.option(
+    "--no-hive",
+    "no_hive",
+    is_flag=True,
+    default=False,
+    help="For parquet directory input, disable Hive partition column "
+         "discovery. By default, year=2024/month=01/ subdirectories "
+         "produce 'year' and 'month' columns in the frame.",
+)
+@click.option(
+    "--base-name",
+    default="base",
+    show_default=True,
+    help="Label for the base dataset in the report.",
+)
+@click.option(
+    "--compare-name",
+    default="compare",
+    show_default=True,
+    help="Label for the compare dataset in the report.",
+)
+@click.option(
+    "--lst", "lst_path",
+    type=click.Path(dir_okay=False, path_type=Path),
+    default=None,
+    help="Write SAS-style .lst report to this path.",
+)
+@click.option(
+    "--text", "text_path",
+    type=click.Path(dir_okay=False, path_type=Path),
+    default=None,
+    help="Write plain-text terminal-style report to this path.",
+)
+@click.option(
+    "--summary-csv", "summary_csv_path",
+    type=click.Path(dir_okay=False, path_type=Path),
+    default=None,
+    help="Write the per-column summary as CSV.",
+)
+@click.option(
+    "--diff-parquet", "diff_parquet_path",
+    type=click.Path(dir_okay=False, path_type=Path),
+    default=None,
+    help="Write the SAS OUT=-style diff dataset as parquet (unequal rows only).",
+)
+@click.option(
+    "--quiet", "-q",
+    is_flag=True,
+    help="Suppress the stdout report. Exit code still reflects match/mismatch.",
+)
+@click.version_option(version=__version__, prog_name="proccompy")
+def cli(
+    base: Path,
+    compare_path: Path,
+    id_columns: str,
+    tolerances: tuple[str, ...],
+    fmt: Optional[str],
+    no_hive: bool,
+    base_name: str,
+    compare_name: str,
+    lst_path: Optional[Path],
+    text_path: Optional[Path],
+    summary_csv_path: Optional[Path],
+    diff_parquet_path: Optional[Path],
+    quiet: bool,
+):
+    """
+    Compare two tabular files and produce a structured report.
+    BASE and COMPARE may be parquet, csv, or tsv files. Format is inferred
+    from extension; use --format to override.
+    The exit code is 0 if the datasets match (within any specified
+    tolerances), 1 if they differ in any compared value, row presence, or
+    column overlap. This makes the CLI usable as a CI/cron gate.
+    Examples:
+      proccompy expected.parquet actual.parquet --id id
+      proccompy old.csv new.csv --id "sector,year" \\
+          --tolerance "amount=abs:0.01" --tolerance "rate=pct:0.001"
+      proccompy a.parquet b.parquet --id id --lst report.lst --quiet
+    """
+    # Parse ID columns
+    id_cols = [c.strip() for c in id_columns.split(",") if c.strip()]
+    if not id_cols:
+        raise click.UsageError("--id requires at least one column name")
+    # Parse tolerances
+    tol_map: dict[str, Tolerance] = {}
+    for spec in tolerances:
+        col, tol = _parse_tolerance(spec)
+        if col in tol_map:
+            raise click.UsageError(f"Tolerance for column {col!r} specified twice")
+        tol_map[col] = tol
+    # Load frames
+    base_fmt = _infer_format(base, fmt)
+    compare_fmt = _infer_format(compare_path, fmt)
+    hive = not no_hive
+    base_df = _read_frame(base, base_fmt, hive=hive)
+    compare_df = _read_frame(compare_path, compare_fmt, hive=hive)
+    # Run comparison. Surface common errors as clean CLI messages, not tracebacks.
+    try:
+        result = _compare(
+            base=base_df,
+            compare=compare_df,
+            id_columns=id_cols,
+            tolerances=tol_map,
+            base_name=base_name,
+            compare_name=compare_name,
+        )
+    except ValueError as e:
+        raise click.UsageError(str(e))
+    # Stdout report (unless suppressed)
+    if not quiet:
+        click.echo(result.report())
+    # File outputs
+    if lst_path:
+        result.to_lst(str(lst_path))
+        if not quiet:
+            click.echo(f"Wrote {lst_path}", err=True)
+    if text_path:
+        result.to_text(str(text_path))
+        if not quiet:
+            click.echo(f"Wrote {text_path}", err=True)
+    if summary_csv_path:
+        result.summary().write_csv(str(summary_csv_path))
+        if not quiet:
+            click.echo(f"Wrote {summary_csv_path}", err=True)
+    if diff_parquet_path:
+        result.diff_dataset(only_unequal=True).write_parquet(str(diff_parquet_path))
+        if not quiet:
+            click.echo(f"Wrote {diff_parquet_path}", err=True)
+    # Exit code drives CI integration: 0 = match, 1 = differ
+    sys.exit(0 if result.matches else 1)
+if __name__ == "__main__":
+    cli()

proccompy/compare.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""
+Top-level compare() entry point: engine-agnostic wrapper.
+In v0.1 only the DuckDB engine is implemented. Polars-native and Pandas-native
+engines will follow.
+"""
+from __future__ import annotations
+from typing import Any, Mapping, Optional, Sequence, Union
+from .engine_duckdb import compare_duckdb
+from .tolerance import Tolerance
+from .types import CompareResult
+from .report import render_report
+from .lst_report import render_lst
+def compare(
+    base: Any,
+    compare: Any,
+    id_columns: Union[str, Sequence[str]],
+    var_columns: Optional[Sequence[str]] = None,
+    tolerances: Optional[Mapping[str, Tolerance]] = None,
+    null_eq_null: bool = True,
+    duplicate_strategy: str = "strict",
+    base_name: str = "base",
+    compare_name: str = "compare",
+    engine: str = "duckdb",
+) -> CompareResult:
+    """
+    Compare two DataFrames.
+    See engine_duckdb.compare_duckdb for parameter docs.
+    """
+    if engine == "duckdb":
+        return compare_duckdb(
+            base=base,
+            compare=compare,
+            id_columns=id_columns,
+            var_columns=var_columns,
+            tolerances=tolerances,
+            null_eq_null=null_eq_null,
+            duplicate_strategy=duplicate_strategy,
+            base_name=base_name,
+            compare_name=compare_name,
+        )
+    raise NotImplementedError(
+        f"engine={engine!r} not implemented in v0.1. Use engine='duckdb'."
+    )
+# Attach render_report as a method on CompareResult for convenience.
+def _report_method(self, style: str = "proc_compare") -> str:
+    return render_report(self, style=style)
+CompareResult.report = _report_method  # type: ignore[attr-defined]
+def _to_lst_method(
+    self,
+    path: str,
+    n_sample: int = 50,
+    width: int = 132,
+) -> str:
+    """
+    Write a SAS-style .lst report to disk.
+    Parameters
+    ----------
+    path : output file path. Conventionally ends in .lst.
+    n_sample : max number of unequal observations to detail in the
+               per-observation section. Set to 0 to skip that section.
+    width : page width in characters. SAS default is 132.
+    Returns the rendered string (also for testing / inspection).
+    """
+    text = render_lst(self, n_sample=n_sample, width=width)
+    # ASCII encoding intentional: real .lst files are ASCII. Anything
+    # non-ASCII in user data (UTF-8 names, etc.) is replaced with '?'
+    # so the file stays SAS-compatible.
+    with open(path, "w", encoding="ascii", errors="replace", newline="\n") as f:
+        f.write(text)
+    return text
+CompareResult.to_lst = _to_lst_method  # type: ignore[attr-defined]
+def _to_text_method(self, path: str, style: str = "proc_compare") -> str:
+    """
+    Write the terminal-style report to a UTF-8 text file.
+    Use this when you want the same output as report() but persisted.
+    For SAS-style fixed-width output, use to_lst() instead.
+    """
+    text = render_report(self, style=style)
+    with open(path, "w", encoding="utf-8", newline="\n") as f:
+        f.write(text)
+    return text
+CompareResult.to_text = _to_text_method  # type: ignore[attr-defined]