proccompy 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
proccompy/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ """
2
+ proccompy: SAS PROC COMPARE-style DataFrame comparison for Python.
3
+
4
+ Public API:
5
+ compare() - top-level entry point
6
+ T - tolerance factory (T.exact, T.absolute, T.percent)
7
+ CompareResult - result object with summary(), unequal_rows(),
8
+ diff_dataset(), report(), assert_matches()
9
+ """
10
+
11
+ from .compare import compare
12
+ from .tolerance import T, Tolerance, Exact, Absolute, Percent
13
+ from .types import (
14
+ CompareResult,
15
+ ColumnSummary,
16
+ TYPE_BASE,
17
+ TYPE_COMPARE,
18
+ TYPE_DIF,
19
+ TYPE_PERCENT,
20
+ )
21
+
22
+ __version__ = "0.4.1"
23
+
24
+ __all__ = [
25
+ "compare",
26
+ "T",
27
+ "Tolerance",
28
+ "Exact",
29
+ "Absolute",
30
+ "Percent",
31
+ "CompareResult",
32
+ "ColumnSummary",
33
+ "TYPE_BASE",
34
+ "TYPE_COMPARE",
35
+ "TYPE_DIF",
36
+ "TYPE_PERCENT",
37
+ ]
proccompy/cli.py ADDED
@@ -0,0 +1,334 @@
1
+ """
2
+ Command-line interface for proccompy.
3
+
4
+ Usage:
5
+ proccompy BASE COMPARE --id ID_COLS [options]
6
+
7
+ Reads two tabular files (parquet, csv, tsv), runs compare(), prints the
8
+ report to stdout, optionally writes structured outputs to disk, and exits
9
+ 0 if the datasets match or 1 if they differ. The exit code makes the CLI
10
+ usable as a CI/cron validation gate:
11
+
12
+ proccompy expected.parquet actual.parquet --id account_id && deploy.sh
13
+
14
+ Tolerances are specified per-column via repeated --tolerance flags using
15
+ a compact string syntax: "column=method:value"
16
+
17
+ --tolerance "dollar_amt=abs:0.01"
18
+ --tolerance "rate=pct:0.001"
19
+ --tolerance "category=exact"
20
+ """
21
+
22
+ from __future__ import annotations
23
+ import sys
24
+ from pathlib import Path
25
+ from typing import Optional
26
+
27
+ import click
28
+ import polars as pl
29
+
30
+ from . import __version__
31
+ from .compare import compare as _compare
32
+ from .tolerance import T, Tolerance
33
+
34
+
35
+ # ----------------------------------------------------------------------
36
+ # Input loading: format autodetection by extension, --format override
37
+ # ----------------------------------------------------------------------
38
+
39
+ SUPPORTED_FORMATS = ("parquet", "csv", "tsv")
40
+
41
+
42
+ def _infer_format(path: Path, fmt_override: Optional[str]) -> str:
43
+ """
44
+ Infer file format from extension or directory contents.
45
+
46
+ Directories are always treated as parquet datasets (Spark/Athena-style
47
+ multi-file output is the dominant directory pattern). CSV/TSV
48
+ directories aren't supported because the semantics get murky (headers
49
+ in every file? consistent schema? join order?). If anyone asks, we'll
50
+ add it.
51
+ """
52
+ if fmt_override:
53
+ if path.is_dir() and fmt_override != "parquet":
54
+ raise click.UsageError(
55
+ f"Directory input is only supported for parquet format, "
56
+ f"got --format {fmt_override!r}"
57
+ )
58
+ return fmt_override
59
+ if path.is_dir():
60
+ return "parquet"
61
+ ext = path.suffix.lower().lstrip(".")
62
+ if ext in SUPPORTED_FORMATS:
63
+ return ext
64
+ raise click.UsageError(
65
+ f"Cannot infer format from extension {path.suffix!r}. "
66
+ f"Use --format with one of: {', '.join(SUPPORTED_FORMATS)}"
67
+ )
68
+
69
+
70
def _read_frame(path: Path, fmt: str, hive: bool = True) -> pl.DataFrame:
    """
    Load a tabular file (or directory of parquet files) as a polars DataFrame.

    Parameters
    ----------
    path : file to read, or (parquet only) a directory of files.
    fmt  : one of "parquet", "csv", "tsv".
    hive : forwarded as ``hive_partitioning`` when ``path`` is a parquet
           directory; ignored for every other input.

    Raises
    ------
    click.UsageError
        If ``fmt`` is not a known format, or if the underlying polars
        reader fails for any reason. Read failures are converted so the
        CLI prints a one-line message and exits with click's usage-error
        status instead of dying with a traceback (which would exit 1 and
        be indistinguishable from "datasets differ").
    """
    try:
        if fmt == "parquet":
            if path.is_dir():
                return pl.read_parquet(path, hive_partitioning=hive)
            return pl.read_parquet(path)
        if fmt == "csv":
            return pl.read_csv(path)
        if fmt == "tsv":
            return pl.read_csv(path, separator="\t")
    except Exception as exc:
        # Deliberately broad: this is the CLI boundary and the reader can
        # fail in many ways (schema mismatch across files, corrupt data,
        # permissions). The original error stays attached via `from exc`.
        if fmt == "parquet" and path.is_dir():
            what = "parquet directory"
        else:
            what = f"{fmt} file"
        raise click.UsageError(f"Failed to read {what} {path}: {exc}") from exc
    raise click.UsageError(f"Unsupported format: {fmt!r}")
95
+
96
+
97
+ # ----------------------------------------------------------------------
98
+ # Tolerance string parser
99
+ # ----------------------------------------------------------------------
100
+
101
def _parse_tolerance(spec: str) -> tuple[str, Tolerance]:
    """
    Parse one ``--tolerance`` spec into ``(column_name, Tolerance)``.

    Accepted forms (method names are case-insensitive; whitespace around
    each part is ignored):
        "column=abs:VALUE"   or "column=absolute:VALUE"
        "column=pct:VALUE"   or "column=percent:VALUE"
        "column=exact"

    Raises click.BadParameter when the spec has no '=', the column name
    is empty, a non-exact method has no ':value', the value is not
    numeric, the method is unknown, the value is NaN or negative, or the
    tolerance factory rejects the value.
    """
    import math  # function-scope: only this parser needs it

    col, eq, rhs = spec.partition("=")
    if not eq:
        raise click.BadParameter(
            f"Tolerance spec {spec!r} must contain '='. "
            "Format: column=method:value (e.g. amount=abs:0.01) "
            "or column=exact"
        )
    col = col.strip()
    rhs = rhs.strip()
    if not col:
        raise click.BadParameter(f"Empty column name in {spec!r}")

    # exact has no value
    if rhs.lower() == "exact":
        return col, T.exact()

    method, colon, value_str = rhs.partition(":")
    if not colon:
        raise click.BadParameter(
            f"Tolerance spec {spec!r}: method needs a value "
            "(e.g. abs:0.01 or pct:0.001), or use 'exact'"
        )
    method = method.strip().lower()
    try:
        value = float(value_str.strip())
    except ValueError:
        # The float() traceback adds nothing to the message; drop it.
        raise click.BadParameter(
            f"Tolerance value {value_str!r} for column {col!r} is not numeric"
        ) from None

    factories = {
        "abs": T.absolute,
        "absolute": T.absolute,
        "pct": T.percent,
        "percent": T.percent,
    }
    factory = factories.get(method)
    if factory is None:
        raise click.BadParameter(
            f"Unknown tolerance method {method!r} in {spec!r}. "
            "Use 'abs', 'pct', or 'exact'."
        )

    # float() happily parses "nan" and negative numbers; neither makes
    # sense as a tolerance, so reject them here with a readable message.
    if math.isnan(value) or value < 0:
        raise click.BadParameter(
            f"Tolerance value {value_str!r} for column {col!r} "
            "must be a non-negative number"
        )

    try:
        return col, factory(value)
    except ValueError as exc:
        # NOTE(review): assumes the T factories signal invalid values
        # with ValueError -- confirm against tolerance.py.
        raise click.BadParameter(
            f"Invalid tolerance in {spec!r}: {exc}"
        ) from exc
147
+
148
+
149
+ # ----------------------------------------------------------------------
150
+ # CLI command
151
+ # ----------------------------------------------------------------------
152
+
153
@click.command(
    name="proccompy",
    context_settings={"help_option_names": ["-h", "--help"]},
)
@click.argument(
    "base",
    type=click.Path(exists=True, path_type=Path),
)
@click.argument(
    "compare_path",  # 'compare' shadows our import
    type=click.Path(exists=True, path_type=Path),
    metavar="COMPARE",
)
@click.option(
    "--id", "id_columns",
    required=True,
    metavar="COLS",
    help="Comma-separated ID column(s). Example: --id account_id or --id sector,naics,year",
)
@click.option(
    "--tolerance", "tolerances",
    multiple=True,
    metavar="SPEC",
    help=(
        "Per-column tolerance. Repeatable. Format: column=method:value, "
        "where method is 'abs' (absolute), 'pct' (percent), or 'exact' "
        "(no value). Examples: amount=abs:0.01, rate=pct:0.001, category=exact."
    ),
)
@click.option(
    "--format", "fmt",
    type=click.Choice(SUPPORTED_FORMATS, case_sensitive=False),
    default=None,
    help="Override input format. Default: infer from file extension, "
         "or 'parquet' for directories.",
)
@click.option(
    "--no-hive",
    "no_hive",
    is_flag=True,
    default=False,
    help="For parquet directory input, disable Hive partition column "
         "discovery. By default, year=2024/month=01/ subdirectories "
         "produce 'year' and 'month' columns in the frame.",
)
@click.option(
    "--base-name",
    default="base",
    show_default=True,
    help="Label for the base dataset in the report.",
)
@click.option(
    "--compare-name",
    default="compare",
    show_default=True,
    help="Label for the compare dataset in the report.",
)
@click.option(
    "--lst", "lst_path",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Write SAS-style .lst report to this path.",
)
@click.option(
    "--text", "text_path",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Write plain-text terminal-style report to this path.",
)
@click.option(
    "--summary-csv", "summary_csv_path",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Write the per-column summary as CSV.",
)
@click.option(
    "--diff-parquet", "diff_parquet_path",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Write the SAS OUT=-style diff dataset as parquet (unequal rows only).",
)
@click.option(
    "--quiet", "-q",
    is_flag=True,
    help="Suppress the stdout report. Exit code still reflects match/mismatch.",
)
@click.version_option(version=__version__, prog_name="proccompy")
def cli(
    base: Path,
    compare_path: Path,
    id_columns: str,
    tolerances: tuple[str, ...],
    fmt: Optional[str],
    no_hive: bool,
    base_name: str,
    compare_name: str,
    lst_path: Optional[Path],
    text_path: Optional[Path],
    summary_csv_path: Optional[Path],
    diff_parquet_path: Optional[Path],
    quiet: bool,
) -> None:
    """
    Compare two tabular files and produce a structured report.

    BASE and COMPARE may be parquet, csv, or tsv files, or directories of
    parquet files. Format is inferred from extension; use --format to
    override.

    The exit code is 0 if the datasets match (within any specified
    tolerances), 1 if they differ in any compared value, row presence, or
    column overlap. This makes the CLI usable as a CI/cron gate. Usage
    errors (bad options, unreadable or unwritable paths) exit with 2.

    Examples:

    proccompy expected.parquet actual.parquet --id id

    proccompy old.csv new.csv --id "sector,year" \\
    --tolerance "amount=abs:0.01" --tolerance "rate=pct:0.001"

    proccompy a.parquet b.parquet --id id --lst report.lst --quiet
    """
    # NOTE: the docstring above is the user-facing --help text (click
    # renders it), so implementation notes live in comments instead.

    # Parse ID columns; blank entries from stray commas are dropped.
    id_cols = [c.strip() for c in id_columns.split(",") if c.strip()]
    if not id_cols:
        raise click.UsageError("--id requires at least one column name")

    # Parse tolerances; a column may be given at most once.
    tol_map: dict[str, Tolerance] = {}
    for spec in tolerances:
        col, tol = _parse_tolerance(spec)
        if col in tol_map:
            raise click.UsageError(f"Tolerance for column {col!r} specified twice")
        tol_map[col] = tol

    # Load frames. A single --format override applies to both inputs.
    base_fmt = _infer_format(base, fmt)
    compare_fmt = _infer_format(compare_path, fmt)
    hive = not no_hive
    base_df = _read_frame(base, base_fmt, hive=hive)
    compare_df = _read_frame(compare_path, compare_fmt, hive=hive)

    # Run comparison. Surface common errors as clean CLI messages, not
    # tracebacks; keep the original exception chained for debugging.
    try:
        result = _compare(
            base=base_df,
            compare=compare_df,
            id_columns=id_cols,
            tolerances=tol_map,
            base_name=base_name,
            compare_name=compare_name,
        )
    except ValueError as exc:
        raise click.UsageError(str(exc)) from exc

    # Stdout report (unless suppressed)
    if not quiet:
        click.echo(result.report())

    # File outputs, in a fixed order. Each writer is a thunk so the
    # summary / diff frames are only built when their option was given.
    outputs = (
        (lst_path, lambda p: result.to_lst(str(p))),
        (text_path, lambda p: result.to_text(str(p))),
        (summary_csv_path, lambda p: result.summary().write_csv(str(p))),
        (
            diff_parquet_path,
            lambda p: result.diff_dataset(only_unequal=True).write_parquet(str(p)),
        ),
    )
    for target, write in outputs:
        if target is None:
            continue
        try:
            write(target)
        except OSError as exc:
            # An unwritable path would otherwise be a traceback with exit
            # status 1, which callers read as "datasets differ".
            raise click.UsageError(f"Could not write {target}: {exc}") from exc
        if not quiet:
            click.echo(f"Wrote {target}", err=True)

    # Exit code drives CI integration: 0 = match, 1 = differ
    sys.exit(0 if result.matches else 1)
331
+
332
+
333
+ if __name__ == "__main__":
334
+ cli()
proccompy/compare.py ADDED
@@ -0,0 +1,100 @@
1
+ """
2
+ Top-level compare() entry point: engine-agnostic wrapper.
3
+
4
+ Only the DuckDB engine is currently implemented. Polars-native and Pandas-native
5
+ engines will follow.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ from typing import Any, Mapping, Optional, Sequence, Union
10
+
11
+ from .engine_duckdb import compare_duckdb
12
+ from .tolerance import Tolerance
13
+ from .types import CompareResult
14
+ from .report import render_report
15
+ from .lst_report import render_lst
16
+
17
+
18
def compare(
    base: Any,
    compare: Any,
    id_columns: Union[str, Sequence[str]],
    var_columns: Optional[Sequence[str]] = None,
    tolerances: Optional[Mapping[str, Tolerance]] = None,
    null_eq_null: bool = True,
    duplicate_strategy: str = "strict",
    base_name: str = "base",
    compare_name: str = "compare",
    engine: str = "duckdb",
) -> CompareResult:
    """
    Compare two DataFrames using the selected engine.

    Every argument except ``engine`` is forwarded unchanged, by keyword,
    to the engine implementation; see engine_duckdb.compare_duckdb for
    what each parameter means.

    Parameters
    ----------
    engine : name of the comparison backend. Only "duckdb" is accepted.

    Returns
    -------
    The CompareResult produced by the engine.

    Raises
    ------
    NotImplementedError
        If ``engine`` is anything other than "duckdb".
    """
    # Reject unknown engines up front so the dispatch below stays flat.
    if engine != "duckdb":
        raise NotImplementedError(
            f"engine={engine!r} is not implemented. Use engine='duckdb'."
        )
    return compare_duckdb(
        base=base,
        compare=compare,
        id_columns=id_columns,
        var_columns=var_columns,
        tolerances=tolerances,
        null_eq_null=null_eq_null,
        duplicate_strategy=duplicate_strategy,
        base_name=base_name,
        compare_name=compare_name,
    )
50
+
51
+
52
+ # Attach render_report as a method on CompareResult for convenience.
53
def _report_method(self, style: str = "proc_compare") -> str:
    """
    Render this result as a report string.

    Thin delegate to render_report(); ``style`` is passed through as-is.
    """
    rendered = render_report(self, style=style)
    return rendered


# Bound here (rather than in types.py) so CompareResult gains .report().
CompareResult.report = _report_method  # type: ignore[attr-defined]
57
+
58
+
59
def _to_lst_method(
    self,
    path: str,
    n_sample: int = 50,
    width: int = 132,
) -> str:
    """
    Render a SAS-style .lst report and save it at ``path``.

    Parameters
    ----------
    path     : destination file; by convention it carries a .lst suffix.
    n_sample : cap on how many unequal observations are detailed in the
               per-observation section. Use 0 to omit that section.
    width    : page width in characters (SAS defaults to 132).

    Returns the rendered text, which is handy for tests and inspection.
    """
    rendered = render_lst(self, n_sample=n_sample, width=width)

    # The file is deliberately ASCII, as real .lst files are. Characters
    # outside ASCII (e.g. UTF-8 names in user data) are written as '?'
    # so the output stays SAS-compatible.
    open_options = {"encoding": "ascii", "errors": "replace", "newline": "\n"}
    with open(path, "w", **open_options) as sink:
        sink.write(rendered)
    return rendered


CompareResult.to_lst = _to_lst_method  # type: ignore[attr-defined]
86
+
87
+
88
def _to_text_method(self, path: str, style: str = "proc_compare") -> str:
    """
    Save the terminal-style report to ``path`` as UTF-8 text.

    Produces the same text as report(), persisted to disk. Prefer
    to_lst() when SAS-style fixed-width output is wanted.

    Returns the rendered text.
    """
    rendered = render_report(self, style=style)
    open_options = {"encoding": "utf-8", "newline": "\n"}
    with open(path, "w", **open_options) as sink:
        sink.write(rendered)
    return rendered


CompareResult.to_text = _to_text_method  # type: ignore[attr-defined]