lightweight-table-diff 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightweight_table_diff-0.1.0/PKG-INFO +12 -0
- lightweight_table_diff-0.1.0/README.md +0 -0
- lightweight_table_diff-0.1.0/pyproject.toml +28 -0
- lightweight_table_diff-0.1.0/setup.cfg +4 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/__init__.py +5 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/__main__.py +16 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/adapters/__init__.py +31 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/adapters/csv.py +14 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/adapters/hive_s3.py +67 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/adapters/parquet.py +13 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/adapters/sav.py +23 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/config.py +40 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/core.py +85 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/dimensions.py +96 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/normalisers.py +42 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff/runner.py +147 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff.egg-info/PKG-INFO +12 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff.egg-info/SOURCES.txt +19 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff.egg-info/dependency_links.txt +1 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff.egg-info/requires.txt +8 -0
- lightweight_table_diff-0.1.0/src/lightweight_table_diff.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lightweight-table-diff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cell-level table diffing for Polars
|
|
5
|
+
Requires-Python: >=3.11.1
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: polars>=1.38
|
|
8
|
+
Requires-Dist: polars-checkpoint>=0.1.2
|
|
9
|
+
Provides-Extra: spss
|
|
10
|
+
Requires-Dist: pyreadstat; extra == "spss"
|
|
11
|
+
Provides-Extra: hive
|
|
12
|
+
Requires-Dist: boto3; extra == "hive"
|
|
File without changes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lightweight-table-diff"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Cell-level table diffing for Polars"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11.1"
|
|
11
|
+
dependencies = [
    "polars>=1.38",
    "polars-checkpoint>=0.1.2",
    # config.py does ``import yaml`` at module level, so PyYAML is a hard
    # runtime requirement of the runner / CLI.
    "pyyaml",
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
spss = [
|
|
18
|
+
"pyreadstat",
|
|
19
|
+
]
|
|
20
|
+
hive = [
|
|
21
|
+
"boto3",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[tool.setuptools]
|
|
25
|
+
package-dir = {"" = "src"}
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""python -m table_diff config.yml"""
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from .runner import run_config
|
|
6
|
+
|
|
7
|
+
# Timestamped INFO-level logging so per-comparison progress is visible on the
# console when the package is run as a script.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
)

# The config path is the only required argument. The usage text names the
# module that is actually installed (``lightweight_table_diff``).
if len(sys.argv) < 2:
    print(
        "Usage: python -m lightweight_table_diff <config.yml>",
        file=sys.stderr,
    )
    sys.exit(1)

run_config(sys.argv[1])
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
AdapterFn = Callable[..., pl.LazyFrame]
|
|
8
|
+
|
|
9
|
+
_registry: dict[str, AdapterFn] = {}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register(name: str, adapter: AdapterFn) -> None:
    """Make ``adapter`` available to ``load`` under the type key ``name``.

    Registering a name that already exists replaces the previous adapter.
    """
    _registry.update({name: adapter})
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load(source_def: dict, **context: Any) -> pl.LazyFrame:
    """Dispatch a source definition to the adapter registered for its type.

    Args:
        source_def: Source definition; its ``type`` entry selects the adapter
            and defaults to ``"parquet"`` when absent.
        **context: Extra keyword arguments forwarded unchanged to the adapter.

    Returns:
        Whatever the selected adapter returns.

    Raises:
        ValueError: If no adapter is registered under the requested type.
    """
    kind = source_def.get("type", "parquet")
    if kind in _registry:
        return _registry[kind](source_def, **context)
    raise ValueError(
        f"Unknown adapter {kind!r}. Registered: {sorted(_registry)}"
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
from .csv import load_csv  # noqa: E402
from .hive_s3 import load_hive  # noqa: E402
from .parquet import load_parquet  # noqa: E402
from .sav import load_sav  # noqa: E402

# Built-in adapters, keyed by the ``type`` value used in source definitions.
register("parquet", load_parquet)
register("csv", load_csv)
register("sav", load_sav)
# hive_s3 imports boto3 inside load_hive rather than at module level, so
# registering it here does not require the optional "hive" extra.
register("hive", load_hive)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def load_csv(source_def: dict, **context) -> pl.LazyFrame:
    """Lazily scan a CSV file, or the CSV files matched inside a directory.

    Args:
        source_def: Source definition. Reads ``path`` (required) plus the
            optional ``glob`` (default ``"*.csv"``, only used when ``path`` is
            a directory), ``infer_schema_length`` (default ``10_000``) and
            ``ignore_errors`` (default ``True``).
        **context: Runtime context forwarded by the adapter dispatcher (for
            example ``spark=`` intended for another adapter). Accepted so that
            a shared context does not raise ``TypeError``; unused here.

    Returns:
        The ``pl.LazyFrame`` produced by ``pl.scan_csv``.

    Raises:
        KeyError: If ``source_def`` has no ``path`` entry.
    """
    path = Path(source_def["path"])
    glob = source_def.get("glob", "*.csv")
    # A directory is scanned through the glob; anything else is passed as-is.
    scan_path = str(path / glob) if path.is_dir() else str(path)
    return pl.scan_csv(
        scan_path,
        infer_schema_length=source_def.get("infer_schema_length", 10_000),
        ignore_errors=source_def.get("ignore_errors", True),
    )
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _find_table_location(rows) -> str | None:
    """Return the storage URI from ``DESCRIBE FORMATTED`` rows, or ``None``."""
    for row in rows:
        label, value = row[0], row[1]
        if not label or not value:
            continue
        # Match the label exactly (tolerating padding and a trailing colon)
        # and require a URI value: the same output also lists the table's
        # columns, so a substring test would pick up a column such as
        # ``Location_code`` and return its data type instead.
        if label.strip().rstrip(":") == "Location" and "://" in str(value):
            return str(value).strip()
    return None


def load_hive(source_def: dict, **context) -> pl.LazyFrame:
    """Download a Hive table's parquet files from S3 and scan them lazily.

    The table location is resolved with ``DESCRIBE FORMATTED`` through the
    supplied Spark session, every ``.parquet`` object under that location is
    downloaded into a local cache directory, and the cache is scanned with
    hive partitioning enabled.

    Args:
        source_def: Source definition. Reads ``table`` (required) and the
            optional ``cache_dir`` (default ``/tmp/hive_<table>``).
        **context: Must contain ``spark``. May contain ``ssl_cert``, which is
            passed to ``raz_client.configure_ranger_raz`` when that module is
            importable.

    Returns:
        A ``pl.LazyFrame`` over ``<cache_dir>/**/*.parquet``.

    Raises:
        RuntimeError: If ``spark`` is missing or no location can be resolved.
        FileNotFoundError: If no parquet objects exist under the location.
        KeyError: If ``source_def`` has no ``table`` entry.
    """
    import boto3

    spark = context.get("spark")
    if spark is None:
        raise RuntimeError("Hive adapter requires spark= to be passed")

    ssl_cert = context.get("ssl_cert")
    table_name = source_def["table"]
    cache_dir = Path(source_def.get("cache_dir", f"/tmp/hive_{table_name}"))
    cache_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): table_name is interpolated into the SQL text unescaped;
    # it must come from a trusted config.
    rows = spark.sql(f"DESCRIBE FORMATTED {table_name}").collect()
    location = _find_table_location(rows)
    if not location:
        raise RuntimeError(f"Could not resolve S3 location for '{table_name}'")

    parsed = urlparse(location.replace("s3a://", "s3://"))
    bucket = parsed.netloc
    # A table stored at the bucket root has an empty key path; the prefix must
    # then be empty too, because "/" on its own matches no object keys.
    key_path = parsed.path.strip("/")
    prefix = f"{key_path}/" if key_path else ""

    client = boto3.client("s3")
    try:
        import raz_client
    except ImportError:
        # Best effort: fall back to the plain client, but say so when the
        # caller explicitly asked for RAZ by passing a certificate.
        if ssl_cert:
            logger.warning(
                "ssl_cert was supplied but raz_client is not installed; "
                "using the unconfigured boto3 client"
            )
    else:
        if ssl_cert:
            raz_client.configure_ranger_raz(client, ssl_file=ssl_cert)

    logger.info("Downloading %s → %s", location, cache_dir)
    n_files = 0
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".parquet"):
                continue
            n_files += 1
            # Keep the path below the prefix so partition directories
            # (``col=value/``) survive in the local cache.
            relative = (
                key[len(prefix) :].lstrip("/")
                if key.startswith(prefix)
                else Path(key).name
            )
            dest = cache_dir / relative
            dest.parent.mkdir(parents=True, exist_ok=True)
            client.download_file(bucket, key, str(dest))

    if not n_files:
        raise FileNotFoundError(f"No parquet files found at {location}")

    # NOTE(review): the cache directory is never cleared, so parquet files
    # left by an earlier run are scanned too. Use a fresh cache_dir if the
    # table's file set may have changed.
    return pl.scan_parquet(
        str(cache_dir / "**/*.parquet"), hive_partitioning=True
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def load_parquet(source_def: dict, **context) -> pl.LazyFrame:
    """Lazily scan a parquet file, or the parquet files matched in a directory.

    Args:
        source_def: Source definition. Reads ``path`` (required) plus the
            optional ``glob`` (default ``"*.parquet"``, only used when ``path``
            is a directory) and ``hive_partitioning`` (default ``False``).
        **context: Runtime context forwarded by the adapter dispatcher (for
            example ``spark=`` intended for another adapter). Accepted so that
            a shared context does not raise ``TypeError``; unused here.

    Returns:
        The ``pl.LazyFrame`` produced by ``pl.scan_parquet``.

    Raises:
        KeyError: If ``source_def`` has no ``path`` entry.
    """
    path = Path(source_def["path"])
    glob = source_def.get("glob", "*.parquet")
    # A directory is scanned through the glob; anything else is passed as-is.
    scan_path = str(path / glob) if path.is_dir() else str(path)
    return pl.scan_parquet(
        scan_path,
        hive_partitioning=source_def.get("hive_partitioning", False),
    )
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPSS .sav adapter. Loads via pyreadstat into an in-memory Polars frame.
|
|
3
|
+
Normalisation (null alignment, trailing-zero stripping, etc.)
|
|
4
|
+
is left to the normaliser layer.
|
|
5
|
+
"""
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_sav(source_def: dict, **context) -> pl.LazyFrame:
    """Read an SPSS ``.sav`` file via pyreadstat and return it as a LazyFrame.

    The file is read eagerly into a pandas frame, converted with
    ``pl.from_pandas`` and then made lazy.

    Args:
        source_def: Source definition. Reads ``path`` (required) and the
            optional ``apply_value_formats`` (default ``False``).
        **context: Runtime context forwarded by the adapter dispatcher (for
            example ``spark=`` intended for another adapter). Accepted so that
            a shared context does not raise ``TypeError``; unused here.

    Returns:
        A ``pl.LazyFrame`` over the in-memory data.

    Raises:
        ImportError: If pyreadstat is not installed.
        KeyError: If ``source_def`` has no ``path`` entry.
    """
    # Imported lazily: pyreadstat is only needed for the optional SPSS support.
    try:
        import pyreadstat
    except ImportError:
        raise ImportError(
            "pyreadstat is required for .sav files: pip install pyreadstat"
        ) from None

    pdf, _meta = pyreadstat.read_sav(
        str(source_def["path"]),
        apply_value_formats=source_def.get("apply_value_formats", False),
        formats_as_category=False,
        user_missing=True,
    )
    return pl.from_pandas(pdf).lazy()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""YAML config loading and deep-merge expansion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def deep_merge(base: dict, override: dict) -> dict:
    """Return a deep copy of ``base`` with ``override`` merged on top of it.

    Nested dicts are merged key by key; any other value in ``override``
    replaces the one from ``base``. ``base`` itself is not modified.
    """
    merged = copy.deepcopy(base)
    _merge_in_place(merged, override)
    return merged
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _merge_in_place(target: dict, source: dict) -> None:
    """Recursively fold ``source`` into ``target``, mutating ``target``.

    When both sides hold a dict under the same key the two are merged
    recursively; otherwise a deep copy of the ``source`` value is stored.
    """
    for key in source:
        incoming = source[key]
        mergeable = (
            key in target
            and isinstance(incoming, dict)
            and isinstance(target[key], dict)
        )
        if not mergeable:
            # Copy so later edits to ``source`` cannot leak into ``target``.
            target[key] = copy.deepcopy(incoming)
            continue
        _merge_in_place(target[key], incoming)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def expand_comparisons(raw: dict[str, Any]) -> list[dict[str, Any]]:
    """Split shared defaults + 'comparisons' list into per-job dicts.

    Every top-level key other than ``comparisons`` is treated as a shared
    default, and each entry of ``comparisons`` is deep-merged over those
    defaults. When ``comparisons`` is absent, a single job made of the
    defaults alone is returned.

    Args:
        raw: Parsed configuration mapping.

    Returns:
        One merged job dict per entry in ``comparisons``.

    Raises:
        TypeError: If ``comparisons`` is not a list, or one of its entries is
            not a mapping.
    """
    items = raw.get("comparisons", [{}])
    if not isinstance(items, list):
        raise TypeError(
            f"'comparisons' must be a list, got {type(items).__name__}"
        )
    # Validate entries up front: a scalar or null entry would otherwise fail
    # deep inside the merge with an unhelpful AttributeError.
    for index, item in enumerate(items):
        if not isinstance(item, dict):
            raise TypeError(
                f"'comparisons[{index}]' must be a mapping, "
                f"got {type(item).__name__}"
            )
    base = {k: v for k, v in raw.items() if k != "comparisons"}
    return [deep_merge(base, item) for item in items]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_config(path: str | Path) -> list[dict[str, Any]]:
    """Read a YAML config file and expand it into per-comparison job dicts.

    Args:
        path: Location of the YAML file (read as UTF-8).

    Returns:
        The job dicts produced by ``expand_comparisons``.

    Raises:
        TypeError: If the document root is not a mapping, for example when
            the file is empty.
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        raw = yaml.safe_load(f)
    # An empty document parses to None and a list/scalar root is equally
    # unusable; fail here with a clear message instead of an AttributeError.
    if not isinstance(raw, dict):
        raise TypeError(
            f"Config root must be a mapping, got {type(raw).__name__}: {path}"
        )
    return expand_comparisons(raw)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cell-level table differ for Polars.
|
|
3
|
+
|
|
4
|
+
Produces long-form (keys…, col_name, before_val, after_val) for every
|
|
5
|
+
cell that changed between two LazyFrames.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
from polars_checkpoint import CheckpointSession, checkpoint
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def diff_tbls(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    id_cols: list[str],
    compare_cols: list[str] | None = None,
    join_type: str = "full",
) -> pl.LazyFrame:
    """Build a lazy long-form diff of two frames joined on ``id_cols``.

    Args:
        before: Frame holding the old values.
        after: Frame holding the new values.
        id_cols: Join key columns, kept in the output.
        compare_cols: Columns to compare. Defaults to every column of
            ``before`` other than ``id_cols``.
        join_type: Join strategy handed to ``LazyFrame.join``.

    Returns:
        A LazyFrame with columns ``id_cols``, ``col_name``, ``before_val`` and
        ``after_val``, holding one row per cell that ``eq_missing`` reports as
        different.
    """
    if compare_cols is None:
        compare_cols = list(before.drop(*id_cols).collect_schema().keys())

    def _changed_cell(name: str) -> pl.Expr:
        # Struct of (before, after) where the two sides differ, else null.
        old = pl.col(f"b__{name}")
        new = pl.col(f"a__{name}")
        pair = pl.struct(old.alias("before_val"), new.alias("after_val"))
        return pl.when(~old.eq_missing(new)).then(pair).otherwise(None).alias(name)

    # Prefix each side's value columns so they stay distinct after the join.
    lhs = before.select(
        *id_cols, *[pl.col(c).alias(f"b__{c}") for c in compare_cols]
    )
    rhs = after.select(
        *id_cols, *[pl.col(c).alias(f"a__{c}") for c in compare_cols]
    )
    joined = lhs.join(rhs, on=id_cols, how=join_type, coalesce=True)

    # Wide -> long: one row per (key, column); unchanged cells are null.
    long_form = joined.select(
        *id_cols, *[_changed_cell(c) for c in compare_cols]
    ).unpivot(
        on=compare_cols,
        index=id_cols,
        variable_name="col_name",
        value_name="diff",
    )

    changed_only = long_form.drop_nulls("diff")
    cell = pl.col("diff")
    return changed_only.select(
        *id_cols,
        "col_name",
        cell.struct.field("before_val"),
        cell.struct.field("after_val"),
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def batch_diff_tbls(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    id_cols: list[str],
    compare_cols: list[str] | None = None,
    batch_size: int = 50,
    join_type: str = "full",
) -> pl.LazyFrame:
    """Diff two frames in column batches and concatenate the results.

    ``compare_cols`` (by default every non-id column of ``before``) is cut
    into consecutive slices of at most ``batch_size`` columns. Each slice is
    diffed with ``diff_tbls``, passed through ``checkpoint`` and collected into
    a list that is finally concatenated with ``pl.concat``.

    Returns:
        The concatenated per-batch diffs as a single LazyFrame.
    """
    if compare_cols is None:
        compare_cols = list(before.drop(*id_cols).collect_schema().keys())

    total = len(compare_cols)
    pieces = []
    for start in range(0, total, batch_size):
        chunk = compare_cols[start : start + batch_size]
        logger.info(
            " batch %d-%d of %d columns",
            start + 1,
            min(start + len(chunk), total),
            total,
        )
        # NOTE(review): checkpoint() comes from polars_checkpoint and
        # presumably materialises the batch before continuing; confirm.
        pieces.append(
            checkpoint(
                diff_tbls(before, after, id_cols, chunk, join_type=join_type)
            )
        )

    return pl.concat(pieces)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Column/row checking and key-uniqueness validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_cols_to_compare(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    keys: list[str],
    include_cols: list[str] | None = None,
    exclude_cols: list[str] | None = None,
) -> list[str]:
    """Choose the non-key columns present on both sides that will be diffed.

    Args:
        before: Frame holding the old values.
        after: Frame holding the new values.
        keys: Join key columns; each must exist in both frames.
        include_cols: Optional allow-list. Its order is kept, and entries
            missing from either frame (or that are keys) are dropped.
            Without it, all shared non-key columns are used, sorted by name.
        exclude_cols: Optional names removed from the selection.

    Returns:
        The list of column names to compare.

    Raises:
        ValueError: If a key is missing from either frame, or if nothing is
            left to compare.
    """
    key_set = set(keys)
    shared = set(before.collect_schema().names()) & set(
        after.collect_schema().names()
    )

    missing = key_set - shared
    if missing:
        raise ValueError(
            f"Join key(s) missing from one or both sides: {missing}"
        )

    if include_cols:
        cols = [c for c in include_cols if c in shared and c not in key_set]
    else:
        cols = sorted(shared - key_set)

    if exclude_cols:
        dropped = set(exclude_cols)
        cols = [c for c in cols if c not in dropped]

    if cols:
        return cols
    raise ValueError("No columns to compare after applying filters")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def validate_key_uniqueness(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    keys: list[str],
    sample_limit: int = 20,
) -> None:
    """Fail when a key combination occurs more than once in either frame.

    Args:
        before: Frame holding the old values.
        after: Frame holding the new values.
        keys: Columns that together should identify a row.
        sample_limit: Maximum number of duplicated key groups collected per
            frame. The count in the error message is capped at this value.

    Raises:
        ValueError: If either frame contains duplicated key groups. The
            message includes the collected sample for each offending side.
    """

    def _duplicate_sample(lf: pl.LazyFrame) -> pl.DataFrame:
        # Count rows per key combination and keep only repeated ones.
        counted = lf.select(*keys).group_by(keys).len()
        return counted.filter(pl.col("len") > 1).limit(sample_limit).collect()

    problems: list[str] = []
    for label, lf in (("before", before), ("after", after)):
        dupes = _duplicate_sample(lf)
        if dupes.height:
            problems.append(
                f" {label}: {dupes.height} duplicate key group(s)\n{dupes}"
            )

    if not problems:
        return
    raise ValueError(
        "Duplicate keys (would cause row explosion):\n" + "\n".join(problems)
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def column_indels(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    keys: list[str],
) -> tuple[list[str], list[str]]:
    """List the non-key columns that exist on only one side.

    Returns:
        ``(removed_cols, added_cols)``, both sorted by name: columns found
        only in ``before``, then columns found only in ``after``.
    """
    key_set = set(keys)
    old_names = set(before.collect_schema().names()) - key_set
    new_names = set(after.collect_schema().names()) - key_set
    removed = sorted(old_names - new_names)
    added = sorted(new_names - old_names)
    return removed, added
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def row_indels(
    before: pl.LazyFrame,
    after: pl.LazyFrame,
    id_cols: list[str],
) -> tuple[pl.LazyFrame, pl.LazyFrame]:
    """Find keys that appear on only one side, using anti-joins on ``id_cols``.

    Returns:
        ``(removed_rows, added_rows)`` as lazy frames containing only the key
        columns: keys in ``before`` with no match in ``after``, and the
        reverse.
    """
    old_keys, new_keys = before.select(*id_cols), after.select(*id_cols)
    return (
        old_keys.join(new_keys, on=id_cols, how="anti"),
        new_keys.join(old_keys, on=id_cols, how="anti"),
    )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalisation transforms applied *before* diffing to suppress irrelevant
|
|
3
|
+
format differences between disparate sources.
|
|
4
|
+
|
|
5
|
+
Each normaliser has signature: (lf, keys, cols) -> lf
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Callable
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
|
+
NormaliserFn = Callable[[pl.LazyFrame, list[str], list[str]], pl.LazyFrame]
|
|
15
|
+
|
|
16
|
+
NULLISH = ["", "nan", "none", "<na>"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _norm_expr(expr: pl.Expr) -> pl.Expr:
    """Build the string-normalising expression for one column.

    The value is cast to string and stripped. Values that are null, or whose
    lower-cased text is listed in ``NULLISH``, become null. Finally a fraction
    made only of zeros (``"12.0"``, ``"-3.000"``) is removed from integer-like
    text.
    """
    text = expr.cast(pl.String).str.strip_chars()
    is_nullish = text.is_null() | text.str.to_lowercase().is_in(NULLISH)
    cleaned = (
        pl.when(is_nullish)
        .then(pl.lit(None, dtype=pl.String))
        .otherwise(text)
    )
    return cleaned.str.replace(r"^(-?\d+)\.0+$", "${1}")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def normalise_float_strings(
    lf: pl.LazyFrame, keys: list[str], cols: list[str]
) -> pl.LazyFrame:
    """Normalise key and value columns to comparable strings.

    Every column in ``keys`` and ``cols`` is rewritten with ``_norm_expr``.
    Columns not named in either list are dropped, because the frame is
    rebuilt with ``select``.
    """
    wanted = [*keys, *cols]
    return lf.select(*[_norm_expr(pl.col(name)).alias(name) for name in wanted])
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
REGISTRY: dict[str, NormaliserFn] = {
|
|
41
|
+
"float_strings": normalise_float_strings,
|
|
42
|
+
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from . import adapters
|
|
11
|
+
from .dimensions import (
|
|
12
|
+
validate_key_uniqueness,
|
|
13
|
+
get_cols_to_compare,
|
|
14
|
+
column_indels,
|
|
15
|
+
row_indels,
|
|
16
|
+
)
|
|
17
|
+
from .config import load_config
|
|
18
|
+
from .core import batch_diff_tbls
|
|
19
|
+
from .normalisers import REGISTRY as NORMALISER_REGISTRY
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ComparisonResult:
    """Everything produced by a single before/after comparison job."""

    # Job label; also used as the prefix of the output file names.
    name: str
    # Long-form cell-level differences.
    diff: pl.LazyFrame
    # Row count of ``diff``.
    n_diffs: int
    # Keys present only in the "before" source.
    removed_rows: pl.LazyFrame
    # Keys present only in the "after" source.
    added_rows: pl.LazyFrame
    # Non-key columns present only in the "before" source.
    removed_cols: list[str]
    # Non-key columns present only in the "after" source.
    added_cols: list[str]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def run_comparison(job: dict, **context) -> ComparisonResult:
    """Run one comparison job: load, align, validate and diff two sources.

    Args:
        job: Job definition. Reads ``join_keys``, ``before`` and ``after``
            (required) and the optional ``name``, ``compare_cols``,
            ``exclude_cols``, ``normalisation``, ``batch_size`` and
            ``join_type``. The dict is deep-copied and never modified.
        **context: Forwarded to ``adapters.load`` for both sources.

    Returns:
        A ``ComparisonResult`` holding the cell diff, its row count, and the
        row/column insertions and deletions.

    Raises:
        KeyError: If a required job entry is missing.
        ValueError: If the normaliser name is unknown, or raised by the
            column-selection and key-uniqueness checks.
    """
    job = copy.deepcopy(job)
    name = job.get("name", "unnamed")
    keys = job["join_keys"]
    logger.info("Running comparison: %s", name)

    before = adapters.load(job["before"], **context)
    after = adapters.load(job["after"], **context)

    # Column-level changes are reported from the raw schemas, before any
    # normaliser narrows the frames to keys + compared columns.
    removed_cols, added_cols = column_indels(before, after, keys)
    if removed_cols:
        logger.info(" %d column(s) removed: %s", len(removed_cols), removed_cols)
    if added_cols:
        logger.info(" %d column(s) added: %s", len(added_cols), added_cols)

    cols = get_cols_to_compare(
        before,
        after,
        keys,
        include_cols=job.get("compare_cols"),
        exclude_cols=job.get("exclude_cols"),
    )
    logger.info(" %d column(s) to compare", len(cols))

    # Normalise BEFORE the key checks. The normaliser rewrites the key
    # columns too, and the diff joins on those rewritten keys, so uniqueness
    # and added/removed rows must be evaluated on the same values. Checking
    # raw keys can miss collisions created by normalisation and can report
    # row indels that disagree with the cell diff.
    if norm_name := job.get("normalisation"):
        if norm_name not in NORMALISER_REGISTRY:
            raise ValueError(f"Unknown normaliser: {norm_name!r}")
        norm_fn = NORMALISER_REGISTRY[norm_name]
        before = norm_fn(before, keys, cols)
        after = norm_fn(after, keys, cols)

    validate_key_uniqueness(before, after, keys)

    removed, added = row_indels(before, after, keys)

    diff = batch_diff_tbls(
        before,
        after,
        keys,
        cols,
        batch_size=job.get("batch_size", 50),
        join_type=job.get("join_type", "full"),
    )

    n = diff.select(pl.len()).collect().item()
    logger.info(" %d difference(s) found", n)

    return ComparisonResult(
        name=name,
        diff=diff,
        n_diffs=n,
        removed_rows=removed,
        added_rows=added,
        removed_cols=removed_cols,
        added_cols=added_cols,
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def write_results(result: ComparisonResult, output_dir: str | Path) -> None:
    """Write a comparison's outputs as CSV files under ``output_dir``.

    The directory is created if needed. All file names are prefixed with
    ``result.name``:

    * ``_column_indels.csv`` when any column was added or removed;
    * ``_removed_rows.csv`` / ``_added_rows.csv`` when that side is non-empty;
    * ``_detailed.csv`` and ``_summary.csv`` when there are cell differences.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Column indels: two columns padded with "" to the same length.
    gone, new = result.removed_cols, result.added_cols
    if gone or new:
        col_indels_path = out / f"{result.name}_column_indels.csv"
        height = max(len(gone), len(new))

        def _padded(names: list[str]) -> list[str]:
            return names + [""] * (height - len(names))

        table = pl.DataFrame(
            {
                "removed_columns": _padded(gone),
                "added_columns": _padded(new),
            }
        )
        table.write_csv(col_indels_path)
        logger.info(" Wrote %s", col_indels_path)

    # Row indels: each side is only written when it has at least one row.
    row_sets = (("removed", result.removed_rows), ("added", result.added_rows))
    for label, lf in row_sets:
        n = lf.select(pl.len()).collect().item()
        if not n > 0:
            logger.info(" No %s rows", label)
            continue
        path = out / f"{result.name}_{label}_rows.csv"
        lf.sink_csv(path)
        logger.info(" %d %s row(s) → %s", n, label, path)

    # Cell-level diff: nothing more to write when there are no differences.
    if result.n_diffs == 0:
        logger.info(" %s: no cell differences", result.name)
        return

    detail_path = out / f"{result.name}_detailed.csv"
    result.diff.sink_csv(detail_path)
    logger.info(" Wrote %s", detail_path)

    # Summary: occurrences of each (column, before, after) triple, most
    # frequent first.
    summary_path = out / f"{result.name}_summary.csv"
    summary = (
        result.diff.group_by("col_name", "before_val", "after_val")
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
    )
    summary.sink_csv(summary_path)
    logger.info(" Wrote %s", summary_path)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def run_config(config_path: str | Path, **context) -> list[ComparisonResult]:
    """Run every comparison defined in a config file and write its outputs.

    Args:
        config_path: Path handed to ``load_config``.
        **context: Forwarded to ``run_comparison`` for each job.

    Returns:
        One ``ComparisonResult`` per job, in config order. Each job's files
        go to its ``output_dir`` entry, defaulting to ``./diff_output``.
    """
    results: list[ComparisonResult] = []
    for job in load_config(config_path):
        outcome = run_comparison(job, **context)
        write_results(outcome, job.get("output_dir", "./diff_output"))
        results.append(outcome)
    return results
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lightweight-table-diff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cell-level table diffing for Polars
|
|
5
|
+
Requires-Python: >=3.11.1
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: polars>=1.38
|
|
8
|
+
Requires-Dist: polars-checkpoint>=0.1.2
|
|
9
|
+
Provides-Extra: spss
|
|
10
|
+
Requires-Dist: pyreadstat; extra == "spss"
|
|
11
|
+
Provides-Extra: hive
|
|
12
|
+
Requires-Dist: boto3; extra == "hive"
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/lightweight_table_diff/__init__.py
|
|
4
|
+
src/lightweight_table_diff/__main__.py
|
|
5
|
+
src/lightweight_table_diff/config.py
|
|
6
|
+
src/lightweight_table_diff/core.py
|
|
7
|
+
src/lightweight_table_diff/dimensions.py
|
|
8
|
+
src/lightweight_table_diff/normalisers.py
|
|
9
|
+
src/lightweight_table_diff/runner.py
|
|
10
|
+
src/lightweight_table_diff.egg-info/PKG-INFO
|
|
11
|
+
src/lightweight_table_diff.egg-info/SOURCES.txt
|
|
12
|
+
src/lightweight_table_diff.egg-info/dependency_links.txt
|
|
13
|
+
src/lightweight_table_diff.egg-info/requires.txt
|
|
14
|
+
src/lightweight_table_diff.egg-info/top_level.txt
|
|
15
|
+
src/lightweight_table_diff/adapters/__init__.py
|
|
16
|
+
src/lightweight_table_diff/adapters/csv.py
|
|
17
|
+
src/lightweight_table_diff/adapters/hive_s3.py
|
|
18
|
+
src/lightweight_table_diff/adapters/parquet.py
|
|
19
|
+
src/lightweight_table_diff/adapters/sav.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
lightweight_table_diff
|