lightweight-table-diff 0.1.3__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lightweight_table_diff-1.0.0/.gitignore +9 -0
- lightweight_table_diff-1.0.0/PKG-INFO +15 -0
- lightweight_table_diff-1.0.0/README_api_example.md +128 -0
- lightweight_table_diff-1.0.0/codebase.md +2116 -0
- lightweight_table_diff-1.0.0/examples/cloudera_hive/run_diff.py +111 -0
- lightweight_table_diff-1.0.0/pyproject.toml +68 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/__init__.py +16 -0
- {lightweight_table_diff-0.1.3 → lightweight_table_diff-1.0.0}/src/lightweight_table_diff/__main__.py +16 -16
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/api.py +262 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/config.py +149 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/core.py +130 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/dimensions.py +212 -0
- {lightweight_table_diff-0.1.3 → lightweight_table_diff-1.0.0}/src/lightweight_table_diff/normalisers.py +59 -42
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/resolver.py +318 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/result.py +168 -0
- lightweight_table_diff-1.0.0/src/lightweight_table_diff/runner.py +112 -0
- lightweight_table_diff-1.0.0/tests/test_api.py +119 -0
- lightweight_table_diff-1.0.0/tests/test_core.py +63 -0
- lightweight_table_diff-1.0.0/tests/test_dimensions.py +84 -0
- lightweight_table_diff-1.0.0/tests/test_integration.py +80 -0
- lightweight_table_diff-1.0.0/tests/test_normalisers.py +45 -0
- lightweight_table_diff-1.0.0/tycheck.log +1 -0
- lightweight_table_diff-1.0.0/uv.lock +1138 -0
- lightweight_table_diff-0.1.3/PKG-INFO +0 -13
- lightweight_table_diff-0.1.3/README.md +0 -0
- lightweight_table_diff-0.1.3/pyproject.toml +0 -29
- lightweight_table_diff-0.1.3/setup.cfg +0 -4
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/__init__.py +0 -5
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/adapters/__init__.py +0 -33
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/adapters/csv.py +0 -14
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/adapters/hive_s3.py +0 -67
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/adapters/parquet.py +0 -13
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/adapters/sav.py +0 -23
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/config.py +0 -40
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/core.py +0 -85
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/dimensions.py +0 -96
- lightweight_table_diff-0.1.3/src/lightweight_table_diff/runner.py +0 -147
- lightweight_table_diff-0.1.3/src/lightweight_table_diff.egg-info/PKG-INFO +0 -13
- lightweight_table_diff-0.1.3/src/lightweight_table_diff.egg-info/SOURCES.txt +0 -19
- lightweight_table_diff-0.1.3/src/lightweight_table_diff.egg-info/dependency_links.txt +0 -1
- lightweight_table_diff-0.1.3/src/lightweight_table_diff.egg-info/requires.txt +0 -9
- lightweight_table_diff-0.1.3/src/lightweight_table_diff.egg-info/top_level.txt +0 -1
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lightweight-table-diff
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Cell-level table diffing with a Polars engine and Narwhals dataframe input support.
|
|
5
|
+
Requires-Python: >=3.11.1
|
|
6
|
+
Requires-Dist: narwhals>=1.0
|
|
7
|
+
Requires-Dist: polars-checkpoint
|
|
8
|
+
Requires-Dist: polars>=1.0
|
|
9
|
+
Requires-Dist: pyyaml
|
|
10
|
+
Provides-Extra: examples
|
|
11
|
+
Requires-Dist: pyspark; extra == 'examples'
|
|
12
|
+
Requires-Dist: raz-client; extra == 'examples'
|
|
13
|
+
Requires-Dist: types-boto3[s3]; extra == 'examples'
|
|
14
|
+
Provides-Extra: readstat
|
|
15
|
+
Requires-Dist: polars-readstat; extra == 'readstat'
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# API shape
|
|
2
|
+
|
|
3
|
+
The main API is simple:
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from lightweight_table_diff import diff
|
|
7
|
+
|
|
8
|
+
result = diff("source.parquet", "target.parquet", keys="id")
|
|
9
|
+
|
|
10
|
+
# Check how many cells changed
|
|
11
|
+
print(f"Found {result.n_diffs} cell differences.")
|
|
12
|
+
|
|
13
|
+
# Access the underlying Polars LazyFrame of differences
|
|
14
|
+
diff_df = result.diff.collect()
|
|
15
|
+
|
|
16
|
+
# Write all structural changes and cell differences to CSVs
|
|
17
|
+
result.write("./diff_output_directory")
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Supported inputs:
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
# 1. Most common: file paths
|
|
24
|
+
result = diff("source.parquet", "target.parquet", keys="id")
|
|
25
|
+
|
|
26
|
+
# 2. Dataframe-native: Polars, pandas, PyArrow, etc. via Narwhals
|
|
27
|
+
result = diff(source_df, target_df, keys="id")
|
|
28
|
+
|
|
29
|
+
# 3. One-off infrastructure: zero-argument callables returning supported inputs
|
|
30
|
+
result = diff(lambda: load_source(), lambda: load_target(), keys="id")
|
|
31
|
+
|
|
32
|
+
# functools.partial also works here, as long as the resulting callable takes no arguments
|
|
33
|
+
from functools import partial
|
|
34
|
+
|
|
35
|
+
result = diff(
|
|
36
|
+
partial(load_table, spark=spark, name="prod.source_people"),
|
|
37
|
+
partial(load_table, spark=spark, name="prod.target_people"),
|
|
38
|
+
keys="id",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# 4. Config/repeated infrastructure: one-item dicts of {loader_name: payload}
|
|
42
|
+
result = diff(
|
|
43
|
+
{"spark": "prod.source_people"},
|
|
44
|
+
{"spark": "prod.target_people"},
|
|
45
|
+
keys="id",
|
|
46
|
+
loaders={"spark": load_spark_table},
|
|
47
|
+
)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
A loader receives the payload exactly as supplied and returns any supported input:
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
def load_spark_table(table_name):
|
|
54
|
+
return spark.table(table_name)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Payloads can be structured however the loader wants:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
def load_extract(args):
|
|
61
|
+
return get_extract(
|
|
62
|
+
dataset=args["dataset"],
|
|
63
|
+
period=args["period"],
|
|
64
|
+
version=args["side"],
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
result = diff(
|
|
68
|
+
{"extract": {"dataset": "people", "period": "2024-01", "side": "source"}},
|
|
69
|
+
{"extract": {"dataset": "people", "period": "2024-01", "side": "target"}},
|
|
70
|
+
keys="id",
|
|
71
|
+
loaders={"extract": load_extract},
|
|
72
|
+
)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
If the loader needs infrastructure (such as a Spark session), bind it yourself with `functools.partial`:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from functools import partial
|
|
79
|
+
|
|
80
|
+
def load_spark_table(table_name, *, spark):
|
|
81
|
+
return spark.table(table_name)
|
|
82
|
+
|
|
83
|
+
result = diff(
|
|
84
|
+
{"spark": "prod.source_people"},
|
|
85
|
+
{"spark": "prod.target_people"},
|
|
86
|
+
keys="id",
|
|
87
|
+
loaders={"spark": partial(load_spark_table, spark=spark)},
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
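YAML config uses the same mechanism: each `source` and `target` is a one-item mapping from a loader name to its payload, and the loaders are passed to `run_config`: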
|
|
92
|
+
|
|
93
|
+
```yaml
|
|
94
|
+
defaults:
|
|
95
|
+
keys: [id]
|
|
96
|
+
normalise: float_strings
|
|
97
|
+
output_dir: ./diff_output
|
|
98
|
+
|
|
99
|
+
comparisons:
|
|
100
|
+
- name: people
|
|
101
|
+
source:
|
|
102
|
+
spark: prod.source_people
|
|
103
|
+
target:
|
|
104
|
+
spark: prod.target_people
|
|
105
|
+
|
|
106
|
+
- name: structured_extract
|
|
107
|
+
source:
|
|
108
|
+
extract:
|
|
109
|
+
dataset: people
|
|
110
|
+
period: 2024-01
|
|
111
|
+
side: source
|
|
112
|
+
target:
|
|
113
|
+
extract:
|
|
114
|
+
dataset: people
|
|
115
|
+
period: 2024-01
|
|
116
|
+
side: target
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from functools import partial
|
|
121
|
+
|
|
122
|
+
from lightweight_table_diff import run_config
|
|
123
|
+
|
|
124
|
+
run_config(
|
|
125
|
+
"diff.yml",
|
|
126
|
+
loaders={"spark": partial(load_spark_table, spark=spark)},
|
|
127
|
+
)
|
|
128
|
+
```
|