lightweight-table-diff 0.1.4__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. lightweight_table_diff-1.0.0/.gitignore +9 -0
  2. lightweight_table_diff-1.0.0/PKG-INFO +15 -0
  3. lightweight_table_diff-1.0.0/README_api_example.md +128 -0
  4. lightweight_table_diff-1.0.0/codebase.md +2116 -0
  5. lightweight_table_diff-1.0.0/examples/cloudera_hive/run_diff.py +111 -0
  6. lightweight_table_diff-1.0.0/pyproject.toml +68 -0
  7. lightweight_table_diff-1.0.0/src/lightweight_table_diff/__init__.py +16 -0
  8. {lightweight_table_diff-0.1.4 → lightweight_table_diff-1.0.0}/src/lightweight_table_diff/__main__.py +16 -16
  9. lightweight_table_diff-1.0.0/src/lightweight_table_diff/api.py +262 -0
  10. lightweight_table_diff-1.0.0/src/lightweight_table_diff/config.py +149 -0
  11. lightweight_table_diff-1.0.0/src/lightweight_table_diff/core.py +130 -0
  12. lightweight_table_diff-1.0.0/src/lightweight_table_diff/dimensions.py +212 -0
  13. {lightweight_table_diff-0.1.4 → lightweight_table_diff-1.0.0}/src/lightweight_table_diff/normalisers.py +59 -42
  14. lightweight_table_diff-1.0.0/src/lightweight_table_diff/resolver.py +318 -0
  15. lightweight_table_diff-1.0.0/src/lightweight_table_diff/result.py +168 -0
  16. lightweight_table_diff-1.0.0/src/lightweight_table_diff/runner.py +112 -0
  17. lightweight_table_diff-1.0.0/tests/test_api.py +119 -0
  18. lightweight_table_diff-1.0.0/tests/test_core.py +63 -0
  19. lightweight_table_diff-1.0.0/tests/test_dimensions.py +84 -0
  20. lightweight_table_diff-1.0.0/tests/test_integration.py +80 -0
  21. lightweight_table_diff-1.0.0/tests/test_normalisers.py +45 -0
  22. lightweight_table_diff-1.0.0/tycheck.log +1 -0
  23. lightweight_table_diff-1.0.0/uv.lock +1138 -0
  24. lightweight_table_diff-0.1.4/PKG-INFO +0 -13
  25. lightweight_table_diff-0.1.4/README.md +0 -0
  26. lightweight_table_diff-0.1.4/pyproject.toml +0 -29
  27. lightweight_table_diff-0.1.4/setup.cfg +0 -4
  28. lightweight_table_diff-0.1.4/src/lightweight_table_diff/__init__.py +0 -5
  29. lightweight_table_diff-0.1.4/src/lightweight_table_diff/adapters/__init__.py +0 -33
  30. lightweight_table_diff-0.1.4/src/lightweight_table_diff/adapters/csv.py +0 -14
  31. lightweight_table_diff-0.1.4/src/lightweight_table_diff/adapters/hive_s3.py +0 -67
  32. lightweight_table_diff-0.1.4/src/lightweight_table_diff/adapters/parquet.py +0 -13
  33. lightweight_table_diff-0.1.4/src/lightweight_table_diff/adapters/sav.py +0 -23
  34. lightweight_table_diff-0.1.4/src/lightweight_table_diff/config.py +0 -40
  35. lightweight_table_diff-0.1.4/src/lightweight_table_diff/core.py +0 -85
  36. lightweight_table_diff-0.1.4/src/lightweight_table_diff/dimensions.py +0 -96
  37. lightweight_table_diff-0.1.4/src/lightweight_table_diff/runner.py +0 -147
  38. lightweight_table_diff-0.1.4/src/lightweight_table_diff.egg-info/PKG-INFO +0 -13
  39. lightweight_table_diff-0.1.4/src/lightweight_table_diff.egg-info/SOURCES.txt +0 -19
  40. lightweight_table_diff-0.1.4/src/lightweight_table_diff.egg-info/dependency_links.txt +0 -1
  41. lightweight_table_diff-0.1.4/src/lightweight_table_diff.egg-info/requires.txt +0 -9
  42. lightweight_table_diff-0.1.4/src/lightweight_table_diff.egg-info/top_level.txt +0 -1
@@ -0,0 +1,9 @@
1
+ # python
2
+ *.pyc
3
+ __pycache__
4
+ *.egg-info
5
+ .venv
6
+ venv
7
+
8
+ # vscode
9
+ .vscode/
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: lightweight-table-diff
3
+ Version: 1.0.0
4
+ Summary: Cell-level table diffing with a Polars engine and Narwhals dataframe input support.
5
+ Requires-Python: >=3.11.1
6
+ Requires-Dist: narwhals>=1.0
7
+ Requires-Dist: polars-checkpoint
8
+ Requires-Dist: polars>=1.0
9
+ Requires-Dist: pyyaml
10
+ Provides-Extra: examples
11
+ Requires-Dist: pyspark; extra == 'examples'
12
+ Requires-Dist: raz-client; extra == 'examples'
13
+ Requires-Dist: types-boto3[s3]; extra == 'examples'
14
+ Provides-Extra: readstat
15
+ Requires-Dist: polars-readstat; extra == 'readstat'
@@ -0,0 +1,128 @@
1
+ # API shape
2
+
3
+ The main API is simple:
4
+
5
+ ```python
6
+ from lightweight_table_diff import diff
7
+
8
+ result = diff("source.parquet", "target.parquet", keys="id")
9
+
10
+ # Check how many cells changed
11
+ print(f"Found {result.n_diffs} cell differences.")
12
+
13
+ # Access the underlying Polars LazyFrame of differences
14
+ diff_df = result.diff.collect()
15
+
16
+ # Write all structural changes and cell differences to CSVs
17
+ result.write("./diff_output_directory")
18
+ ```
19
+
20
+ Supported inputs:
21
+
22
+ ```python
23
+ # 1. Most common: file paths
24
+ result = diff("source.parquet", "target.parquet", keys="id")
25
+
26
+ # 2. Dataframe-native: Polars, pandas, PyArrow, etc. via Narwhals
27
+ result = diff(source_df, target_df, keys="id")
28
+
29
+ # 3. One-off infrastructure: zero-argument callables returning supported inputs
30
+ result = diff(lambda: load_source(), lambda: load_target(), keys="id")
31
+
32
+ # functools.partial also works here, as long as the final callable takes no arguments
33
+ from functools import partial
34
+
35
+ result = diff(
36
+ partial(load_table, spark=spark, name="prod.source_people"),
37
+ partial(load_table, spark=spark, name="prod.target_people"),
38
+ keys="id",
39
+ )
40
+
41
+ # 4. Config/repeated infrastructure: one-item dicts mapping a loader name to its payload
42
+ result = diff(
43
+ {"spark": "prod.source_people"},
44
+ {"spark": "prod.target_people"},
45
+ keys="id",
46
+ loaders={"spark": load_spark_table},
47
+ )
48
+ ```
49
+
50
+ A loader receives the payload exactly as supplied and returns any supported input:
51
+
52
+ ```python
53
+ def load_spark_table(table_name):
54
+ return spark.table(table_name)
55
+ ```
56
+
57
+ Payloads can be structured however the loader wants:
58
+
59
+ ```python
60
+ def load_extract(args):
61
+ return get_extract(
62
+ dataset=args["dataset"],
63
+ period=args["period"],
64
+ version=args["side"],
65
+ )
66
+
67
+ result = diff(
68
+ {"extract": {"dataset": "people", "period": "2024-01", "side": "source"}},
69
+ {"extract": {"dataset": "people", "period": "2024-01", "side": "target"}},
70
+ keys="id",
71
+ loaders={"extract": load_extract},
72
+ )
73
+ ```
74
+
75
+ If the loader needs infrastructure, bind it yourself:
76
+
77
+ ```python
78
+ from functools import partial
79
+
80
+ def load_spark_table(table_name, *, spark):
81
+ return spark.table(table_name)
82
+
83
+ result = diff(
84
+ {"spark": "prod.source_people"},
85
+ {"spark": "prod.target_people"},
86
+ keys="id",
87
+ loaders={"spark": partial(load_spark_table, spark=spark)},
88
+ )
89
+ ```
90
+
91
+ YAML config uses the same mechanism:
92
+
93
+ ```yaml
94
+ defaults:
95
+ keys: [id]
96
+ normalise: float_strings
97
+ output_dir: ./diff_output
98
+
99
+ comparisons:
100
+ - name: people
101
+ source:
102
+ spark: prod.source_people
103
+ target:
104
+ spark: prod.target_people
105
+
106
+ - name: structured_extract
107
+ source:
108
+ extract:
109
+ dataset: people
110
+ period: 2024-01
111
+ side: source
112
+ target:
113
+ extract:
114
+ dataset: people
115
+ period: 2024-01
116
+ side: target
117
+ ```
118
+
119
+ ```python
120
+ from functools import partial
121
+
122
+ from lightweight_table_diff import run_config
123
+
124
+ run_config(
125
+ "diff.yml",
126
+ loaders={"spark": partial(load_spark_table, spark=spark)},
127
+ )
128
+ ```