diffolars 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffolars-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: diffolars
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Yet another data diff
|
|
5
|
+
Author: Kenneth O'Dell
|
|
6
|
+
Author-email: Kenneth O'Dell <kenneth.l.odell@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Requires-Dist: dotenv>=0.9.9
|
|
9
|
+
Requires-Dist: pandas>=3.0.3
|
|
10
|
+
Requires-Dist: polars>=1.41.2
|
|
11
|
+
Requires-Dist: pyarrow>=24.0.0
|
|
12
|
+
Requires-Dist: pyodbc>=5.3.0
|
|
13
|
+
Requires-Dist: pytest>=9.1.1
|
|
14
|
+
Requires-Dist: ruff>=0.15.20
|
|
15
|
+
Requires-Dist: sqlalchemy>=2.0.50
|
|
16
|
+
Requires-Python: >=3.13
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# diffolars
|
|
20
|
+
|
|
21
|
+
A small [Polars](https://pola.rs)-based toolkit for
|
|
22
|
+
comparing two versions of a dataframe and generating randomized test data to
|
|
23
|
+
exercise that comparison.
|
|
24
|
+
|
|
25
|
+
Intended for comparing data loads in the day-to-day work of a database analyst.
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv add diffolars
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Generating test data
|
|
34
|
+
|
|
35
|
+
`diffolars.demo` generates a random dataframe and a mutated copy of it, useful
|
|
36
|
+
for testing diff logic without hand-crafting fixtures.
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from diffolars.demo import get_df_pair
|
|
40
|
+
|
|
41
|
+
pair = get_df_pair(
|
|
42
|
+
n_rows=100,
|
|
43
|
+
n_cols=10,
|
|
44
|
+
n_new_rows=5, # rows added in the mutated copy
|
|
45
|
+
n_new_cols=2, # columns added in the mutated copy
|
|
46
|
+
coverage=0.1, # fraction of existing cells randomly changed
|
|
47
|
+
seed=42,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
original = pair["original"]
|
|
51
|
+
mutated = pair["mutated"]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Every generated row gets a `record_id` UUID column, used to match rows
|
|
55
|
+
between the original and mutated dataframes. `get_random_data` and
|
|
56
|
+
`get_mutated_data` are also available individually if you want to generate or
|
|
57
|
+
mutate a dataframe on its own.
|
|
58
|
+
|
|
59
|
+
## Diffing
|
|
60
|
+
|
|
61
|
+
`diffolars.diff` will define the comparison API (row/column intersection and
|
|
62
|
+
symmetric difference, schema comparison) and is under active development...
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# diffolars
|
|
2
|
+
|
|
3
|
+
A small [Polars](https://pola.rs)-based toolkit for
|
|
4
|
+
comparing two versions of a dataframe and generating randomized test data to
|
|
5
|
+
exercise that comparison.
|
|
6
|
+
|
|
7
|
+
Intended for comparing data loads in the day-to-day work of a database analyst.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv add diffolars
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Generating test data
|
|
16
|
+
|
|
17
|
+
`diffolars.demo` generates a random dataframe and a mutated copy of it, useful
|
|
18
|
+
for testing diff logic without hand-crafting fixtures.
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from diffolars.demo import get_df_pair
|
|
22
|
+
|
|
23
|
+
pair = get_df_pair(
|
|
24
|
+
n_rows=100,
|
|
25
|
+
n_cols=10,
|
|
26
|
+
n_new_rows=5, # rows added in the mutated copy
|
|
27
|
+
n_new_cols=2, # columns added in the mutated copy
|
|
28
|
+
coverage=0.1, # fraction of existing cells randomly changed
|
|
29
|
+
seed=42,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
original = pair["original"]
|
|
33
|
+
mutated = pair["mutated"]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Every generated row gets a `record_id` UUID column, used to match rows
|
|
37
|
+
between the original and mutated dataframes. `get_random_data` and
|
|
38
|
+
`get_mutated_data` are also available individually if you want to generate or
|
|
39
|
+
mutate a dataframe on its own.
|
|
40
|
+
|
|
41
|
+
## Diffing
|
|
42
|
+
|
|
43
|
+
`diffolars.diff` will define the comparison API (row/column intersection and
|
|
44
|
+
symmetric difference, schema comparison) and is under active development...
|
|
45
|
+
|
|
46
|
+
## License
|
|
47
|
+
|
|
48
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "diffolars"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Yet another data diff"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Kenneth O'Dell", email = "kenneth.l.odell@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
license = "MIT"
|
|
10
|
+
requires-python = ">=3.13"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"dotenv>=0.9.9",
|
|
13
|
+
"pandas>=3.0.3",
|
|
14
|
+
"polars>=1.41.2",
|
|
15
|
+
"pyarrow>=24.0.0",
|
|
16
|
+
"pyodbc>=5.3.0",
|
|
17
|
+
"pytest>=9.1.1",
|
|
18
|
+
"ruff>=0.15.20",
|
|
19
|
+
"sqlalchemy>=2.0.50",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["uv_build>=0.11.2,<0.12.0"]
|
|
24
|
+
build-backend = "uv_build"
|
|
File without changes
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The `demo` module provides functions
|
|
3
|
+
to generate a random initial `polars.DataFrame`
|
|
4
|
+
and a mutated copy.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import itertools
|
|
8
|
+
import random
|
|
9
|
+
import string
|
|
10
|
+
import uuid
|
|
11
|
+
from datetime import datetime, timedelta
|
|
12
|
+
|
|
13
|
+
import polars as pl
|
|
14
|
+
|
|
15
|
+
# The default include types;
|
|
16
|
+
# the randomized data is guaranteed to include a column
|
|
17
|
+
# with these data types. Note that the data types require
|
|
18
|
+
# their own generators.
|
|
19
|
+
DEFAULT_INCLUDE_TYPES = {int, float, str, datetime}


# Per-type value factories. Each one takes a `random.Random` instance and
# returns a single freshly generated value of the corresponding type.
def _random_int(rng):
    """Return a random integer in [-1_000_000, 1_000_000]."""
    return rng.randint(-1_000_000, 1_000_000)


def _random_float(rng):
    """Return a random float drawn uniformly from [-1_000_000.0, 1_000_000.0]."""
    return rng.uniform(-1_000_000.0, 1_000_000.0)


def _random_str(rng):
    """Return a random string of 10 ASCII letters."""
    letters = rng.choices(string.ascii_letters, k=10)
    return "".join(letters)


def _random_datetime(rng):
    """Return 2000-01-01 plus a random offset of up to 25 * 365 days of seconds."""
    offset_seconds = rng.randint(0, 60 * 60 * 24 * 365 * 25)
    return datetime(2000, 1, 1) + timedelta(seconds=offset_seconds)


# Dispatch table: Python type -> generator for random values of that type.
_GENERATORS = {
    int: _random_int,
    float: _random_float,
    str: _random_str,
    datetime: _random_datetime,
}
|
|
29
|
+
|
|
30
|
+
# lambda functions for mutating an existing value, keeping its type.
|
|
31
|
+
# to keep things simple, all of the operators are '+'
|
|
32
|
+
def _nonzero_randint(rng, bound):
    """Return a uniformly random integer in [-bound, bound], excluding 0.

    A zero delta would leave the "mutated" value equal to the original, so
    the cell would not actually differ between the two frames.
    """
    magnitude = rng.randint(1, bound)
    return magnitude if rng.random() < 0.5 else -magnitude


# Per-type mutators. Each one takes a `random.Random` instance and an existing
# value, and returns a new value of the same type. All of them "add" something
# to the input (a number, a string suffix, a time delta).
_MUTATORS = {
    # integer delta in [-1_000, 1_000] \ {0}: the result always differs from v
    int: lambda rng, v: v + _nonzero_randint(rng, 1_000),
    float: lambda rng, v: v + rng.uniform(-1_000.0, 1_000.0),
    # appending 5 letters always changes the string
    str: lambda rng, v: v + "".join(rng.choices(string.ascii_letters, k=5)),
    # shift by a non-zero number of seconds in [-100_000, 100_000]
    datetime: lambda rng, v: v + timedelta(seconds=_nonzero_randint(rng, 100_000)),
}
|
|
38
|
+
|
|
39
|
+
# generates a random data frame
|
|
40
|
+
def get_random_data(
    n_rows: int,
    n_cols: int,
    include_types: set[type] = DEFAULT_INCLUDE_TYPES,
    seed: int | None = None,
) -> pl.DataFrame:
    """Generate a random Polars dataframe of test data.

    Every row gets a "record_id" UUID string column in addition to the
    n_cols generated columns. include_types is cycled across the columns
    (in type-name order), so n_cols must be at least len(include_types)
    for every requested type to appear at least once.

    Parameters
    ==========

    n_rows: number of rows to generate.

    n_cols: number of generated columns (not counting "record_id").
        Must be >= len(include_types).

    include_types: set of types to cycle through when naming and
        generating columns. Every type must have an entry in
        `_GENERATORS`.

    seed: optional seed for reproducible output. The same seed yields the
        same record ids, column names and values.

    Returns
    =======

    Randomly generated `polars.DataFrame` with columns "record_id" and
    "col_<index>_<type name>".

    Raises
    ======

    ValueError: if include_types contains a type without a generator, if
        n_cols < len(include_types), or if include_types is empty while
        n_cols > 0.
    """

    # check if any unsupported types were passed
    unsupported = include_types - _GENERATORS.keys()
    if unsupported:
        raise ValueError(f"Unsupported types in include_types: {unsupported}")

    # cannot specify fewer columns than number of specified included types.
    if n_cols < len(include_types):
        raise ValueError(
            f"n_cols ({n_cols}) must be >= number of include_types ({len(include_types)})"
        )

    # cycling over zero types would otherwise fail with ZeroDivisionError below
    if n_cols > 0 and not include_types:
        raise ValueError("include_types must not be empty when n_cols > 0")

    rng = random.Random(seed)

    # A set has no defined iteration order, so sort by type name to make the
    # column layout depend only on the arguments.
    types = sorted(include_types, key=lambda t: t.__name__)

    # Draw the record ids from the seeded generator (instead of uuid.uuid4())
    # so that a fixed seed reproduces the whole frame, ids included.
    data = {
        "record_id": [
            str(uuid.UUID(int=rng.getrandbits(128), version=4)) for _ in range(n_rows)
        ]
    }

    for col_idx in range(n_cols):
        # wrap around the type list so columns beyond len(types) reuse the
        # types in the same order
        col_type = types[col_idx % len(types)]
        gen = _GENERATORS[col_type]
        data[f"col_{col_idx}_{col_type.__name__}"] = [gen(rng) for _ in range(n_rows)]

    return pl.DataFrame(data)
|
|
109
|
+
|
|
110
|
+
def get_mutated_data(
    original_df: pl.DataFrame,
    coverage: float = 0.1,  # fraction of the (rows x data columns) cells that gets mutated
    n_new_rows: int = 0,
    n_new_cols: int = 0,
    include_types: set[type] = DEFAULT_INCLUDE_TYPES,
    seed: int | None = None,
) -> pl.DataFrame:
    """
    Given an input `polars.DataFrame`, returns a mutated version.

    The mutated version may include additional columns and rows,
    along with randomly mutated fields in the data matrix. The input
    frame is converted to plain Python lists first and a new frame is
    built from them.

    The order of operations is:

    (1) Mutate existing table cells
    (2) Add new rows
    (3) Add new columns

    Parameters
    ==========

    original_df: the frame to mutate. It needs a "record_id" column when
        n_new_rows > 0.

    coverage: fraction (0.0-1.0) of existing data cells (every column
        except "record_id") that get randomly nudged in place, e.g. a
        number gets added to, a string gets concatenated, a datetime gets
        shifted.

    n_new_rows: number of additional randomly generated rows to append,
        each with its own new "record_id". The type of each column is
        taken from the first value of that column.

    n_new_cols: number of additional randomly generated columns to
        append, cycling through `include_types` (in type-name order) the
        same way as `get_random_data`.

    include_types: set of types used for the new columns; only consulted
        when n_new_cols > 0.

    seed: optional seed for reproducible output.

    Returns
    ========

    Mutated `polars.DataFrame`

    Raises
    ======

    ValueError: if coverage is outside [0.0, 1.0]; if n_new_rows > 0 but
        original_df has no rows; or, when n_new_cols > 0, if include_types
        is empty or contains a type without a generator.

    KeyError: if a sampled cell (or the first value of a column when
        adding rows) has a type with no entry in `_MUTATORS` /
        `_GENERATORS` (this includes None), or if "record_id" is missing
        when adding rows.
    """

    # check user input for the data coverage. Must be in range [0,1]
    if not 0.0 <= coverage <= 1.0:
        raise ValueError(f"coverage ({coverage}) must be between 0.0 and 1.0")

    rng = random.Random(seed)

    # all of the data columns (everything except the record_id)
    data_cols = [c for c in original_df.columns if c != "record_id"]
    n_rows = original_df.height
    n_data_cols = len(data_cols)

    # pull the data out as a dict of lists so individual cells can be mutated
    # and rows/columns appended without rebuilding the frame each time
    data = original_df.to_dict(as_series=False)

    # (1) Mutate a random sample of cells. Cells are addressed by a flat
    # row-major index, which picks the same cells as sampling from the
    # materialised list of (row, column) pairs without building that list.
    n_cells = n_rows * n_data_cols
    n_mutate = round(coverage * n_cells)

    for flat_idx in rng.sample(range(n_cells), n_mutate):
        row_idx, col_pos = divmod(flat_idx, n_data_cols)
        column = data[data_cols[col_pos]]
        value = column[row_idx]
        # pick the mutator that matches the type of the existing value
        column[row_idx] = _MUTATORS[type(value)](rng, value)

    # (2) Append n_new_rows, reusing each existing column's data type.
    if n_new_rows > 0:
        if n_rows == 0:
            raise ValueError("cannot add rows: original_df has no rows to infer column types from")

        # New record ids come from the seeded generator so the result is
        # reproducible. Ids already present are skipped, because the input
        # may have been generated from the same seed.
        taken_ids = set(data["record_id"])
        new_ids = []
        while len(new_ids) < n_new_rows:
            candidate = str(uuid.UUID(int=rng.getrandbits(128), version=4))
            if candidate not in taken_ids:
                taken_ids.add(candidate)
                new_ids.append(candidate)
        data["record_id"].extend(new_ids)

        # for each column of the original input, generate data for the new rows
        for col_name in data_cols:
            gen = _GENERATORS[type(data[col_name][0])]
            data[col_name].extend(gen(rng) for _ in range(n_new_rows))

    total_rows = n_rows + n_new_rows

    # (3) Append n_new_cols, cycling through include_types.
    if n_new_cols > 0:
        unsupported = include_types - _GENERATORS.keys()
        if unsupported:
            raise ValueError(f"Unsupported types in include_types: {unsupported}")

        # cycling over zero types would otherwise fail with ZeroDivisionError
        if not include_types:
            raise ValueError("include_types must not be empty when n_new_cols > 0")

        # A set has no defined iteration order; sort by type name so the new
        # columns depend only on the arguments.
        types = sorted(include_types, key=lambda t: t.__name__)

        for offset in range(n_new_cols):
            col_type = types[offset % len(types)]
            gen = _GENERATORS[col_type]
            # continue numbering after the existing data columns
            col_idx = n_data_cols + offset

            data[f"col_{col_idx}_{col_type.__name__}"] = [
                gen(rng) for _ in range(total_rows)
            ]

    return pl.DataFrame(data)
|
|
213
|
+
|
|
214
|
+
def get_df_pair(
    n_cols: int,
    n_rows: int,
    *,
    n_new_rows: int = 0,
    n_new_cols: int = 0,
    seed: int | None = None,
    included_types: set[type] = DEFAULT_INCLUDE_TYPES,
    coverage: float = 0.1,
) -> dict[str, pl.DataFrame]:
    """Prepares a pair of original and mutated `polars.DataFrame`'s.

    Note that the positional order here is (n_cols, n_rows), the reverse of
    `get_random_data`; passing both by keyword avoids mixing them up.

    Parameters
    ==========

    n_cols: number of generated columns in the original frame.

    n_rows: number of rows in the original frame.

    n_new_rows: rows added in the mutated copy.

    n_new_cols: columns added in the mutated copy.

    seed: optional seed. When given, it drives both the generation of the
        original frame and the mutation step.

    included_types: set of types to cycle through for generated columns.

    coverage: fraction of existing cells randomly changed in the mutated copy.

    Returns
    =======

    A dict with the keys "original" and "mutated".
    """
    print(f"Generating initial dataset with {n_rows} rows and {n_cols} columns.")
    df = get_random_data(
        n_rows=n_rows, n_cols=n_cols, include_types=included_types, seed=seed
    )

    # Forward the seed so the mutation is reproducible too. Both functions
    # build `random.Random(seed)`, so an offset seed is used to keep the
    # mutation step from replaying the random stream that produced `df`.
    mutation_seed = None if seed is None else seed + 1

    mut_df = get_mutated_data(
        df,
        coverage=coverage,
        n_new_rows=n_new_rows,
        n_new_cols=n_new_cols,
        include_types=included_types,
        seed=mutation_seed,
    )
    print("Generated mutated dataset.")
    return {
        "original": df,
        "mutated": mut_df,
    }
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
def deduplicate(orig: pl.DataFrame, mut: pl.DataFrame) -> pl.DataFrame:
    """Given two input tables, identifies exact row-to-row matches based on a checksum.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("deduplicate is not implemented yet")
|
|
6
|
+
|
|
7
|
+
def row_intercept(orig: pl.DataFrame, mut: pl.DataFrame, record_id_col: str) -> list[str]:
    """Identifies shared rows, according to a specified record ID column.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("row_intercept is not implemented yet")
|
|
10
|
+
|
|
11
|
+
def row_symmetric_diff(orig: pl.DataFrame, mut: pl.DataFrame, record_id_col: str) -> list[str]:
    """Identifies sets of rows not shared between the two input dataframes,
    according to a specified record ID column.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("row_symmetric_diff is not implemented yet")
|
|
15
|
+
|
|
16
|
+
def has_same_schema(orig: pl.DataFrame, mut: pl.DataFrame) -> bool:
    """Return False if two dataframe schemas do not match.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which is falsy and would be read by callers as
    "the schemas differ".

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("has_same_schema is not implemented yet")
|
|
19
|
+
|
|
20
|
+
def parse_schema(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> dict[str, set[str]]:
    """
    Parses the schema of two dataframes and outputs results in the stdout.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("parse_schema is not implemented yet")
|
|
25
|
+
|
|
26
|
+
def column_intercept(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> set[str]:
    """
    Finds and returns the set of shared columns between two input dataframes.
    Equal columns must have the same column name and data type, excluding the suffix.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("column_intercept is not implemented yet")
|
|
32
|
+
|
|
33
|
+
def column_symmetric_diff(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> dict[str, set[str]]:
    """
    Finds and returns the set of different columns between two input dataframes.
    Different columns may differ by name or data type, excluding the suffix.

    Placeholder: the comparison is not implemented yet. It raises instead of
    silently returning None, which would contradict the annotated return type.

    Raises
    ======

    NotImplementedError: always.
    """
    raise NotImplementedError("column_symmetric_diff is not implemented yet")
|
|
File without changes
|