diffolars 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffolars/__init__.py ADDED
File without changes
diffolars/demo.py ADDED
@@ -0,0 +1,233 @@
1
+ """
2
+ The `demo` module provides functions
3
+ to generate a random initial `polars.DataFrame`
4
+ and a mutated copy.
5
+ """
6
+
7
+ import itertools
8
+ import random
9
+ import string
10
+ import uuid
11
+ from datetime import datetime, timedelta
12
+
13
+ import polars as pl
14
+
15
# Column types used when the caller does not choose any. Every type listed
# here needs an entry in both lookup tables below (one to create values,
# one to mutate them).
DEFAULT_INCLUDE_TYPES = {int, float, str, datetime}


def _new_int(rng):
    """Draw a random integer in [-1_000_000, 1_000_000]."""
    return rng.randint(-1_000_000, 1_000_000)


def _new_float(rng):
    """Draw a random float between -1_000_000.0 and 1_000_000.0."""
    return rng.uniform(-1_000_000.0, 1_000_000.0)


def _new_str(rng):
    """Draw a random string of 10 ASCII letters."""
    letters = rng.choices(string.ascii_letters, k=10)
    return "".join(letters)


def _new_datetime(rng):
    """Draw a datetime up to 25 (365-day) years after 2000-01-01."""
    offset_seconds = rng.randint(0, 60 * 60 * 24 * 365 * 25)
    return datetime(2000, 1, 1) + timedelta(seconds=offset_seconds)


# Value factories keyed by Python type: each takes a `random.Random` and
# returns one fresh random value of that type.
_GENERATORS = {
    int: _new_int,
    float: _new_float,
    str: _new_str,
    datetime: _new_datetime,
}


def _nudge_int(rng, current):
    """Shift an integer by a random amount in [-1_000, 1_000]."""
    return current + rng.randint(-1_000, 1_000)


def _nudge_float(rng, current):
    """Shift a float by a random amount between -1_000.0 and 1_000.0."""
    return current + rng.uniform(-1_000.0, 1_000.0)


def _nudge_str(rng, current):
    """Append 5 random ASCII letters to a string."""
    tail = rng.choices(string.ascii_letters, k=5)
    return current + "".join(tail)


def _nudge_datetime(rng, current):
    """Shift a datetime by a random number of seconds in [-100_000, 100_000]."""
    return current + timedelta(seconds=rng.randint(-100_000, 100_000))


# Mutators keyed by Python type: each takes a `random.Random` and an existing
# value and returns a changed value of the same type. To keep things simple,
# every mutation is a '+' applied to the existing value.
_MUTATORS = {
    int: _nudge_int,
    float: _nudge_float,
    str: _nudge_str,
    datetime: _nudge_datetime,
}
38
+
39
+ # generates a random data frame
40
def get_random_data(
    n_rows: int,
    n_cols: int,
    include_types: set[type] = DEFAULT_INCLUDE_TYPES,
    seed: int | None = None,
) -> pl.DataFrame:
    """Generate a random Polars dataframe of test data.

    Every row gets a "record_id" uuid column in addition to the n_cols
    generated columns. include_types is cycled across the columns in a
    fixed, name-sorted order, so n_cols must be at least
    len(include_types) for every requested type to appear.

    Parameters
    ==========

    n_rows: number of rows to generate.

    n_cols: number of generated columns (not counting "record_id").
        Must be >= len(include_types).

    include_types: types to cycle through when naming and generating
        columns. Each one needs an entry in `_GENERATORS`.

    seed: optional seed for reproducible output. With a seed, the column
        layout, the cell values and the record ids are all repeatable.

    Returns
    =======

    Randomly generated `polars.DataFrame`

    Raises
    ======

    ValueError: if include_types contains an unsupported type, if n_cols
        is smaller than len(include_types), or if include_types is empty
        while n_cols > 0.
    """
    requested = set(include_types)

    # check if any unsupported types were passed
    unsupported = requested - _GENERATORS.keys()
    if unsupported:
        raise ValueError(f"Unsupported types in include_types: {unsupported}")

    # cannot specify fewer columns than number of specified included types.
    if n_cols < len(requested):
        raise ValueError(
            f"n_cols ({n_cols}) must be >= number of include_types ({len(requested)})"
        )

    # with no types there is nothing to cycle through (and `% 0` would fail)
    if n_cols > 0 and not requested:
        raise ValueError("include_types must not be empty when n_cols > 0")

    rng = random.Random(seed)

    # A set of type objects iterates in hash order, which can change from one
    # interpreter run to the next; sorting by name keeps the column layout
    # stable so that a seed really does reproduce the same frame.
    types = sorted(requested, key=lambda t: t.__name__)

    columns = {}
    for col_idx in range(n_cols):
        # wrap around the type list so that columns beyond len(types)
        # reuse the types in the same order
        col_type = types[col_idx % len(types)]
        gen = _GENERATORS[col_type]
        columns[f"col_{col_idx}_{col_type.__name__}"] = [gen(rng) for _ in range(n_rows)]

    # Draw the ids from the seeded generator (uuid.uuid4() would ignore the
    # seed). They are drawn after the column values so the value stream for
    # a given seed is not shifted by the id generation.
    record_ids = [
        str(uuid.UUID(int=rng.getrandbits(128), version=4)) for _ in range(n_rows)
    ]

    # "record_id" stays the first column of the frame.
    return pl.DataFrame({"record_id": record_ids, **columns})
109
+
110
def get_mutated_data(
    original_df: pl.DataFrame,
    coverage: float = 0.1,  # fraction (0.0-1.0) of the data cells that get mutated
    n_new_rows: int = 0,
    n_new_cols: int = 0,
    include_types: set[type] = DEFAULT_INCLUDE_TYPES,
    seed: int | None = None,
) -> pl.DataFrame:
    """
    Given an input `polars.DataFrame`, returns a mutated version.

    The mutated version may include additional columns and rows,
    along with randomly mutated fields in the data matrix. The input
    frame itself is not modified.

    The order of operations is:

    (1) Mutate existing table cells
    (2) Add new rows
    (3) Add new columns

    Parameters
    ==========

    original_df: the dataframe to mutate. Its "record_id" column is never
        mutated; every other column is treated as a data column.

    coverage: fraction (0.0-1.0) of existing data cells (every column
        except "record_id") that get randomly nudged in place, e.g. a
        number gets added to, a string gets concatenated, a datetime gets
        shifted. Null cells that are sampled are left as null.

    n_new_rows: number of additional randomly generated rows to append,
        each with its own new "record_id". Values <= 0 add nothing.

    n_new_cols: number of additional randomly generated columns to
        append, cycling through `include_types` in name-sorted order.
        Values <= 0 add nothing.

    include_types: types to cycle through for the new columns. Each one
        needs an entry in `_GENERATORS`.

    seed: optional seed for reproducible output.

    Returns
    ========

    Mutated `polars.DataFrame`

    Raises
    ======

    ValueError: if coverage is outside [0.0, 1.0], if rows are requested
        for a frame with no rows, or if include_types is empty or contains
        an unsupported type while n_new_cols > 0.

    KeyError: if a non-null value has a type with no mutator/generator, or
        if rows are requested and the frame has no "record_id" column.
    """
    # check user input for the data coverage. Must be in range [0,1]
    if not 0.0 <= coverage <= 1.0:
        raise ValueError(f"coverage ({coverage}) must be between 0.0 and 1.0")

    rng = random.Random(seed)

    # get all of the data columns (except the record_id)
    data_cols = [c for c in original_df.columns if c != "record_id"]
    n_rows = original_df.height

    # pull the data out as plain lists so individual cells can be mutated and
    # rows/columns appended without rebuilding the frame each time
    data = original_df.to_dict(as_series=False)

    # (1) Mutate a random sample of cells. Cells are numbered row-major
    # (flat index = row * n_data_cols + column position), so sampling from a
    # range picks distinct cells without materialising every (row, column)
    # pair in memory.
    n_data_cols = len(data_cols)
    n_cells = n_rows * n_data_cols
    n_mutate = round(coverage * n_cells)

    for flat_idx in rng.sample(range(n_cells), n_mutate):
        row_idx, col_pos = divmod(flat_idx, n_data_cols)
        column = data[data_cols[col_pos]]
        value = column[row_idx]
        if value is None:
            # a null has no concrete type to pick a mutator for; keep it null
            continue
        # the mutator is chosen by the existing value's type, so the
        # column keeps its type
        column[row_idx] = _MUTATORS[type(value)](rng, value)

    # (2) Append n_new_rows, reusing each existing column's value type.
    if n_new_rows > 0:
        if n_rows == 0:
            raise ValueError("cannot add rows: original_df has no rows to infer column types from")

        existing_ids = set(data["record_id"])

        for col_name in data_cols:
            # infer the type from the first non-null value rather than row 0,
            # which may be null
            sample_value = next((v for v in data[col_name] if v is not None), None)
            if sample_value is None:
                # all-null column: no type to infer, so pad with nulls
                data[col_name].extend([None] * n_new_rows)
                continue
            gen = _GENERATORS[type(sample_value)]
            data[col_name].extend(gen(rng) for _ in range(n_new_rows))

        # Ids come from the seeded generator (uuid.uuid4() would ignore the
        # seed); skip any candidate already in use so ids stay unique.
        new_ids = []
        while len(new_ids) < n_new_rows:
            candidate = str(uuid.UUID(int=rng.getrandbits(128), version=4))
            if candidate not in existing_ids:
                existing_ids.add(candidate)
                new_ids.append(candidate)
        data["record_id"].extend(new_ids)

    # a negative n_new_rows adds nothing, so it must not shrink the new columns
    total_rows = n_rows + max(n_new_rows, 0)

    # (3) Append n_new_cols, cycling through include_types.
    if n_new_cols > 0:
        requested = set(include_types)

        unsupported = requested - _GENERATORS.keys()
        if unsupported:
            raise ValueError(f"Unsupported types in include_types: {unsupported}")

        # with no types there is nothing to cycle through (and `% 0` would fail)
        if not requested:
            raise ValueError("include_types must not be empty when n_new_cols > 0")

        # set iteration order of type objects is not stable between runs;
        # sort by name so the same seed yields the same new columns
        types = sorted(requested, key=lambda t: t.__name__)

        for offset in range(n_new_cols):
            col_type = types[offset % len(types)]
            gen = _GENERATORS[col_type]
            # continue numbering after the existing data columns
            col_idx = n_data_cols + offset

            data[f"col_{col_idx}_{col_type.__name__}"] = [
                gen(rng) for _ in range(total_rows)
            ]

    return pl.DataFrame(data)
213
+
214
def get_df_pair(
    n_cols: int,
    n_rows: int,
    *,
    n_new_rows: int = 0,
    n_new_cols: int = 0,
    seed: int | None = None,
    included_types: set[type] = DEFAULT_INCLUDE_TYPES,
    coverage: float = 0.1,
) -> dict[str, pl.DataFrame]:
    """Prepares a pair of original and mutated `polars.DataFrame`'s.

    Parameters
    ==========

    n_cols: number of generated columns in the original frame. Note that
        this comes before n_rows here, the reverse of `get_random_data`;
        passing both by keyword avoids mixing them up.

    n_rows: number of rows in the original frame.

    n_new_rows: rows appended in the mutated copy.

    n_new_cols: columns appended in the mutated copy.

    seed: optional seed. It is passed to `get_random_data`, and `seed + 1`
        is passed to `get_mutated_data`, so both steps are seeded but do
        not replay the same random stream.

    included_types: types used for generated columns in both frames.

    coverage: fraction (0.0-1.0) of existing cells changed in the mutated
        copy.

    Returns
    =======

    dict with the keys "original" and "mutated".
    """
    print(f"Generating initial dataset with {n_rows} rows and {n_cols} columns.")
    df = get_random_data(
        n_rows=n_rows,
        n_cols=n_cols,
        include_types=included_types,
        seed=seed,
    )

    # Seed the mutation step too (it was previously left unseeded), using a
    # different seed so it does not repeat the draws that built `df`.
    mutation_seed = None if seed is None else seed + 1
    mut_df = get_mutated_data(
        df,
        coverage=coverage,
        n_new_rows=n_new_rows,
        n_new_cols=n_new_cols,
        include_types=included_types,
        seed=mutation_seed,
    )
    print("Generated mutated dataset.")

    return {
        "original": df,
        "mutated": mut_df,
    }
diffolars/diff.py ADDED
@@ -0,0 +1,38 @@
1
+ import polars as pl
2
+
3
def deduplicate(orig: pl.DataFrame, mut: pl.DataFrame) -> pl.DataFrame:
    """Given two input tables, identifies exact row-to-row matches based on a checksum.

    Not implemented yet. The function raises instead of silently returning
    None, which would contradict the annotated `pl.DataFrame` return type.

    Raises:
        NotImplementedError: always, until the checksum matching is written.
    """
    raise NotImplementedError("deduplicate is not implemented yet")
6
+
7
def row_intercept(orig: pl.DataFrame, mut: pl.DataFrame, record_id_col: str) -> list[str]:
    """Identifies shared rows, according to a specified record ID column.

    Returns the record IDs that occur in both dataframes, each listed once,
    in the order they first appear in `orig`.

    Args:
        orig: the original dataframe.
        mut: the dataframe to compare against.
        record_id_col: name of the ID column; it must exist in both frames
            (polars raises its own column-lookup error otherwise).
    """
    mut_ids = set(mut[record_id_col].to_list())
    orig_ids = orig[record_id_col].to_list()

    # dict.fromkeys drops repeated ids while keeping first-seen order
    return list(dict.fromkeys(rid for rid in orig_ids if rid in mut_ids))
10
+
11
def row_symmetric_diff(orig: pl.DataFrame, mut: pl.DataFrame, record_id_col: str) -> list[str]:
    """Identifies sets of rows not shared between the two input dataframes,
    according to a specified record ID column.

    Returns the record IDs found in exactly one of the two frames: first the
    IDs only in `orig`, then the IDs only in `mut`. Each ID is listed once,
    in first-seen order.

    Args:
        orig: the original dataframe.
        mut: the dataframe to compare against.
        record_id_col: name of the ID column; it must exist in both frames
            (polars raises its own column-lookup error otherwise).
    """
    orig_ids = orig[record_id_col].to_list()
    mut_ids = mut[record_id_col].to_list()
    orig_set = set(orig_ids)
    mut_set = set(mut_ids)

    # dict.fromkeys drops repeated ids while keeping first-seen order
    only_in_orig = dict.fromkeys(rid for rid in orig_ids if rid not in mut_set)
    only_in_mut = dict.fromkeys(rid for rid in mut_ids if rid not in orig_set)
    return [*only_in_orig, *only_in_mut]
15
+
16
def has_same_schema(orig: pl.DataFrame, mut: pl.DataFrame) -> bool:
    """Return True if the two dataframe schemas match, False if they do not.

    Two schemas match when they list the same column names, in the same
    order, with the same data types.
    """
    # Compare (name, dtype) pairs as lists so that column order counts too.
    return list(orig.schema.items()) == list(mut.schema.items())
19
+
20
def parse_schema(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> dict[str, set[str]]:
    """
    Parses the schema of two dataframes and outputs results in the stdout.

    Not implemented yet. The function raises instead of silently returning
    None, which would contradict the annotated `dict[str, set[str]]` return
    type.

    Raises:
        NotImplementedError: always, until the schema parsing is written.
    """
    raise NotImplementedError("parse_schema is not implemented yet")
25
+
26
def column_intercept(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> set[str]:
    """
    Finds and returns the set of shared columns between two input dataframes.
    Equal columns must have the same column name and data type, excluding the suffix.

    Not implemented yet. The function raises instead of silently returning
    None, which would contradict the annotated `set[str]` return type.

    Raises:
        NotImplementedError: always, until the column comparison is written.
    """
    raise NotImplementedError("column_intercept is not implemented yet")
32
+
33
def column_symmetric_diff(orig: pl.DataFrame, mut: pl.DataFrame, suffix: str) -> dict[str, set[str]]:
    """
    Finds and returns the set of different columns between two input dataframes.
    Different columns may differ by name or data type, excluding the suffix.

    Not implemented yet. The function raises instead of silently returning
    None, which would contradict the annotated `dict[str, set[str]]` return
    type.

    Raises:
        NotImplementedError: always, until the column comparison is written.
    """
    raise NotImplementedError("column_symmetric_diff is not implemented yet")
diffolars/py.typed ADDED
File without changes
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: diffolars
3
+ Version: 0.1.0
4
+ Summary: Yet another data diff
5
+ Author: Kenneth O'Dell
6
+ Author-email: Kenneth O'Dell <kenneth.l.odell@gmail.com>
7
+ License-Expression: MIT
8
+ Requires-Dist: dotenv>=0.9.9
9
+ Requires-Dist: pandas>=3.0.3
10
+ Requires-Dist: polars>=1.41.2
11
+ Requires-Dist: pyarrow>=24.0.0
12
+ Requires-Dist: pyodbc>=5.3.0
13
+ Requires-Dist: pytest>=9.1.1
14
+ Requires-Dist: ruff>=0.15.20
15
+ Requires-Dist: sqlalchemy>=2.0.50
16
+ Requires-Python: >=3.13
17
+ Description-Content-Type: text/markdown
18
+
19
+ # diffolars
20
+
21
+ A small [Polars](https://pola.rs)-based toolkit for
22
+ comparing two versions of a dataframe and generating randomized test data to
23
+ exercise that comparison.
24
+
25
+ Intended for comparing data loads in the day-to-day work of a database analyst.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ uv add diffolars
31
+ ```
32
+
33
+ ## Generating test data
34
+
35
+ `diffolars.demo` generates a random dataframe and a mutated copy of it, useful
36
+ for testing diff logic without hand-crafting fixtures.
37
+
38
+ ```python
39
+ from diffolars.demo import get_df_pair
40
+
41
+ pair = get_df_pair(
42
+ n_rows=100,
43
+ n_cols=10,
44
+ n_new_rows=5, # rows added in the mutated copy
45
+ n_new_cols=2, # columns added in the mutated copy
46
+ coverage=0.1, # fraction of existing cells randomly changed
47
+ seed=42,
48
+ )
49
+
50
+ original = pair["original"]
51
+ mutated = pair["mutated"]
52
+ ```
53
+
54
+ Every generated row gets a `record_id` UUID column, used to match rows
55
+ between the original and mutated dataframes. `get_random_data` and
56
+ `get_mutated_data` are also available individually if you want to generate or
57
+ mutate a dataframe on its own.
58
+
59
+ ## Diffing
60
+
61
+ `diffolars.diff` will define the comparison API (row/column intersection and
62
+ symmetric difference, schema comparison) and is under active development.
63
+
64
+ ## License
65
+
66
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,7 @@
1
+ diffolars/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ diffolars/demo.py,sha256=0YEf_mQ1i_71QkIBvUJE14THuY0S2Nd1pczCcUDEYow,8073
3
+ diffolars/diff.py,sha256=RRrVPu1EmydzNZ-DgVBdX9h8uIGZ-57oABj3-9RGCZI,1534
4
+ diffolars/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ diffolars-0.1.0.dist-info/WHEEL,sha256=M4DeIjVCA49okfALADZoWX5JOGwnmHb-JOpQHtI-1c0,80
6
+ diffolars-0.1.0.dist-info/METADATA,sha256=77jbLrjH29cs9Fv16PqbtmlW20A9FnYZjOoBB6ocJMQ,1757
7
+ diffolars-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.2
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any