winipedia-utils 0.4.23__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of winipedia-utils might be problematic.
- winipedia_utils/data/dataframe/cleaning.py +337 -100
- winipedia_utils/git/github/repo/protect.py +1 -2
- {winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/METADATA +41 -2
- {winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/RECORD +6 -6
- {winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/WHEEL +0 -0
- {winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/licenses/LICENSE +0 -0
--- a/winipedia_utils/data/dataframe/cleaning.py
+++ b/winipedia_utils/data/dataframe/cleaning.py
@@ -12,118 +12,201 @@ from typing import Any
 import polars as pl
 from polars.datatypes.classes import FloatType
 
-from winipedia_utils.data.structures.dicts import reverse_dict
 from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
 
 
 class CleaningDF(ABCLoggingMixin):
-    """
-
-    This
-
-
-
-
-
-
-
-
-
+    """A base class for cleaning and standardizing dataframes using Polars.
+
+    This class provides a comprehensive pipeline
+    for importing, cleaning, and standardizing
+    data from various sources before loading into databases or other systems.
+    It enforces data quality standards
+    through a series of configurable cleaning operations.
+
+    The cleaning pipeline executes in the following order:
+    1. Rename columns according to a standardized naming scheme
+    2. Drop columns not in the schema
+    3. Fill null values with specified defaults
+    4. Convert columns to correct data types and apply custom transformations
+    5. Drop rows where specified column subsets are entirely null
+    6. Handle duplicates by aggregating values and removing duplicates
+    7. Sort the dataframe by specified columns
+    8. Validate data quality
+       (correct dtypes, no nulls in required columns, no NaN values)
+
+    Child classes must implement abstract methods to define the cleaning configuration:
+    - get_rename_map(): Define column name mappings
+    - get_col_dtype_map(): Define expected data types for each column
+    - get_drop_null_subsets(): Define which column subsets trigger row deletion
+    - get_fill_null_map(): Define null value fill strategies
+    - get_sort_cols(): Define sort order
+    - get_unique_subsets(): Define duplicate detection criteria
+    - get_no_null_cols(): Define columns that cannot contain nulls
+    - get_col_converter_map(): Define custom column transformations
+    - get_add_on_duplicate_cols(): Define columns to aggregate when duplicates are found
+    - get_col_precision_map(): Define rounding precision for float columns
+
+    Best Practices:
+    - Define column names as string constants in child classes
+      for reusability and maintainability
+    - Use this class to build data cleaning pipelines that can be composed and extended
+    - The class automatically converts NaN to null for consistency
+
+    Example:
+        COL_NAME_1 = "col_name_1"
+        COL_NAME_2 = "col_name_2"
     """
 
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        """Initialize the CleaningDF."""
-        self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
-        self.clean()
-
     @classmethod
     @abstractmethod
     def get_rename_map(cls) -> dict[str, str]:
-        """
+        """Define column name mappings for standardization.
 
-        This method must be implemented in
-
-
-
+        This abstract method must be implemented in child classes to specify how
+        raw input column names should be renamed to standardized names. Renaming
+        is the first operation in the cleaning pipeline, executed before all other
+        cleaning operations.
+
+        The mapping format follows the CleaningDF convention of mapping standardized
+        names to raw input names, allowing the reverse mapping to be applied to the
+        dataframe.
 
         Returns:
-            dict[str, str]: Dictionary mapping
-
+            dict[str, str]: Dictionary mapping standardized column names to raw input
+                column names. Format: {standardized_name: raw_name, ...}
+
+        Example:
+            return {
+                "user_id": "UserId",
+                "email": "Email_Address",
+                "created_at": "CreatedDate"
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
-        """
+        """Define the expected data type for each column in the cleaned dataframe.
 
-        This method must be implemented in
+        This abstract method must be implemented in child classes to specify the
+        target data types for all columns. The dataframe will be validated against
+        this schema after cleaning, and a TypeError will be raised if any column
+        has an incorrect type.
 
         Returns:
-            dict[str, type[pl.DataType]]: Dictionary mapping column names
+            dict[str, type[pl.DataType]]: Dictionary mapping standardized column names
+                to their expected Polars data types.
+
+        Example:
+            return {
+                "user_id": pl.Int64,
+                "email": pl.Utf8,
+                "created_at": pl.Date,
+                "score": pl.Float64
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
-        """
+        """Define column subsets for dropping rows with all-null values.
 
-
-
-
+        This abstract method specifies which column subsets should trigger row deletion.
+        A row is dropped if ALL columns in a subset are null. Multiple subsets can be
+        defined to apply different null-dropping rules. If no subsets are defined,
+        rows where all columns are null will be dropped.
 
         Returns:
-            tuple[tuple[str, ...], ...]: Tuple of tuples
+            tuple[tuple[str, ...], ...]: Tuple of column name tuples, where each inner
+                tuple represents one subset. A row is dropped if all columns in any
+                subset are null.
+
+        Example:
+            return (
+                ("email", "phone"), # Drop if both email and phone are null
+                ("address_line1",), # Drop if address_line1 is null
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_fill_null_map(cls) -> dict[str, Any]:
-        """
+        """Define null value fill strategies for each column.
 
-        This method
+        This abstract method specifies default values to fill null entries in each
+        column. This is applied early in the cleaning pipeline after column renaming.
 
         Returns:
-            dict[str, Any]: Dictionary mapping column names to their fill
+            dict[str, Any]: Dictionary mapping column names to their fill values.
+                The fill value can be any type appropriate for the column.
+
+        Example:
+            return {
+                "email": "",
+                "phone": "",
+                "score": 0,
+                "status": "unknown"
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
-        """
+        """Define the sort order for the cleaned dataframe.
 
-        This method
+        This abstract method specifies which columns to sort by and in what order
+        (ascending or descending). Sorting is applied near the end of the cleaning
+        pipeline, after all data transformations are complete.
 
         Returns:
-            tuple[tuple[str, bool], ...]: Tuple of
-
+            tuple[tuple[str, bool], ...]: Tuple of (column_name, is_descending) tuples.
+                Each tuple specifies a column and sort direction. Columns are sorted
+                in the order they appear. True = descending, False = ascending.
+
+        Example:
+            return (
+                ("created_at", True), # Sort by created_at descending
+                ("user_id", False), # Then by user_id ascending
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
-        """
+        """Define column subsets for duplicate detection and removal.
 
-        This method
-
-
-
-                ("col3", "col4"), # subset 2
-            )
+        This abstract method specifies which column combinations define uniqueness.
+        Rows are considered duplicates if they have identical values in all columns
+        of a subset. When duplicates are found, values in columns specified by
+        get_add_on_duplicate_cols() are summed, and the first row is kept.
 
         Returns:
-            tuple[tuple[
+            tuple[tuple[str, ...], ...]: Tuple of column name tuples, where each inner
+                tuple represents one uniqueness constraint. Duplicates are detected
+                and handled for each subset independently.
+
+        Example:
+            return (
+                ("user_id", "date"), # Subset 1: unique by user_id and date
+                ("transaction_id",), # Subset 2: unique by transaction_id
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_no_null_cols(cls) -> tuple[str, ...]:
-        """
+        """Define columns that must not contain null values.
 
-        This method
+        This abstract method specifies which columns are required to have non-null
+        values. A ValueError is raised during the final validation step if any of
+        these columns contain null values.
 
         Returns:
-            tuple[str, ...]: Tuple of column names
+            tuple[str, ...]: Tuple of column names that must not be null.
+
+        Example:
+            return ("user_id", "email", "created_at")
         """
 
     @classmethod
@@ -131,49 +214,110 @@ class CleaningDF(ABCLoggingMixin):
     def get_col_converter_map(
         cls,
     ) -> dict[str, Callable[[pl.Series], pl.Series]]:
-        """
+        """Define custom conversion functions for columns.
 
-        This method
-
-
+        This abstract method specifies custom transformations to apply to columns
+        after standard conversions (string stripping, float rounding). Each function
+        receives a Polars Series and returns a transformed Series. Use
+        skip_col_converter as a placeholder for columns that don't need custom
+        conversion.
 
         Returns:
             dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping column names
-                to their conversion function
+                to their conversion functions. Each function takes a Series and returns
+                a transformed Series.
+
+        Example:
+            return {
+                "email": lambda s: s.str.to_lowercase(),
+                "phone": self.parse_phone_number,
+                "created_at": self.skip_col_converter, # No custom conversion
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
-        """
+        """Define columns to aggregate when duplicate rows are found.
 
-        This method
-
+        This abstract method specifies which columns should have their values summed
+        when duplicate rows are detected (based on get_unique_subsets). The summed
+        values are kept in the first row, and duplicate rows are removed.
 
         Returns:
-            tuple[str, ...]: Tuple of column names
+            tuple[str, ...]: Tuple of column names whose values should be summed
+                when duplicates are found.
+
+        Example:
+            return ("quantity", "revenue", "impressions")
         """
 
     @classmethod
     @abstractmethod
     def get_col_precision_map(cls) -> dict[str, int]:
-        """
+        """Define rounding precision for float columns.
 
-        This method
+        This abstract method specifies the number of decimal places to round float
+        columns to. Rounding is applied during the standard conversion phase and uses
+        Kahan summation to compensate for floating-point rounding errors.
 
         Returns:
-            dict[str, int]: Dictionary mapping column names to their precision
+            dict[str, int]: Dictionary mapping float column names to their precision
+                (number of decimal places).
+
+        Example:
+            return {
+                "price": 2,
+                "percentage": 4,
+                "score": 1,
+            }
+        """
+
+    def __init__(self, data: dict[str, list[Any]], **kwargs: Any) -> None:
+        """Initialize the CleaningDF and execute the cleaning pipeline.
+
+        Creates a Polars DataFrame with NaN values automatically converted to null,
+        then immediately executes the full cleaning pipeline.
+        nan_to_null is set to True to always
+        schema is set to the dtype map to always have the correct dtypes
+
+        Args:
+            data: Dictionary mapping column names to lists of values
+            **kwargs: Additional keyword arguments passed to pl.DataFrame constructor
         """
+        self.rename_cols(data)
+        self.drop_cols(data)
+        kwargs["nan_to_null"] = True
+        kwargs["schema"] = self.get_col_dtype_map()
+        self.df = pl.DataFrame(data=data, **kwargs)
+        self.clean()
 
     @classmethod
     def get_col_names(cls) -> tuple[str, ...]:
-        """Get the column names
+        """Get the standardized column names from the dtype map.
+
+        Returns the column names in the order they appear in get_col_dtype_map().
+
+        Returns:
+            tuple[str, ...]: Tuple of standardized column names.
+        """
         return tuple(cls.get_col_dtype_map().keys())
 
     def clean(self) -> None:
-        """
-
-
+        """Execute the complete data cleaning pipeline.
+
+        Applies all cleaning operations in the following order:
+        1. Rename columns to standardized names
+        2. Drop columns not in the schema
+        3. Fill null values with defaults
+        4. Convert columns to correct types and apply transformations
+        5. Drop rows with all-null column subsets
+        6. Handle duplicates by aggregating and removing
+        7. Sort the dataframe
+        8. Validate data quality
+
+        This method is automatically called during __init__.
+        """
         self.fill_nulls()
         self.convert_cols()
         self.drop_null_subsets()
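To make the configuration surface above concrete, here is a minimal, hypothetical subclass sketched from the docstrings in this diff. The class name, column constants, and sample data are invented for illustration and are not part of the package; only the method names, signatures, and the new `__init__(data, **kwargs)` entry point come from the code above.

```python
# Illustrative sketch only: a minimal CleaningDF subclass assembled from the
# docstrings shown in the diff above. Column names and sample data are hypothetical.
from collections.abc import Callable
from typing import Any

import polars as pl

from winipedia_utils.data.dataframe.cleaning import CleaningDF


class SalesCleaningDF(CleaningDF):
    """Hypothetical cleaner for a small sales feed."""

    USER_ID = "user_id"
    DATE = "date"
    QUANTITY = "quantity"

    @classmethod
    def get_rename_map(cls) -> dict[str, str]:
        # standardized name -> raw input name, per the class convention
        return {cls.USER_ID: "UserId", cls.DATE: "Date", cls.QUANTITY: "Qty"}

    @classmethod
    def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
        return {cls.USER_ID: pl.Int64, cls.DATE: pl.Utf8, cls.QUANTITY: pl.Float64}

    @classmethod
    def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
        return ((cls.USER_ID,),)  # drop rows where user_id is null

    @classmethod
    def get_fill_null_map(cls) -> dict[str, Any]:
        return {cls.USER_ID: 0, cls.DATE: "", cls.QUANTITY: 0.0}

    @classmethod
    def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
        return ((cls.DATE, True), (cls.USER_ID, False))

    @classmethod
    def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
        return ((cls.USER_ID, cls.DATE),)

    @classmethod
    def get_no_null_cols(cls) -> tuple[str, ...]:
        return (cls.USER_ID, cls.DATE)

    @classmethod
    def get_col_converter_map(cls) -> dict[str, Callable[[pl.Series], pl.Series]]:
        return {
            cls.USER_ID: cls.skip_col_converter,
            cls.DATE: cls.strip_col,
            cls.QUANTITY: cls.skip_col_converter,
        }

    @classmethod
    def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
        return (cls.QUANTITY,)

    @classmethod
    def get_col_precision_map(cls) -> dict[str, int]:
        return {cls.QUANTITY: 2}


# The new __init__ takes the raw data as a dict of column lists; cleaning runs immediately.
cleaned = SalesCleaningDF(
    data={
        "UserId": [1, 1, 2],
        "Date": [" 2024-01-01", "2024-01-01", "2024-01-02"],
        "Qty": [1.0, 2.0, 5.0],
    }
)
print(cleaned.df)
```

Instantiating the subclass runs the whole pipeline, so `cleaned.df` should already hold renamed, typed, deduplicated, and sorted data under the assumptions above.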
@@ -187,7 +331,18 @@ class CleaningDF(ABCLoggingMixin):
         map_func: Callable[..., dict[str, Any]],
         col_names: tuple[str, ...] | None = None,
     ) -> None:
-        """
+        """Validate that all required columns are present in a configuration map.
+
+        Checks that the columns returned by map_func contain all columns in col_names.
+        Raises KeyError if any required columns are missing from the map.
+
+        Args:
+            map_func: A callable that returns a dict with column names as keys
+            col_names: Tuple of column names to check. If None, uses get_col_names()
+
+        Raises:
+            KeyError: If any required columns are missing from the map
+        """
         if col_names is None:
             col_names = cls.get_col_names()
         missing_cols = set(col_names) - set(map_func().keys())
@@ -195,17 +350,32 @@ class CleaningDF(ABCLoggingMixin):
         msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
         raise KeyError(msg)
 
-    def rename_cols(self) -> None:
-        """Rename
+    def rename_cols(self, data: dict[str, list[Any]]) -> None:
+        """Rename columns from raw names to standardized names.
+
+        Applies the reverse of get_rename_map() to rename columns from their raw
+        input names to standardized names. Validates that all required columns are
+        present in the rename map.
+        """
         self.raise_on_missing_cols(self.get_rename_map)
-
+        for std_name, raw_name in self.get_rename_map().items():
+            data[std_name] = data.pop(raw_name)
+
+    def drop_cols(self, data: dict[str, list[Any]]) -> None:
+        """Drop columns not in the schema.
 
-
-
-
+        Selects only the columns defined in get_col_names(), removing any extra
+        columns that may have been in the input data.
+        """
+        for col in set(data.keys()) - set(self.get_col_names()):
+            del data[col]
 
     def fill_nulls(self) -> None:
-        """Fill null values with
+        """Fill null values with defaults from the fill null map.
+
+        Replaces null values in each column with the corresponding fill value
+        from get_fill_null_map(). Validates that all columns are present in the map.
+        """
         self.raise_on_missing_cols(self.get_fill_null_map)
         self.df = self.df.with_columns(
             [
@@ -215,15 +385,22 @@ class CleaningDF(ABCLoggingMixin):
         )
 
     def convert_cols(self) -> None:
-        """Apply
+        """Apply standard and custom column conversions.
+
+        Orchestrates both standard conversions (string stripping, float rounding)
+        and custom conversions defined in get_col_converter_map(). Validates that
+        all columns are present in the converter map.
+        """
         self.raise_on_missing_cols(self.get_col_converter_map)
         self.standard_convert_cols()
         self.custom_convert_cols()
 
     def standard_convert_cols(self) -> None:
-        """
+        """Apply standard conversions based on data type.
 
-
+        Automatically applies standard transformations:
+        - Utf8 columns: strip leading/trailing whitespace
+        - Float64 columns: round to specified precision using Kahan summation
         """
         for col_name, dtype in self.get_col_dtype_map().items():
             if dtype == pl.Utf8:
@@ -237,7 +414,11 @@ class CleaningDF(ABCLoggingMixin):
             )
 
     def custom_convert_cols(self) -> None:
-        """Apply
+        """Apply custom conversion functions to columns.
+
+        Applies custom transformations from get_col_converter_map() to each column,
+        skipping columns marked with skip_col_converter.
+        """
         self.df = self.df.with_columns(
             [
                 pl.col(col_name).map_batches(
@@ -250,12 +431,26 @@ class CleaningDF(ABCLoggingMixin):
 
     @classmethod
     def strip_col(cls, col: pl.Series) -> pl.Series:
-        """
+        """Remove leading and trailing whitespace from string column.
+
+        Args:
+            col: Polars Series of string type
+
+        Returns:
+            pl.Series: Series with whitespace stripped
+        """
         return col.str.strip_chars()
 
     @classmethod
     def lower_col(cls, col: pl.Series) -> pl.Series:
-        """Convert
+        """Convert string column to lowercase.
+
+        Args:
+            col: Polars Series of string type
+
+        Returns:
+            pl.Series: Series with all characters converted to lowercase
+        """
         return col.str.to_lowercase()
 
     @classmethod
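The `custom_convert_cols` method above applies each configured converter through `pl.col(...).map_batches(...)`. A tiny standalone sketch of that pattern, with a hypothetical column name, shows how a Series-to-Series converter plugs in:

```python
# Minimal sketch of the map_batches pattern used by custom_convert_cols above.
# The column name and data are hypothetical examples, not package code.
import polars as pl


def lower_col(col: pl.Series) -> pl.Series:
    """Series -> Series converter, the same shape as entries in get_col_converter_map()."""
    return col.str.to_lowercase()


df = pl.DataFrame({"email": ["Alice@Example.COM", "Bob@Example.com"]})
df = df.with_columns(
    pl.col("email").map_batches(lower_col)  # apply the converter to the whole column
)
print(df)
```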
@@ -266,9 +461,19 @@ class CleaningDF(ABCLoggingMixin):
         *,
         compensate: bool = True,
     ) -> pl.Series:
-        """Round
+        """Round float column to specified precision.
 
-
+        Uses Kahan summation algorithm to compensate for floating-point rounding
+        errors when compensate=True, ensuring that the sum of rounded values
+        matches the rounded sum of original values.
+
+        Args:
+            col: Polars Series of float type
+            precision: Number of decimal places. If None, uses get_col_precision_map()
+            compensate: If True, use Kahan summation to reduce rounding errors
+
+        Returns:
+            pl.Series: Series with values rounded to specified precision
         """
         if precision is None:
             precision = cls.get_col_precision_map()[str(col.name)]
@@ -288,9 +493,14 @@ class CleaningDF(ABCLoggingMixin):
 
     @classmethod
     def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
-        """
+        """Placeholder to skip custom conversion for a column.
+
+        Use this method in get_col_converter_map() to indicate that a column
+        should not have custom conversion applied. This method should never be
+        actually called - it's only used as a marker.
 
-
+        Raises:
+            NotImplementedError: Always raised if this method is called
         """
         msg = (
             "skip_col_converter is just a flag to skip conversion for a column "
@@ -299,9 +509,10 @@ class CleaningDF(ABCLoggingMixin):
         raise NotImplementedError(msg)
 
     def drop_null_subsets(self) -> None:
-        """Drop rows where
+        """Drop rows where all columns in a subset are null.
 
-
+        Applies null-dropping rules defined in get_drop_null_subsets(). If no
+        subsets are defined, drops rows where all columns are null.
         """
         subsets = self.get_drop_null_subsets()
         if not subsets:
@@ -311,12 +522,14 @@ class CleaningDF(ABCLoggingMixin):
         self.df = self.df.drop_nulls(subset=subset)
 
     def handle_duplicates(self) -> None:
-        """
+        """Remove duplicate rows and aggregate specified columns.
 
-
-
-
-
+        For each uniqueness subset defined in get_unique_subsets():
+        1. Sum values in columns specified by get_add_on_duplicate_cols()
+        2. Keep only the first row of each duplicate group
+
+        Example: If two rows have the same (user_id, date) and values 1 and 2
+        in the 'quantity' column, the result will have one row with quantity=3.
         """
         for subset in self.get_unique_subsets():
             for col in self.get_add_on_duplicate_cols():
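The `round_col` docstring above promises that, with `compensate=True`, the sum of the rounded values matches the rounded sum of the originals. One way to get that property is error-feedback rounding, where the running rounding error is carried into the next value; the sketch below only illustrates the idea and is not the package's actual implementation.

```python
# Hedged illustration of error-compensated rounding, NOT the exact algorithm in
# round_col: carry the running rounding error into the next value so the column
# total is preserved at the chosen precision.
import polars as pl


def round_with_compensation(col: pl.Series, precision: int) -> pl.Series:
    rounded: list[float] = []
    carry = 0.0  # accumulated rounding error, fed back into the next element
    for value in col:
        adjusted = value + carry
        r = round(adjusted, precision)
        carry = adjusted - r
        rounded.append(r)
    return pl.Series(col.name, rounded)


col = pl.Series("price", [0.005, 0.005, 0.005, 0.005])
print(round_with_compensation(col, 2).sum())  # ~0.02
print(round(col.sum(), 2))                    # 0.02: totals agree at the chosen precision
```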
@@ -324,24 +537,40 @@ class CleaningDF(ABCLoggingMixin):
             self.df = self.df.unique(subset=subset, keep="first")
 
     def sort_cols(self) -> None:
-        """Sort the dataframe by
+        """Sort the dataframe by columns and directions from get_sort_cols().
+
+        Applies multi-column sorting with per-column sort direction
+        (ascending/descending).
+        """
         cols, desc = zip(*self.get_sort_cols(), strict=True)
         if not cols:
             return
         self.df = self.df.sort(cols, descending=desc)
 
     def check(self) -> None:
-        """
+        """Validate data quality after cleaning.
+
+        Runs all validation checks:
+        - Correct data types for all columns
+        - No null values in required columns
+        - No NaN values in float columns
 
-
-
+        Called automatically at the end of the clean() pipeline.
+
+        Raises:
+            TypeError: If any column has incorrect data type
+            ValueError: If required columns contain nulls or float columns contain NaN
         """
         self.check_correct_dtypes()
         self.check_no_null_cols()
         self.check_no_nan()
 
     def check_correct_dtypes(self) -> None:
-        """
+        """Validate that all columns have their expected data types.
+
+        Raises:
+            TypeError: If any column's actual type doesn't match expected type
+        """
         schema = self.df.schema
         col_dtype_map = self.get_col_dtype_map()
         for col, dtype in col_dtype_map.items():
@@ -351,7 +580,11 @@ class CleaningDF(ABCLoggingMixin):
             raise TypeError(msg)
 
     def check_no_null_cols(self) -> None:
-        """
+        """Validate that required columns contain no null values.
+
+        Raises:
+            ValueError: If any column in get_no_null_cols() contains null values
+        """
         no_null_cols = self.get_no_null_cols()
         # Use a single select to check all columns at once
         null_flags = self.df.select(
@@ -364,7 +597,11 @@ class CleaningDF(ABCLoggingMixin):
             raise ValueError(msg)
 
     def check_no_nan(self) -> None:
-        """
+        """Validate that float columns contain no NaN values.
+
+        Raises:
+            ValueError: If any float column contains NaN values
+        """
         float_cols = [
             col
             for col, dtype in self.get_col_dtype_map().items()
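The duplicate handling described in `handle_duplicates` above can be reproduced in plain Polars: sum the add-on columns over each uniqueness subset, then keep one row per subset. The column names below are hypothetical, and only the `unique(subset=..., keep="first")` call is taken directly from the diff; the `sum().over(...)` step is an assumption about how the aggregation behaves.

```python
# Hedged sketch of the duplicate handling described above, in plain Polars.
import polars as pl

df = pl.DataFrame(
    {
        "user_id": [1, 1, 2],
        "date": ["2024-01-01", "2024-01-01", "2024-01-02"],
        "quantity": [1, 2, 5],
    }
)

subset = ["user_id", "date"]
# Sum the "add on duplicate" column within each uniqueness subset: 1 and 2 become 3.
df = df.with_columns(pl.col("quantity").sum().over(subset))
# Keep a single row per (user_id, date), as handle_duplicates does.
df = df.unique(subset=subset, keep="first")
print(df.sort("user_id"))
```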
--- a/winipedia_utils/git/github/repo/protect.py
+++ b/winipedia_utils/git/github/repo/protect.py
@@ -65,8 +65,7 @@ def get_default_ruleset_params() -> dict[str, Any]:
             "require_code_owner_review": True,
             "require_last_push_approval": True,
             "required_review_thread_resolution": True,
-            "
-            "allowed_merge_methods": ["merge", "squash", "rebase"],
+            "allowed_merge_methods": ["squash", "rebase"],
         },
         required_linear_history={},
         required_signatures={},
--- a/winipedia_utils-0.4.23.dist-info/METADATA
+++ b/winipedia_utils-0.4.26.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: winipedia-utils
-Version: 0.4.23
+Version: 0.4.26
 Summary: A package with many utility functions
 License-Expression: MIT
 License-File: LICENSE
@@ -108,7 +108,7 @@ The setup creates the following configuration files:
 - `.github/workflows/publish.yaml` - Publishing workflow (Publishes to PyPI when a release is created by the release workflow, if you use this workflow, you need to add a PYPI_TOKEN (named PYPI_TOKEN) to your GitHub secrets that has write access to the package on PyPI.)
 - `py.typed` - PEP 561 marker for type hints
 - `experiment.py` - For experimentation (ignored by git)
-- `
+- `test_zero.py` - Test file with one empyt test (so that initial tests pass)
 - `conftest.py` - Pytest configuration file
 
 ### Pre-commit Hook Workflow
@@ -160,6 +160,45 @@ Configuration files are managed automatically by the setup system:
 - **Custom additions** - You can add custom configurations as long as the standard configurations remain intact
 - **Modified standards** - If you modify the standard configurations, they will be restored on the next setup run
 
+## Branch Protection
+
+As soon as you push to `main` on GitHub (provided the `REPO_TOKEN` secret is set up correctly), the `health_check.yaml` workflow will run and execute `winipedia_utils.git.github.repo.protect`, which uses PyGithub to protect the repository.
+
+### Repository Settings
+
+The following repository settings are configured:
+
+- **name** - Repository name from `pyproject.toml` or folder name (should match repo name)
+- **description** - Repository description from `pyproject.toml`
+- **default_branch** - `main`
+- **delete_branch_on_merge** - `true`
+- **allow_update_branch** - `true`
+- **allow_merge_commit** - `false`
+- **allow_rebase_merge** - `true`
+- **allow_squash_merge** - `true`
+
+### Branch Protection Rules
+
+A ruleset named `main protection` is created for the `main` branch with the following rules:
+
+- **Deletion** - Prevents branch deletion
+- **Non-fast-forward** - Prevents non-fast-forward pushes (forces linear history by rejecting force pushes)
+- **Creation** - Prevents branch creation directly on the protected branch
+- **Update** - Prevents direct updates to protected branch (all changes must go through pull requests)
+- **Required Linear History** - Enforces linear commit history (no merge commits allowed)
+- **Required Signatures** - Requires all commits to be signed with GPG or SSH keys
+- **Pull Request Requirements:**
+  - Requires 1 approving review (at least one person must approve before merge)
+  - Dismisses stale reviews on push (if you push a new commit, all reviews are dismissed and must be re-approved)
+  - Requires code owner review (designated code owners must approve changes to their files)
+  - Requires last push approval (the most recent push must be approved, not an earlier one)
+  - Requires review thread resolution (all comments in reviews must be resolved before merge)
+  - Allowed merge methods: `squash` and `rebase` (no merge commits, keeps history clean)
+- **Required Status Checks:**
+  - Strict mode enabled (all status checks must pass on the latest commit, not older ones (sets the health check as required status check))
+  - Health check workflow must pass (the CI/CD pipeline must complete successfully)
+- **Bypass Actors** - Repository admins can bypass all rules (for emergency situations)
+
 ## Utilities
 
 Winipedia Utils provides comprehensive utility modules for common development tasks:
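The repository settings listed in the new README section above map onto fields that PyGithub can set through `Repository.edit`. The sketch below is an illustration only: the token and repository name are placeholders, it is not the package's `protect.py` code, and the `main protection` ruleset itself (deletion, non-fast-forward, signatures, pull request and status check rules) is created separately through GitHub's rulesets API.

```python
# Hedged sketch of applying the repository settings listed above with PyGithub.
# Placeholders: <REPO_TOKEN> and "owner/winipedia-utils" are hypothetical.
from github import Auth, Github

gh = Github(auth=Auth.Token("<REPO_TOKEN>"))  # token with admin access to the repo
repo = gh.get_repo("owner/winipedia-utils")   # hypothetical owner/name

repo.edit(
    description="A package with many utility functions",
    default_branch="main",
    delete_branch_on_merge=True,
    allow_update_branch=True,  # assumes a recent PyGithub release that supports this field
    allow_merge_commit=False,
    allow_rebase_merge=True,
    allow_squash_merge=True,
)
```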
--- a/winipedia_utils-0.4.23.dist-info/RECORD
+++ b/winipedia_utils-0.4.26.dist-info/RECORD
@@ -5,13 +5,13 @@ winipedia_utils/concurrent/multiprocessing.py,sha256=1pnAU-CS3crNOKlp68gCCvNbTvN
 winipedia_utils/concurrent/multithreading.py,sha256=L33gLy1PR51mOsQY6TuXbbahbU8eDPAQijzbmn_pSRc,3080
 winipedia_utils/data/__init__.py,sha256=o6SXX1gBCwhb9abo8xTfZtqUNuLqXmdo2VervnJxRzc,48
 winipedia_utils/data/dataframe/__init__.py,sha256=XHsbmjiaGom-KX-S3leCY9cJD3aP9p_0X6xYMcdkHBU,23
-winipedia_utils/data/dataframe/cleaning.py,sha256=
+winipedia_utils/data/dataframe/cleaning.py,sha256=HMsr5cpPRGXGvd8d6CVzdEDhcSy-9mH36eVArzYd-2g,23432
 winipedia_utils/data/structures/__init__.py,sha256=XHsbmjiaGom-KX-S3leCY9cJD3aP9p_0X6xYMcdkHBU,23
 winipedia_utils/data/structures/dicts.py,sha256=jsFzQ96cvyHsvPSnsEUhksuWvLSGq6-Rryfw6gEXq-c,274
 winipedia_utils/git/__init__.py,sha256=IRmEVz0sUEw47Eli--57YaypWitxlcYThT_ulwkhNTE,47
 winipedia_utils/git/github/__init__.py,sha256=XHsbmjiaGom-KX-S3leCY9cJD3aP9p_0X6xYMcdkHBU,23
 winipedia_utils/git/github/repo/__init__.py,sha256=XHsbmjiaGom-KX-S3leCY9cJD3aP9p_0X6xYMcdkHBU,23
-winipedia_utils/git/github/repo/protect.py,sha256=
+winipedia_utils/git/github/repo/protect.py,sha256=aQxTmCsvh04aHgdGELQ1arB3RxCLmrJmbkQtsgunaE4,3169
 winipedia_utils/git/github/repo/repo.py,sha256=OqoOfqDhe_Iik71dNqi4h3fGrMno33hSjk0bpNg3eZk,7865
 winipedia_utils/git/github/workflows/__init__.py,sha256=BPdntTwFEyBMJ6MyT7gddPHswvRdH9tsRtfK72VSV7Y,57
 winipedia_utils/git/github/workflows/base/__init__.py,sha256=XHsbmjiaGom-KX-S3leCY9cJD3aP9p_0X6xYMcdkHBU,23
@@ -87,7 +87,7 @@ winipedia_utils/testing/tests/conftest.py,sha256=BLgUJtLecOwuEsIyJ__0buqovd5AhiG
 winipedia_utils/text/__init__.py,sha256=j2bwtK6kyeHI6SnoBjpRju0C1W2n2paXBDlNjNtaUxA,48
 winipedia_utils/text/config.py,sha256=lt5QIXS2cY_aS0emZztcG6nRYwiqFKe0lGEq6msMF5E,7434
 winipedia_utils/text/string.py,sha256=yXmwOab5hXyVQG1NwlWDpy2prj0U7Vb2F5HKLT2Y77Q,3382
-winipedia_utils-0.4.
-winipedia_utils-0.4.
-winipedia_utils-0.4.
-winipedia_utils-0.4.
+winipedia_utils-0.4.26.dist-info/METADATA,sha256=xCRmQRjy7CT51dpLPX9sDaY1KSI1cnVtHBHFt_fPrqY,13344
+winipedia_utils-0.4.26.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+winipedia_utils-0.4.26.dist-info/licenses/LICENSE,sha256=o316mE2gGzd__JT69p7S_zlOmKiHh8YjpImCCcWyTvM,1066
+winipedia_utils-0.4.26.dist-info/RECORD,,

{winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/WHEEL: File without changes
{winipedia_utils-0.4.23.dist-info → winipedia_utils-0.4.26.dist-info}/licenses/LICENSE: File without changes