winiutils-2.3.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- winiutils/__init__.py +1 -0
- winiutils/dev/__init__.py +1 -0
- winiutils/dev/builders/__init__.py +1 -0
- winiutils/dev/cli/__init__.py +1 -0
- winiutils/dev/cli/subcommands.py +6 -0
- winiutils/dev/configs/__init__.py +1 -0
- winiutils/dev/tests/__init__.py +1 -0
- winiutils/dev/tests/fixtures/__init__.py +1 -0
- winiutils/dev/tests/fixtures/fixtures.py +32 -0
- winiutils/main.py +9 -0
- winiutils/py.typed +0 -0
- winiutils/resources/__init__.py +1 -0
- winiutils/src/__init__.py +4 -0
- winiutils/src/data/__init__.py +8 -0
- winiutils/src/data/dataframe/__init__.py +7 -0
- winiutils/src/data/dataframe/cleaning.py +734 -0
- winiutils/src/data/structures/__init__.py +8 -0
- winiutils/src/data/structures/dicts.py +40 -0
- winiutils/src/data/structures/text/__init__.py +7 -0
- winiutils/src/data/structures/text/string.py +157 -0
- winiutils/src/iterating/__init__.py +8 -0
- winiutils/src/iterating/concurrent/__init__.py +9 -0
- winiutils/src/iterating/concurrent/concurrent.py +301 -0
- winiutils/src/iterating/concurrent/multiprocessing.py +186 -0
- winiutils/src/iterating/concurrent/multithreading.py +132 -0
- winiutils/src/iterating/iterate.py +45 -0
- winiutils/src/oop/__init__.py +7 -0
- winiutils/src/oop/mixins/__init__.py +8 -0
- winiutils/src/oop/mixins/meta.py +217 -0
- winiutils/src/oop/mixins/mixin.py +58 -0
- winiutils/src/security/__init__.py +8 -0
- winiutils/src/security/cryptography.py +100 -0
- winiutils/src/security/keyring.py +167 -0
- winiutils-2.3.12.dist-info/METADATA +283 -0
- winiutils-2.3.12.dist-info/RECORD +38 -0
- winiutils-2.3.12.dist-info/WHEEL +4 -0
- winiutils-2.3.12.dist-info/entry_points.txt +4 -0
- winiutils-2.3.12.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,734 @@
"""DataFrame cleaning pipeline utilities using Polars.

This module provides an abstract base class for building extensible DataFrame
cleaning pipelines. The ``CleaningDF`` class implements an 8-step cleaning
pipeline that can be customized by implementing abstract methods in child classes.

The cleaning pipeline executes the following operations in order:
1. Rename columns to standardized names
2. Drop columns not in the schema
3. Fill null values with specified defaults
4. Convert columns to correct data types
5. Drop rows where specified column subsets are entirely null
6. Handle duplicates by aggregating and removing
7. Sort the DataFrame by specified columns
8. Validate data quality (types, nulls, NaN values)

Example:
    >>> import polars as pl
    >>> from winiutils.src.data.dataframe.cleaning import CleaningDF
    >>>
    >>> class UserCleaner(CleaningDF):
    ...     USER_ID = "user_id"
    ...     EMAIL = "email"
    ...
    ...     @classmethod
    ...     def get_rename_map(cls):
    ...         return {cls.USER_ID: "UserId", cls.EMAIL: "Email"}
    ...
    ...     # ... implement other abstract methods
"""

from abc import abstractmethod
from collections.abc import Callable
from typing import Any

import polars as pl
from polars.datatypes.classes import FloatType

from winiutils.src.data.structures.dicts import reverse_dict
from winiutils.src.oop.mixins.mixin import ABCLoggingMixin


class CleaningDF(ABCLoggingMixin):
    """Abstract base class for cleaning and standardizing DataFrames using Polars.

    This class provides a comprehensive pipeline for importing, cleaning, and
    standardizing data from various sources before loading into databases or
    other systems. It enforces data quality standards through a series of
    configurable cleaning operations.

    The cleaning pipeline executes in the following order:
    1. Rename columns according to a standardized naming scheme
    2. Drop columns not in the schema
    3. Fill null values with specified defaults
    4. Convert columns to correct data types and apply custom transformations
    5. Drop rows where specified column subsets are entirely null
    6. Handle duplicates by aggregating values and removing duplicates
    7. Sort the DataFrame by specified columns
    8. Validate data quality (correct dtypes, no nulls in required columns,
       no NaN values)

    Child classes must implement abstract methods to define the cleaning
    configuration:
    - ``get_rename_map()``: Define column name mappings
    - ``get_col_dtype_map()``: Define expected data types for each column
    - ``get_drop_null_subsets()``: Define which column subsets trigger row
      deletion
    - ``get_fill_null_map()``: Define null value fill strategies
    - ``get_sort_cols()``: Define sort order
    - ``get_unique_subsets()``: Define duplicate detection criteria
    - ``get_no_null_cols()``: Define columns that cannot contain nulls
    - ``get_col_converter_map()``: Define custom column transformations
    - ``get_add_on_duplicate_cols()``: Define columns to aggregate when
      duplicates are found
    - ``get_col_precision_map()``: Define rounding precision for float columns

    Attributes:
        df: The cleaned Polars DataFrame after the pipeline has executed.

    Note:
        - Define column names as class-level string constants for reusability
        - NaN values are automatically converted to null for consistency
        - The class inherits automatic method logging from ``ABCLoggingMixin``

    Example:
        >>> class UserCleaner(CleaningDF):
        ...     USER_ID = "user_id"
        ...     EMAIL = "email"
        ...     SCORE = "score"
        ...
        ...     @classmethod
        ...     def get_col_dtype_map(cls):
        ...         return {cls.USER_ID: pl.Int64, cls.EMAIL: pl.Utf8}
    """

    @classmethod
    @abstractmethod
    def get_rename_map(cls) -> dict[str, str]:
        """Define column name mappings for standardization.

        This abstract method must be implemented in child classes to specify how
        raw input column names should be renamed to standardized names. Renaming
        is the first operation in the cleaning pipeline, executed before all other
        cleaning operations.

        The mapping format follows the CleaningDF convention of mapping
        standardized names to raw input names. The reverse mapping is applied
        to the DataFrame during cleaning.

        Returns:
            Dictionary mapping standardized column names (keys) to raw input
            column names (values).

        Example:
            >>> @classmethod
            ... def get_rename_map(cls):
            ...     return {
            ...         "user_id": "UserId",
            ...         "email": "Email_Address",
            ...         "created_at": "CreatedDate",
            ...     }
        """

    @classmethod
    @abstractmethod
    def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
        """Define the expected data type for each column in the cleaned DataFrame.

        This abstract method must be implemented in child classes to specify the
        target data types for all columns. The DataFrame will be validated against
        this schema after cleaning, and a TypeError will be raised if any column
        has an incorrect type.

        Returns:
            Dictionary mapping standardized column names to their expected
            Polars data types.

        Example:
            >>> @classmethod
            ... def get_col_dtype_map(cls):
            ...     return {
            ...         "user_id": pl.Int64,
            ...         "email": pl.Utf8,
            ...         "created_at": pl.Date,
            ...         "score": pl.Float64,
            ...     }
        """

    @classmethod
    @abstractmethod
    def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define column subsets for dropping rows with all-null values.

        This abstract method specifies which column subsets should trigger row
        deletion. A row is dropped if ALL columns in a subset are null. Multiple
        subsets can be defined to apply different null-dropping rules. If no
        subsets are defined, rows where all columns are null will be dropped.

        Returns:
            Tuple of column name tuples, where each inner tuple represents one
            subset. A row is dropped if all columns in any subset are null.

        Example:
            >>> @classmethod
            ... def get_drop_null_subsets(cls):
            ...     return (
            ...         ("email", "phone"),  # Drop if both are null
            ...         ("address_line1",),  # Drop if null
            ...     )
        """

    @classmethod
    @abstractmethod
    def get_fill_null_map(cls) -> dict[str, Any]:
        """Define null value fill strategies for each column.

        This abstract method specifies default values to fill null entries in
        each column. This is applied early in the cleaning pipeline after
        column renaming.

        Returns:
            Dictionary mapping column names to their fill values. The fill
            value can be any type appropriate for the column's data type.

        Example:
            >>> @classmethod
            ... def get_fill_null_map(cls):
            ...     return {
            ...         "email": "",
            ...         "phone": "",
            ...         "score": 0,
            ...         "status": "unknown",
            ...     }
        """

    @classmethod
    @abstractmethod
    def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
        """Define the sort order for the cleaned DataFrame.

        This abstract method specifies which columns to sort by and in what
        order (ascending or descending). Sorting is applied near the end of
        the cleaning pipeline, after all data transformations are complete.

        Returns:
            Tuple of (column_name, is_descending) tuples. Each tuple specifies
            a column and sort direction. Columns are sorted in the order they
            appear. True = descending, False = ascending.

        Example:
            >>> @classmethod
            ... def get_sort_cols(cls):
            ...     return (
            ...         ("created_at", True),  # Descending
            ...         ("user_id", False),  # Ascending
            ...     )
        """

    @classmethod
    @abstractmethod
    def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define column subsets for duplicate detection and removal.

        This abstract method specifies which column combinations define
        uniqueness. Rows are considered duplicates if they have identical
        values in all columns of a subset. When duplicates are found, values
        in columns specified by ``get_add_on_duplicate_cols()`` are summed,
        and the first row is kept.

        Returns:
            Tuple of column name tuples, where each inner tuple represents
            one uniqueness constraint. Duplicates are detected and handled
            for each subset independently.

        Example:
            >>> @classmethod
            ... def get_unique_subsets(cls):
            ...     return (
            ...         ("user_id", "date"),  # Unique by user_id and date
            ...         ("transaction_id",),  # Unique by transaction_id
            ...     )
        """

    @classmethod
    @abstractmethod
    def get_no_null_cols(cls) -> tuple[str, ...]:
        """Define columns that must not contain null values.

        This abstract method specifies which columns are required to have
        non-null values. A ValueError is raised during the final validation
        step if any of these columns contain null values.

        Returns:
            Tuple of column names that must not contain null values.

        Example:
            >>> @classmethod
            ... def get_no_null_cols(cls):
            ...     return ("user_id", "email", "created_at")
        """

    @classmethod
    @abstractmethod
    def get_col_converter_map(
        cls,
    ) -> dict[str, Callable[[pl.Series], pl.Series]]:
        """Define custom conversion functions for columns.

        This abstract method specifies custom transformations to apply to
        columns after standard conversions (string stripping, float rounding).
        Each function receives a Polars Series and returns a transformed
        Series. Use ``skip_col_converter`` as a placeholder for columns that
        don't need custom conversion.

        Returns:
            Dictionary mapping column names to their conversion functions.
            Each function takes a Series and returns a transformed Series.

        Example:
            >>> @classmethod
            ... def get_col_converter_map(cls):
            ...     return {
            ...         "email": lambda s: s.str.to_lowercase(),
            ...         "phone": cls.parse_phone_number,
            ...         "created_at": cls.skip_col_converter,
            ...     }
        """

    @classmethod
    @abstractmethod
    def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
        """Define columns to aggregate when duplicate rows are found.

        This abstract method specifies which columns should have their values
        summed when duplicate rows are detected (based on
        ``get_unique_subsets()``). The summed values are kept in the first row,
        and duplicate rows are removed.

        Returns:
            Tuple of column names whose values should be summed when duplicates
            are found.

        Example:
            >>> @classmethod
            ... def get_add_on_duplicate_cols(cls):
            ...     return ("quantity", "revenue", "impressions")
        """

    @classmethod
    @abstractmethod
    def get_col_precision_map(cls) -> dict[str, int]:
        """Define rounding precision for float columns.

        This abstract method specifies the number of decimal places to round
        float columns to. Rounding is applied during the standard conversion
        phase and uses Kahan summation to compensate for floating-point
        rounding errors.

        Returns:
            Dictionary mapping float column names to their precision
            (number of decimal places).

        Example:
            >>> @classmethod
            ... def get_col_precision_map(cls):
            ...     return {
            ...         "price": 2,
            ...         "percentage": 4,
            ...         "score": 1,
            ...     }
        """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize the CleaningDF and execute the cleaning pipeline.

        Creates a Polars DataFrame with NaN values automatically converted to
        null, then immediately executes the full cleaning pipeline. The schema
        is enforced from ``get_col_dtype_map()``.

        Args:
            *args: Positional arguments passed to ``pl.DataFrame`` constructor.
            **kwargs: Keyword arguments passed to ``pl.DataFrame`` constructor.

        Note:
            The following kwargs are automatically set and will override any
            user-provided values:
            - ``nan_to_null``: Always set to True
            - ``schema``: Set from ``get_col_dtype_map()``
            - ``data``: Replaced with renamed and filtered data
        """
        # Create a temp df for standardization while accepting all Polars args and kwargs
        temp_df = pl.DataFrame(*args, **kwargs)
        temp_df = self.rename_cols(temp_df)
        temp_df = self.drop_cols(temp_df)

        # Enforce standard kwargs and create the final df
        kwargs["data"] = temp_df.to_dict(as_series=True)
        kwargs["nan_to_null"] = True
        kwargs["schema"] = self.get_col_dtype_map()
        self.df = pl.DataFrame(**kwargs)
        self.clean()

    @classmethod
    def get_col_names(cls) -> tuple[str, ...]:
        """Get the standardized column names from the dtype map.

        Returns:
            Tuple of standardized column names in the order they appear
            in ``get_col_dtype_map()``.
        """
        return tuple(cls.get_col_dtype_map().keys())

    def clean(self) -> None:
        """Execute the complete data cleaning pipeline.

        Applies all cleaning operations in the following order:
        1. Fill null values with defaults
        2. Convert columns to correct types and apply transformations
        3. Drop rows with all-null column subsets
        4. Handle duplicates by aggregating and removing
        5. Sort the DataFrame
        6. Validate data quality

        Note:
            Renaming and dropping columns are done during ``__init__`` before
            this method is called. This method is automatically called during
            initialization.
        """
        self.fill_nulls()
        self.convert_cols()
        self.drop_null_subsets()
        self.handle_duplicates()
        self.sort_cols()
        self.check()

    @classmethod
    def raise_on_missing_cols(
        cls,
        map_func: Callable[..., dict[str, Any]],
    ) -> None:
        """Validate that all required columns are present in a configuration map.

        Checks that the columns returned by ``map_func`` contain all columns
        defined in the schema. Raises KeyError if any required columns are
        missing from the map.

        Args:
            map_func: A callable that returns a dict with column names as keys.

        Raises:
            KeyError: If any required columns are missing from the map.
        """
        col_names = cls.get_col_names()
        missing_cols = set(col_names) - set(map_func().keys())
        if missing_cols:
            msg = f"Missing columns in {map_func}: {missing_cols}"
            raise KeyError(msg)

    def rename_cols(self, temp_df: pl.DataFrame) -> pl.DataFrame:
        """Rename columns from raw names to standardized names.

        Applies the reverse of ``get_rename_map()`` to rename columns from
        their raw input names to standardized names.

        Args:
            temp_df: The DataFrame with raw column names to rename.

        Returns:
            DataFrame with columns renamed to standardized names.

        Raises:
            KeyError: If any required columns are missing from the rename map.
        """
        self.raise_on_missing_cols(self.get_rename_map)
        return temp_df.rename(reverse_dict(self.get_rename_map()))

    def drop_cols(self, temp_df: pl.DataFrame) -> pl.DataFrame:
        """Drop columns not defined in the schema.

        Selects only the columns defined in ``get_col_names()``, removing any
        extra columns that may have been in the input data.

        Args:
            temp_df: The DataFrame to filter columns from.

        Returns:
            DataFrame containing only the columns defined in the schema.
        """
        return temp_df.select(self.get_col_names())

    def fill_nulls(self) -> None:
        """Fill null values with defaults from the fill null map.

        Replaces null values in each column with the corresponding fill value
        from ``get_fill_null_map()``.

        Raises:
            KeyError: If any columns are missing from the fill null map.
        """
        self.raise_on_missing_cols(self.get_fill_null_map)
        self.df = self.df.with_columns(
            [
                pl.col(col_name).fill_null(fill_value)
                for col_name, fill_value in self.get_fill_null_map().items()
            ]
        )

    def convert_cols(self) -> None:
        """Apply standard and custom column conversions.

        Orchestrates both standard conversions (string stripping, float
        rounding) and custom conversions defined in ``get_col_converter_map()``.

        Raises:
            KeyError: If any columns are missing from the converter map.
        """
        self.raise_on_missing_cols(self.get_col_converter_map)
        self.standard_convert_cols()
        self.custom_convert_cols()

    def standard_convert_cols(self) -> None:
        """Apply standard conversions based on data type.

        Automatically applies the following transformations:
        - ``pl.Utf8`` columns: Strip leading/trailing whitespace
        - ``pl.Float64`` columns: Round to precision using Kahan summation
        """
        for col_name, dtype in self.get_col_dtype_map().items():
            if dtype == pl.Utf8:
                converter = self.strip_col
            elif dtype == pl.Float64:
                converter = self.round_col
            else:
                continue
            self.df = self.df.with_columns(
                pl.col(col_name).map_batches(converter, return_dtype=dtype)
            )

    def custom_convert_cols(self) -> None:
        """Apply custom conversion functions to columns.

        Applies custom transformations from ``get_col_converter_map()`` to each
        column. Columns marked with ``skip_col_converter`` are skipped.
        """
        self.df = self.df.with_columns(
            [
                pl.col(col_name).map_batches(
                    converter, return_dtype=self.get_col_dtype_map()[col_name]
                )
                for col_name, converter in self.get_col_converter_map().items()
                if converter.__name__ != self.skip_col_converter.__name__  # ty:ignore[unresolved-attribute]
            ]
        )

    @classmethod
    def strip_col(cls, col: pl.Series) -> pl.Series:
        """Remove leading and trailing whitespace from a string column.

        Args:
            col: Polars Series of string type (``pl.Utf8``).

        Returns:
            Series with leading and trailing whitespace removed from each value.
        """
        return col.str.strip_chars()

    @classmethod
    def lower_col(cls, col: pl.Series) -> pl.Series:
        """Convert a string column to lowercase.

        Args:
            col: Polars Series of string type (``pl.Utf8``).

        Returns:
            Series with all characters converted to lowercase.
        """
        return col.str.to_lowercase()

    @classmethod
    def round_col(
        cls,
        col: pl.Series,
        precision: int | None = None,
        *,
        compensate: bool = True,
    ) -> pl.Series:
        """Round a float column to specified precision.

        Uses Kahan summation algorithm to compensate for floating-point
        rounding errors when ``compensate=True``, ensuring that the sum of
        rounded values matches the rounded sum of original values.

        Args:
            col: Polars Series of float type (``pl.Float64``).
            precision: Number of decimal places. If None, uses the value from
                ``get_col_precision_map()`` for this column.
            compensate: If True, use Kahan summation to reduce cumulative
                rounding errors. Defaults to True.

        Returns:
            Series with values rounded to the specified precision.

        Note:
            Kahan summation is slower than simple rounding but provides better
            accuracy for financial or scientific calculations where cumulative
            rounding errors matter.
        """
        if precision is None:
            precision = cls.get_col_precision_map()[str(col.name)]
        if not compensate:
            return col.round(precision)

        # Compensate for rounding errors with a Kahan sum
        error = 0.0
        values = []
        for value in col.to_list():  # Ensure iteration over Python floats
            corrected = value + error
            rounded = round(corrected, precision)
            error = corrected - rounded
            values.append(rounded)

        return pl.Series(name=col.name, values=values, dtype=col.dtype)

    @classmethod
    def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
        """Placeholder to skip custom conversion for a column.

        Use this method in ``get_col_converter_map()`` to indicate that a
        column should not have custom conversion applied. This method should
        never be actually called - it's only used as a marker.

        Args:
            _col: Unused. The column that would be converted.

        Raises:
            NotImplementedError: Always raised if this method is called.

        Example:
            >>> @classmethod
            ... def get_col_converter_map(cls):
            ...     return {
            ...         "email": lambda s: s.str.to_lowercase(),
            ...         "user_id": cls.skip_col_converter,  # No conversion
            ...     }
        """
        msg = (
            "skip_col_converter is just a flag to skip conversion for a column "
            "and should not be actually called."
        )
        raise NotImplementedError(msg)

    def drop_null_subsets(self) -> None:
        """Drop rows where all columns in a subset are null.

        Applies null-dropping rules defined in ``get_drop_null_subsets()``.
        If no subsets are defined, drops rows where all columns are null.
        """
        subsets = self.get_drop_null_subsets()
        if not subsets:
            self.df = self.df.drop_nulls()
            return
        for subset in subsets:
            self.df = self.df.drop_nulls(subset=subset)

    def handle_duplicates(self) -> None:
        """Remove duplicate rows and aggregate specified columns.

        For each uniqueness subset defined in ``get_unique_subsets()``:
        1. Sum values in columns specified by ``get_add_on_duplicate_cols()``
        2. Keep only the first row of each duplicate group

        Example:
            If two rows have the same (user_id, date) and values 1 and 2 in
            the 'quantity' column, the result will have one row with
            quantity=3.
        """
        for subset in self.get_unique_subsets():
            for col in self.get_add_on_duplicate_cols():
                self.df = self.df.with_columns(pl.col(col).sum().over(subset))
            self.df = self.df.unique(subset=subset, keep="first")

    def sort_cols(self) -> None:
        """Sort the DataFrame by columns and directions from get_sort_cols().

        Applies multi-column sorting with per-column sort direction
        (ascending or descending) as defined in ``get_sort_cols()``.
        """
        sort_cols = self.get_sort_cols()
        # Nothing to sort if no sort columns are configured
        if not sort_cols:
            return
        cols, desc = zip(*sort_cols, strict=True)
        self.df = self.df.sort(cols, descending=desc)

    def check(self) -> None:
        """Validate data quality after cleaning.

        Runs all validation checks in order:
        1. Correct data types for all columns
        2. No null values in required columns
        3. No NaN values in float columns

        This method is called automatically at the end of the ``clean()``
        pipeline.

        Raises:
            TypeError: If any column has an incorrect data type.
            ValueError: If required columns contain nulls or float columns
                contain NaN values.
        """
        self.check_correct_dtypes()
        self.check_no_null_cols()
        self.check_no_nan()

    def check_correct_dtypes(self) -> None:
        """Validate that all columns have their expected data types.

        Compares the actual DataFrame schema against the expected types
        defined in ``get_col_dtype_map()``.

        Raises:
            TypeError: If any column's actual type doesn't match the expected
                type from the schema.
        """
        schema = self.df.schema
        col_dtype_map = self.get_col_dtype_map()
        for col, dtype in col_dtype_map.items():
            schema_dtype = schema[col]
            if schema_dtype != dtype:
                msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
                raise TypeError(msg)

    def check_no_null_cols(self) -> None:
        """Validate that required columns contain no null values.

        Checks all columns defined in ``get_no_null_cols()`` for null values.

        Raises:
            ValueError: If any column in ``get_no_null_cols()`` contains null
                values.
        """
        no_null_cols = self.get_no_null_cols()
        # Use a single select to check all columns at once
        null_flags = self.df.select(
            [pl.col(col).is_null().any() for col in no_null_cols]
        )
        # Iterate over columns and check if any have nulls
        for col in no_null_cols:
            if null_flags[col].item():
                msg = f"Null values found in column: {col}"
                raise ValueError(msg)

    def check_no_nan(self) -> None:
        """Validate that float columns contain no NaN values.

        Checks all columns with float data types (``pl.Float64``, etc.) for
        NaN values.

        Raises:
            ValueError: If any float column contains NaN values.
        """
        float_cols = [
            col
            for col, dtype in self.get_col_dtype_map().items()
            if issubclass(dtype, FloatType)
        ]
        has_nan = self.df.select(
            pl.any_horizontal(pl.col(float_cols).is_nan().any())
        ).item()
        if has_nan:
            msg = "NaN values found in the dataframe"
            raise ValueError(msg)
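
For orientation, here is a minimal sketch of how the abstract methods above might be wired together in a child class. It is not part of the winiutils 2.3.12 wheel: the SalesCleaner class, its column names, and the sample data are hypothetical, and it assumes the import path shown in the module docstring and no extra setup beyond implementing the abstract methods.

import polars as pl

from winiutils.src.data.dataframe.cleaning import CleaningDF


class SalesCleaner(CleaningDF):
    # Standardized column names as class-level constants (see the class docstring).
    USER_ID = "user_id"
    DATE = "date"
    REVENUE = "revenue"

    @classmethod
    def get_rename_map(cls):
        # Standardized name -> raw input name; the reverse map is applied while cleaning.
        return {cls.USER_ID: "UserId", cls.DATE: "Date", cls.REVENUE: "Revenue"}

    @classmethod
    def get_col_dtype_map(cls):
        return {cls.USER_ID: pl.Int64, cls.DATE: pl.Utf8, cls.REVENUE: pl.Float64}

    @classmethod
    def get_fill_null_map(cls):
        return {cls.USER_ID: 0, cls.DATE: "", cls.REVENUE: 0.0}

    @classmethod
    def get_col_converter_map(cls):
        # skip_col_converter marks columns that need no custom transformation.
        return {
            cls.USER_ID: cls.skip_col_converter,
            cls.DATE: cls.lower_col,
            cls.REVENUE: cls.skip_col_converter,
        }

    @classmethod
    def get_drop_null_subsets(cls):
        return ((cls.USER_ID,),)

    @classmethod
    def get_unique_subsets(cls):
        return ((cls.USER_ID, cls.DATE),)

    @classmethod
    def get_add_on_duplicate_cols(cls):
        return (cls.REVENUE,)

    @classmethod
    def get_sort_cols(cls):
        return ((cls.DATE, False), (cls.USER_ID, False))

    @classmethod
    def get_no_null_cols(cls):
        return (cls.USER_ID, cls.DATE)

    @classmethod
    def get_col_precision_map(cls):
        return {cls.REVENUE: 2}


raw = {
    "UserId": [1, 1, 2],
    "Date": ["2024-01-01", "2024-01-01", "2024-01-02"],
    "Revenue": [10.0, 5.25, 7.5],
    "ignored": ["a", "b", "c"],  # not in the schema, removed by drop_cols()
}
cleaned = SalesCleaner(raw)
print(cleaned.df)

With this configuration, the two rows sharing (user_id, date) collapse into one row whose revenue is the sum of the rounded values, the extra input column is dropped, and the result is sorted by date and then user_id before the final dtype, null, and NaN checks run.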