winiutils-2.3.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. winiutils/__init__.py +1 -0
  2. winiutils/dev/__init__.py +1 -0
  3. winiutils/dev/builders/__init__.py +1 -0
  4. winiutils/dev/cli/__init__.py +1 -0
  5. winiutils/dev/cli/subcommands.py +6 -0
  6. winiutils/dev/configs/__init__.py +1 -0
  7. winiutils/dev/tests/__init__.py +1 -0
  8. winiutils/dev/tests/fixtures/__init__.py +1 -0
  9. winiutils/dev/tests/fixtures/fixtures.py +32 -0
  10. winiutils/main.py +9 -0
  11. winiutils/py.typed +0 -0
  12. winiutils/resources/__init__.py +1 -0
  13. winiutils/src/__init__.py +4 -0
  14. winiutils/src/data/__init__.py +8 -0
  15. winiutils/src/data/dataframe/__init__.py +7 -0
  16. winiutils/src/data/dataframe/cleaning.py +734 -0
  17. winiutils/src/data/structures/__init__.py +8 -0
  18. winiutils/src/data/structures/dicts.py +40 -0
  19. winiutils/src/data/structures/text/__init__.py +7 -0
  20. winiutils/src/data/structures/text/string.py +157 -0
  21. winiutils/src/iterating/__init__.py +8 -0
  22. winiutils/src/iterating/concurrent/__init__.py +9 -0
  23. winiutils/src/iterating/concurrent/concurrent.py +301 -0
  24. winiutils/src/iterating/concurrent/multiprocessing.py +186 -0
  25. winiutils/src/iterating/concurrent/multithreading.py +132 -0
  26. winiutils/src/iterating/iterate.py +45 -0
  27. winiutils/src/oop/__init__.py +7 -0
  28. winiutils/src/oop/mixins/__init__.py +8 -0
  29. winiutils/src/oop/mixins/meta.py +217 -0
  30. winiutils/src/oop/mixins/mixin.py +58 -0
  31. winiutils/src/security/__init__.py +8 -0
  32. winiutils/src/security/cryptography.py +100 -0
  33. winiutils/src/security/keyring.py +167 -0
  34. winiutils-2.3.12.dist-info/METADATA +283 -0
  35. winiutils-2.3.12.dist-info/RECORD +38 -0
  36. winiutils-2.3.12.dist-info/WHEEL +4 -0
  37. winiutils-2.3.12.dist-info/entry_points.txt +4 -0
  38. winiutils-2.3.12.dist-info/licenses/LICENSE +21 -0
winiutils/src/data/dataframe/cleaning.py
@@ -0,0 +1,734 @@
1
+ """DataFrame cleaning pipeline utilities using Polars.
2
+
3
+ This module provides an abstract base class for building extensible DataFrame
4
+ cleaning pipelines. The ``CleaningDF`` class implements an 8-step cleaning
5
+ pipeline that can be customized by implementing abstract methods in child classes.
6
+
7
+ The cleaning pipeline executes the following operations in order:
8
+ 1. Rename columns to standardized names
9
+ 2. Drop columns not in the schema
10
+ 3. Fill null values with specified defaults
11
+ 4. Convert columns to correct data types
12
+ 5. Drop rows where specified column subsets are entirely null
13
+ 6. Handle duplicates by aggregating and removing
14
+ 7. Sort the DataFrame by specified columns
15
+ 8. Validate data quality (types, nulls, NaN values)
16
+
17
+ Example:
18
+ >>> import polars as pl
19
+ >>> from winiutils.src.data.dataframe.cleaning import CleaningDF
20
+ >>>
21
+ >>> class UserCleaner(CleaningDF):
22
+ ... USER_ID = "user_id"
23
+ ... EMAIL = "email"
24
+ ...
25
+ ... @classmethod
26
+ ... def get_rename_map(cls):
27
+ ... return {cls.USER_ID: "UserId", cls.EMAIL: "Email"}
28
+ ...
29
+ ... # ... implement other abstract methods
30
+ """
31
+
32
+ from abc import abstractmethod
33
+ from collections.abc import Callable
34
+ from typing import Any
35
+
36
+ import polars as pl
37
+ from polars.datatypes.classes import FloatType
38
+
39
+ from winiutils.src.data.structures.dicts import reverse_dict
40
+ from winiutils.src.oop.mixins.mixin import ABCLoggingMixin
41
+
42
+
43
+ class CleaningDF(ABCLoggingMixin):
44
+ """Abstract base class for cleaning and standardizing DataFrames using Polars.
45
+
46
+ This class provides a comprehensive pipeline for importing, cleaning, and
47
+ standardizing data from various sources before loading into databases or
48
+ other systems. It enforces data quality standards through a series of
49
+ configurable cleaning operations.
50
+
51
+ The cleaning pipeline executes in the following order:
52
+ 1. Rename columns according to a standardized naming scheme
53
+ 2. Drop columns not in the schema
54
+ 3. Fill null values with specified defaults
55
+ 4. Convert columns to correct data types and apply custom transformations
56
+ 5. Drop rows where specified column subsets are entirely null
57
+ 6. Handle duplicates by aggregating values and removing duplicates
58
+ 7. Sort the DataFrame by specified columns
59
+ 8. Validate data quality (correct dtypes, no nulls in required columns,
60
+ no NaN values)
61
+
62
+ Child classes must implement abstract methods to define the cleaning
63
+ configuration:
64
+ - ``get_rename_map()``: Define column name mappings
65
+ - ``get_col_dtype_map()``: Define expected data types for each column
66
+ - ``get_drop_null_subsets()``: Define which column subsets trigger row
67
+ deletion
68
+ - ``get_fill_null_map()``: Define null value fill strategies
69
+ - ``get_sort_cols()``: Define sort order
70
+ - ``get_unique_subsets()``: Define duplicate detection criteria
71
+ - ``get_no_null_cols()``: Define columns that cannot contain nulls
72
+ - ``get_col_converter_map()``: Define custom column transformations
73
+ - ``get_add_on_duplicate_cols()``: Define columns to aggregate when
74
+ duplicates are found
75
+ - ``get_col_precision_map()``: Define rounding precision for float columns
76
+
77
+ Attributes:
78
+ df: The cleaned Polars DataFrame after the pipeline has executed.
79
+
80
+ Note:
81
+ - Define column names as class-level string constants for reusability
82
+ - NaN values are automatically converted to null for consistency
83
+ - The class inherits automatic method logging from ``ABCLoggingMixin``
84
+
85
+ Example:
86
+ >>> class UserCleaner(CleaningDF):
87
+ ... USER_ID = "user_id"
88
+ ... EMAIL = "email"
89
+ ... SCORE = "score"
90
+ ...
91
+ ... @classmethod
92
+ ... def get_col_dtype_map(cls):
93
+ ... return {cls.USER_ID: pl.Int64, cls.EMAIL: pl.Utf8}
94
+ """
95
+
96
+ @classmethod
97
+ @abstractmethod
98
+ def get_rename_map(cls) -> dict[str, str]:
99
+ """Define column name mappings for standardization.
100
+
101
+ This abstract method must be implemented in child classes to specify how
102
+ raw input column names should be renamed to standardized names. Renaming
103
+ is the first operation in the cleaning pipeline, executed before all other
104
+ cleaning operations.
105
+
106
+ The mapping format follows the CleaningDF convention of mapping
107
+ standardized names to raw input names. The reverse mapping is applied
108
+ to the DataFrame during cleaning.
109
+
110
+ Returns:
111
+ Dictionary mapping standardized column names (keys) to raw input
112
+ column names (values).
113
+
114
+ Example:
115
+ >>> @classmethod
116
+ ... def get_rename_map(cls):
117
+ ... return {
118
+ ... "user_id": "UserId",
119
+ ... "email": "Email_Address",
120
+ ... "created_at": "CreatedDate",
121
+ ... }
122
+ """
123
+
124
+ @classmethod
125
+ @abstractmethod
126
+ def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
127
+ """Define the expected data type for each column in the cleaned DataFrame.
128
+
129
+ This abstract method must be implemented in child classes to specify the
130
+ target data types for all columns. The DataFrame will be validated against
131
+ this schema after cleaning, and a TypeError will be raised if any column
132
+ has an incorrect type.
133
+
134
+ Returns:
135
+ Dictionary mapping standardized column names to their expected
136
+ Polars data types.
137
+
138
+ Example:
139
+ >>> @classmethod
140
+ ... def get_col_dtype_map(cls):
141
+ ... return {
142
+ ... "user_id": pl.Int64,
143
+ ... "email": pl.Utf8,
144
+ ... "created_at": pl.Date,
145
+ ... "score": pl.Float64,
146
+ ... }
147
+ """
148
+
149
+ @classmethod
150
+ @abstractmethod
151
+ def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
152
+ """Define column subsets for dropping rows with all-null values.
153
+
154
+ This abstract method specifies which column subsets should trigger row
155
+ deletion. A row is dropped if ALL columns in a subset are null. Multiple
156
+ subsets can be defined to apply different null-dropping rules. If no
157
+ subsets are defined, rows where all columns are null will be dropped.
158
+
159
+ Returns:
160
+ Tuple of column name tuples, where each inner tuple represents one
161
+ subset. A row is dropped if all columns in any subset are null.
162
+
163
+ Example:
164
+ >>> @classmethod
165
+ ... def get_drop_null_subsets(cls):
166
+ ... return (
167
+ ... ("email", "phone"), # Drop if both are null
168
+ ... ("address_line1",), # Drop if null
169
+ ... )
170
+ """
171
+
172
+ @classmethod
173
+ @abstractmethod
174
+ def get_fill_null_map(cls) -> dict[str, Any]:
175
+ """Define null value fill strategies for each column.
176
+
177
+ This abstract method specifies default values to fill null entries in
178
+ each column. This is applied early in the cleaning pipeline after
179
+ column renaming.
180
+
181
+ Returns:
182
+ Dictionary mapping column names to their fill values. The fill
183
+ value can be any type appropriate for the column's data type.
184
+
185
+ Example:
186
+ >>> @classmethod
187
+ ... def get_fill_null_map(cls):
188
+ ... return {
189
+ ... "email": "",
190
+ ... "phone": "",
191
+ ... "score": 0,
192
+ ... "status": "unknown",
193
+ ... }
194
+ """
195
+
196
+ @classmethod
197
+ @abstractmethod
198
+ def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
199
+ """Define the sort order for the cleaned DataFrame.
200
+
201
+ This abstract method specifies which columns to sort by and in what
202
+ order (ascending or descending). Sorting is applied near the end of
203
+ the cleaning pipeline, after all data transformations are complete.
204
+
205
+ Returns:
206
+ Tuple of (column_name, is_descending) tuples. Each tuple specifies
207
+ a column and sort direction. Columns are sorted in the order they
208
+ appear. True = descending, False = ascending.
209
+
210
+ Example:
211
+ >>> @classmethod
212
+ ... def get_sort_cols(cls):
213
+ ... return (
214
+ ... ("created_at", True), # Descending
215
+ ... ("user_id", False), # Ascending
216
+ ... )
217
+ """
218
+
219
+ @classmethod
220
+ @abstractmethod
221
+ def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
222
+ """Define column subsets for duplicate detection and removal.
223
+
224
+ This abstract method specifies which column combinations define
225
+ uniqueness. Rows are considered duplicates if they have identical
226
+ values in all columns of a subset. When duplicates are found, values
227
+ in columns specified by ``get_add_on_duplicate_cols()`` are summed,
228
+ and the first row is kept.
229
+
230
+ Returns:
231
+ Tuple of column name tuples, where each inner tuple represents
232
+ one uniqueness constraint. Duplicates are detected and handled
233
+ for each subset independently.
234
+
235
+ Example:
236
+ >>> @classmethod
237
+ ... def get_unique_subsets(cls):
238
+ ... return (
239
+ ... ("user_id", "date"), # Unique by user_id and date
240
+ ... ("transaction_id",), # Unique by transaction_id
241
+ ... )
242
+ """
243
+
244
+ @classmethod
245
+ @abstractmethod
246
+ def get_no_null_cols(cls) -> tuple[str, ...]:
247
+ """Define columns that must not contain null values.
248
+
249
+ This abstract method specifies which columns are required to have
250
+ non-null values. A ValueError is raised during the final validation
251
+ step if any of these columns contain null values.
252
+
253
+ Returns:
254
+ Tuple of column names that must not contain null values.
255
+
256
+ Example:
257
+ >>> @classmethod
258
+ ... def get_no_null_cols(cls):
259
+ ... return ("user_id", "email", "created_at")
260
+ """
261
+
262
+ @classmethod
263
+ @abstractmethod
264
+ def get_col_converter_map(
265
+ cls,
266
+ ) -> dict[str, Callable[[pl.Series], pl.Series]]:
267
+ """Define custom conversion functions for columns.
268
+
269
+ This abstract method specifies custom transformations to apply to
270
+ columns after standard conversions (string stripping, float rounding).
271
+ Each function receives a Polars Series and returns a transformed
272
+ Series. Use ``skip_col_converter`` as a placeholder for columns that
273
+ don't need custom conversion.
274
+
275
+ Returns:
276
+ Dictionary mapping column names to their conversion functions.
277
+ Each function takes a Series and returns a transformed Series.
278
+
279
+ Example:
280
+ >>> @classmethod
281
+ ... def get_col_converter_map(cls):
282
+ ... return {
283
+ ... "email": lambda s: s.str.to_lowercase(),
284
+ ... "phone": cls.parse_phone_number,
285
+ ... "created_at": cls.skip_col_converter,
286
+ ... }
287
+ """
288
+
289
+ @classmethod
290
+ @abstractmethod
291
+ def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
292
+ """Define columns to aggregate when duplicate rows are found.
293
+
294
+ This abstract method specifies which columns should have their values
295
+ summed when duplicate rows are detected (based on
296
+ ``get_unique_subsets()``). The summed values are kept in the first row,
297
+ and duplicate rows are removed.
298
+
299
+ Returns:
300
+ Tuple of column names whose values should be summed when duplicates
301
+ are found.
302
+
303
+ Example:
304
+ >>> @classmethod
305
+ ... def get_add_on_duplicate_cols(cls):
306
+ ... return ("quantity", "revenue", "impressions")
307
+ """
308
+
309
+ @classmethod
310
+ @abstractmethod
311
+ def get_col_precision_map(cls) -> dict[str, int]:
312
+ """Define rounding precision for float columns.
313
+
314
+ This abstract method specifies the number of decimal places to round
315
+ float columns to. Rounding is applied during the standard conversion
316
+ phase and uses Kahan summation to compensate for floating-point
317
+ rounding errors.
318
+
319
+ Returns:
320
+ Dictionary mapping float column names to their precision
321
+ (number of decimal places).
322
+
323
+ Example:
324
+ >>> @classmethod
325
+ ... def get_col_precision_map(cls):
326
+ ... return {
327
+ ... "price": 2,
328
+ ... "percentage": 4,
329
+ ... "score": 1,
330
+ ... }
331
+ """
332
+
333
+ def __init__(
334
+ self,
335
+ *args: Any,
336
+ **kwargs: Any,
337
+ ) -> None:
338
+ """Initialize the CleaningDF and execute the cleaning pipeline.
339
+
340
+ Creates a Polars DataFrame with NaN values automatically converted to
341
+ null, then immediately executes the full cleaning pipeline. The schema
342
+ is enforced from ``get_col_dtype_map()``.
343
+
344
+ Args:
345
+ *args: Positional arguments passed to ``pl.DataFrame`` constructor.
346
+ **kwargs: Keyword arguments passed to ``pl.DataFrame`` constructor.
347
+
348
+ Note:
349
+ The following kwargs are automatically set and will override any
350
+ user-provided values:
351
+ - ``nan_to_null``: Always set to True
352
+ - ``schema``: Set from ``get_col_dtype_map()``
353
+ - ``data``: Replaced with renamed and filtered data
354
+ """
355
+ # create a temp df for standardization and accepting all polars args and kwargs
356
+ temp_df = pl.DataFrame(*args, **kwargs)
357
+ temp_df = self.rename_cols(temp_df)
358
+ temp_df = self.drop_cols(temp_df)
359
+
360
+ # enforce standard kwargs and create the final df
361
+ kwargs["data"] = temp_df.to_dict(as_series=True)
362
+ kwargs["nan_to_null"] = True
363
+ kwargs["schema"] = self.get_col_dtype_map()
364
+ self.df = pl.DataFrame(**kwargs)
365
+ self.clean()
366
+
367
+ @classmethod
368
+ def get_col_names(cls) -> tuple[str, ...]:
369
+ """Get the standardized column names from the dtype map.
370
+
371
+ Returns:
372
+ Tuple of standardized column names in the order they appear
373
+ in ``get_col_dtype_map()``.
374
+ """
375
+ return tuple(cls.get_col_dtype_map().keys())
376
+
377
+ def clean(self) -> None:
378
+ """Execute the complete data cleaning pipeline.
379
+
380
+ Applies all cleaning operations in the following order:
381
+ 1. Fill null values with defaults
382
+ 2. Convert columns to correct types and apply transformations
383
+ 3. Drop rows with all-null column subsets
384
+ 4. Handle duplicates by aggregating and removing
385
+ 5. Sort the DataFrame
386
+ 6. Validate data quality
387
+
388
+ Note:
389
+ Renaming and dropping columns are done during ``__init__`` before
390
+ this method is called. This method is automatically called during
391
+ initialization.
392
+ """
393
+ self.fill_nulls()
394
+ self.convert_cols()
395
+ self.drop_null_subsets()
396
+ self.handle_duplicates()
397
+ self.sort_cols()
398
+ self.check()
399
+
400
+ @classmethod
401
+ def raise_on_missing_cols(
402
+ cls,
403
+ map_func: Callable[..., dict[str, Any]],
404
+ ) -> None:
405
+ """Validate that all required columns are present in a configuration map.
406
+
407
+ Checks that the columns returned by ``map_func`` contain all columns
408
+ defined in the schema. Raises KeyError if any required columns are
409
+ missing from the map.
410
+
411
+ Args:
412
+ map_func: A callable that returns a dict with column names as keys.
413
+
414
+ Raises:
415
+ KeyError: If any required columns are missing from the map.
416
+ """
417
+ col_names = cls.get_col_names()
418
+ missing_cols = set(col_names) - set(map_func().keys())
419
+ if missing_cols:
420
+ msg = f"Missing columns in {map_func}: {missing_cols}"
421
+ raise KeyError(msg)
422
+
423
+ def rename_cols(self, temp_df: pl.DataFrame) -> pl.DataFrame:
424
+ """Rename columns from raw names to standardized names.
425
+
426
+ Applies the reverse of ``get_rename_map()`` to rename columns from
427
+ their raw input names to standardized names.
428
+
429
+ Args:
430
+ temp_df: The DataFrame with raw column names to rename.
431
+
432
+ Returns:
433
+ DataFrame with columns renamed to standardized names.
434
+
435
+ Raises:
436
+ KeyError: If any required columns are missing from the rename map.
437
+ """
438
+ self.raise_on_missing_cols(self.get_rename_map)
439
+ return temp_df.rename(reverse_dict(self.get_rename_map()))
440
+
441
+ def drop_cols(self, temp_df: pl.DataFrame) -> pl.DataFrame:
442
+ """Drop columns not defined in the schema.
443
+
444
+ Selects only the columns defined in ``get_col_names()``, removing any
445
+ extra columns that may have been in the input data.
446
+
447
+ Args:
448
+ temp_df: The DataFrame to filter columns from.
449
+
450
+ Returns:
451
+ DataFrame containing only the columns defined in the schema.
452
+ """
453
+ return temp_df.select(self.get_col_names())
454
+
455
+ def fill_nulls(self) -> None:
456
+ """Fill null values with defaults from the fill null map.
457
+
458
+ Replaces null values in each column with the corresponding fill value
459
+ from ``get_fill_null_map()``.
460
+
461
+ Raises:
462
+ KeyError: If any columns are missing from the fill null map.
463
+ """
464
+ self.raise_on_missing_cols(self.get_fill_null_map)
465
+ self.df = self.df.with_columns(
466
+ [
467
+ pl.col(col_name).fill_null(fill_value)
468
+ for col_name, fill_value in self.get_fill_null_map().items()
469
+ ]
470
+ )
471
+
472
+ def convert_cols(self) -> None:
473
+ """Apply standard and custom column conversions.
474
+
475
+ Orchestrates both standard conversions (string stripping, float
476
+ rounding) and custom conversions defined in ``get_col_converter_map()``.
477
+
478
+ Raises:
479
+ KeyError: If any columns are missing from the converter map.
480
+ """
481
+ self.raise_on_missing_cols(self.get_col_converter_map)
482
+ self.standard_convert_cols()
483
+ self.custom_convert_cols()
484
+
485
+ def standard_convert_cols(self) -> None:
486
+ """Apply standard conversions based on data type.
487
+
488
+ Automatically applies the following transformations:
489
+ - ``pl.Utf8`` columns: Strip leading/trailing whitespace
490
+ - ``pl.Float64`` columns: Round to precision using Kahan summation
491
+ """
492
+ for col_name, dtype in self.get_col_dtype_map().items():
493
+ if dtype == pl.Utf8:
494
+ converter = self.strip_col
495
+ elif dtype == pl.Float64:
496
+ converter = self.round_col
497
+ else:
498
+ continue
499
+ self.df = self.df.with_columns(
500
+ pl.col(col_name).map_batches(converter, return_dtype=dtype)
501
+ )
502
+
503
+ def custom_convert_cols(self) -> None:
504
+ """Apply custom conversion functions to columns.
505
+
506
+ Applies custom transformations from ``get_col_converter_map()`` to each
507
+ column. Columns marked with ``skip_col_converter`` are skipped.
508
+ """
509
+ self.df = self.df.with_columns(
510
+ [
511
+ pl.col(col_name).map_batches(
512
+ converter, return_dtype=self.get_col_dtype_map()[col_name]
513
+ )
514
+ for col_name, converter in self.get_col_converter_map().items()
515
+ if converter.__name__ != self.skip_col_converter.__name__ # ty:ignore[unresolved-attribute]
516
+ ]
517
+ )
518
+
519
+ @classmethod
520
+ def strip_col(cls, col: pl.Series) -> pl.Series:
521
+ """Remove leading and trailing whitespace from a string column.
522
+
523
+ Args:
524
+ col: Polars Series of string type (``pl.Utf8``).
525
+
526
+ Returns:
527
+ Series with leading and trailing whitespace removed from each value.
528
+ """
529
+ return col.str.strip_chars()
530
+
531
+ @classmethod
532
+ def lower_col(cls, col: pl.Series) -> pl.Series:
533
+ """Convert a string column to lowercase.
534
+
535
+ Args:
536
+ col: Polars Series of string type (``pl.Utf8``).
537
+
538
+ Returns:
539
+ Series with all characters converted to lowercase.
540
+ """
541
+ return col.str.to_lowercase()
542
+
543
+ @classmethod
544
+ def round_col(
545
+ cls,
546
+ col: pl.Series,
547
+ precision: int | None = None,
548
+ *,
549
+ compensate: bool = True,
550
+ ) -> pl.Series:
551
+ """Round a float column to specified precision.
552
+
553
+ Uses Kahan summation algorithm to compensate for floating-point
554
+ rounding errors when ``compensate=True``, ensuring that the sum of
555
+ rounded values matches the rounded sum of original values.
556
+
557
+ Args:
558
+ col: Polars Series of float type (``pl.Float64``).
559
+ precision: Number of decimal places. If None, uses the value from
560
+ ``get_col_precision_map()`` for this column.
561
+ compensate: If True, use Kahan summation to reduce cumulative
562
+ rounding errors. Defaults to True.
563
+
564
+ Returns:
565
+ Series with values rounded to the specified precision.
566
+
567
+ Note:
568
+ Kahan summation is slower than simple rounding but provides better
569
+ accuracy for financial or scientific calculations where cumulative
570
+ rounding errors matter.
571
+ """
572
+ if precision is None:
573
+ precision = cls.get_col_precision_map()[str(col.name)]
574
+ if not compensate:
575
+ return col.round(precision)
576
+
577
+ # compensate for rounding errors with kahan sum
578
+ error = 0.0
579
+ values = []
580
+ for value in col.to_list(): # Ensure iteration over Python floats
581
+ corrected = value + error
582
+ rounded = round(corrected, precision)
583
+ error = corrected - rounded
584
+ values.append(rounded)
585
+
586
+ return pl.Series(name=col.name, values=values, dtype=col.dtype)
587
+
588
+ @classmethod
589
+ def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
590
+ """Placeholder to skip custom conversion for a column.
591
+
592
+ Use this method in ``get_col_converter_map()`` to indicate that a
593
+ column should not have custom conversion applied. This method should
594
+ never be actually called - it's only used as a marker.
595
+
596
+ Args:
597
+ _col: Unused. The column that would be converted.
598
+
599
+ Raises:
600
+ NotImplementedError: Always raised if this method is called.
601
+
602
+ Example:
603
+ >>> @classmethod
604
+ ... def get_col_converter_map(cls):
605
+ ... return {
606
+ ... "email": lambda s: s.str.to_lowercase(),
607
+ ... "user_id": cls.skip_col_converter, # No conversion
608
+ ... }
609
+ """
610
+ msg = (
611
+ "skip_col_converter is just a flag to skip conversion for a column "
612
+ "and should not be actually called."
613
+ )
614
+ raise NotImplementedError(msg)
615
+
616
+ def drop_null_subsets(self) -> None:
617
+ """Drop rows where all columns in a subset are null.
618
+
619
+ Applies null-dropping rules defined in ``get_drop_null_subsets()``.
620
+ If no subsets are defined, drops rows where all columns are null.
621
+ """
622
+ subsets = self.get_drop_null_subsets()
623
+ if not subsets:
624
+ self.df = self.df.drop_nulls()
625
+ return
626
+ for subset in subsets:
627
+ self.df = self.df.drop_nulls(subset=subset)
628
+
629
+ def handle_duplicates(self) -> None:
630
+ """Remove duplicate rows and aggregate specified columns.
631
+
632
+ For each uniqueness subset defined in ``get_unique_subsets()``:
633
+ 1. Sum values in columns specified by ``get_add_on_duplicate_cols()``
634
+ 2. Keep only the first row of each duplicate group
635
+
636
+ Example:
637
+ If two rows have the same (user_id, date) and values 1 and 2 in
638
+ the 'quantity' column, the result will have one row with
639
+ quantity=3.
640
+ """
641
+ for subset in self.get_unique_subsets():
642
+ for col in self.get_add_on_duplicate_cols():
643
+ self.df = self.df.with_columns(pl.col(col).sum().over(subset))
644
+ self.df = self.df.unique(subset=subset, keep="first")
645
+
646
+ def sort_cols(self) -> None:
647
+ """Sort the DataFrame by columns and directions from get_sort_cols().
648
+
649
+ Applies multi-column sorting with per-column sort direction
650
+ (ascending or descending) as defined in ``get_sort_cols()``.
651
+ """
652
+ cols, desc = zip(*self.get_sort_cols(), strict=True)
653
+ if not cols:
654
+ return
655
+ self.df = self.df.sort(cols, descending=desc)
656
+
657
+ def check(self) -> None:
658
+ """Validate data quality after cleaning.
659
+
660
+ Runs all validation checks in order:
661
+ 1. Correct data types for all columns
662
+ 2. No null values in required columns
663
+ 3. No NaN values in float columns
664
+
665
+ This method is called automatically at the end of the ``clean()``
666
+ pipeline.
667
+
668
+ Raises:
669
+ TypeError: If any column has an incorrect data type.
670
+ ValueError: If required columns contain nulls or float columns
671
+ contain NaN values.
672
+ """
673
+ self.check_correct_dtypes()
674
+ self.check_no_null_cols()
675
+ self.check_no_nan()
676
+
677
+ def check_correct_dtypes(self) -> None:
678
+ """Validate that all columns have their expected data types.
679
+
680
+ Compares the actual DataFrame schema against the expected types
681
+ defined in ``get_col_dtype_map()``.
682
+
683
+ Raises:
684
+ TypeError: If any column's actual type doesn't match the expected
685
+ type from the schema.
686
+ """
687
+ schema = self.df.schema
688
+ col_dtype_map = self.get_col_dtype_map()
689
+ for col, dtype in col_dtype_map.items():
690
+ schema_dtype = schema[col]
691
+ if schema_dtype != dtype:
692
+ msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
693
+ raise TypeError(msg)
694
+
695
+ def check_no_null_cols(self) -> None:
696
+ """Validate that required columns contain no null values.
697
+
698
+ Checks all columns defined in ``get_no_null_cols()`` for null values.
699
+
700
+ Raises:
701
+ ValueError: If any column in ``get_no_null_cols()`` contains null
702
+ values.
703
+ """
704
+ no_null_cols = self.get_no_null_cols()
705
+ # Use a single select to check all columns at once
706
+ null_flags = self.df.select(
707
+ [pl.col(col).is_null().any() for col in no_null_cols]
708
+ )
709
+ # Iterate over columns and check if any have nulls
710
+ for col in no_null_cols:
711
+ if null_flags[col].item():
712
+ msg = f"Null values found in column: {col}"
713
+ raise ValueError(msg)
714
+
715
+ def check_no_nan(self) -> None:
716
+ """Validate that float columns contain no NaN values.
717
+
718
+ Checks all columns with float data types (``pl.Float64``, etc.) for
719
+ NaN values.
720
+
721
+ Raises:
722
+ ValueError: If any float column contains NaN values.
723
+ """
724
+ float_cols = [
725
+ col
726
+ for col, dtype in self.get_col_dtype_map().items()
727
+ if issubclass(dtype, FloatType)
728
+ ]
729
+ has_nan = self.df.select(
730
+ pl.any_horizontal(pl.col(float_cols).is_nan().any())
731
+ ).item()
732
+ if has_nan:
733
+ msg = "NaN values found in the dataframe"
734
+ raise ValueError(msg)
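
To make the pipeline above concrete, here is a minimal end-to-end sketch of a child class that implements all ten abstract methods and runs the cleaner on a small in-memory table. It assumes winiutils 2.3.12 and polars are installed; the SalesCleaner class, its column names, and the sample data are illustrative only and are not part of the package.

```python
# Hypothetical example, not shipped with winiutils: a concrete CleaningDF
# subclass wired up for a three-column sales table.
import polars as pl

from winiutils.src.data.dataframe.cleaning import CleaningDF


class SalesCleaner(CleaningDF):
    # Standardized column names as class-level constants, per the class docstring.
    ORDER_ID = "order_id"
    EMAIL = "email"
    REVENUE = "revenue"

    @classmethod
    def get_rename_map(cls):
        # standardized name -> raw input name
        return {cls.ORDER_ID: "OrderId", cls.EMAIL: "Email", cls.REVENUE: "Revenue"}

    @classmethod
    def get_col_dtype_map(cls):
        return {cls.ORDER_ID: pl.Int64, cls.EMAIL: pl.Utf8, cls.REVENUE: pl.Float64}

    @classmethod
    def get_drop_null_subsets(cls):
        return ((cls.ORDER_ID,),)  # drop rows with a null order id

    @classmethod
    def get_fill_null_map(cls):
        return {cls.ORDER_ID: 0, cls.EMAIL: "", cls.REVENUE: 0.0}

    @classmethod
    def get_sort_cols(cls):
        return ((cls.ORDER_ID, False),)  # ascending by order id

    @classmethod
    def get_unique_subsets(cls):
        return ((cls.ORDER_ID,),)

    @classmethod
    def get_no_null_cols(cls):
        return (cls.ORDER_ID, cls.EMAIL)

    @classmethod
    def get_col_converter_map(cls):
        return {
            cls.ORDER_ID: cls.skip_col_converter,  # no custom conversion
            cls.EMAIL: cls.lower_col,              # lowercase after the standard strip
            cls.REVENUE: cls.skip_col_converter,
        }

    @classmethod
    def get_add_on_duplicate_cols(cls):
        return (cls.REVENUE,)  # sum revenue when order_id repeats

    @classmethod
    def get_col_precision_map(cls):
        return {cls.REVENUE: 2}


# The two order-1 rows are merged into one with revenue 10.0 + 5.5 = 15.5,
# emails are stripped and lower-cased, and rows come out sorted by order_id.
cleaner = SalesCleaner(
    {
        "OrderId": [1, 1, 2],
        "Email": [" A@X.COM ", " A@X.COM ", "b@y.com"],
        "Revenue": [10.0, 5.5, 3.25],
    }
)
print(cleaner.df)
```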
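
The compensated rounding described in ``round_col`` can also be exercised on its own, since it is a classmethod that accepts an explicit precision. Below is a small sketch of how it differs from plain per-row rounding; the series name and values are arbitrary, and the commented results follow from the error-carrying logic in the docstring rather than from package documentation.

```python
import polars as pl

from winiutils.src.data.dataframe.cleaning import CleaningDF

s = pl.Series("amount", [0.144, 0.144, 0.144, 0.144])  # true sum 0.576

plain = CleaningDF.round_col(s, precision=2, compensate=False)  # [0.14, 0.14, 0.14, 0.14]
kahan = CleaningDF.round_col(s, precision=2)  # error carried row to row: [0.14, 0.15, 0.14, 0.15]

# Plain rounding drifts to 0.56, while the compensated variant keeps the
# column total at round(0.576, 2) == 0.58 (up to float representation).
print(plain.sum(), kahan.sum())
```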