winipedia-utils 0.2.63__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)
  1. winipedia_utils/artifacts/build.py +78 -0
  2. winipedia_utils/concurrent/concurrent.py +7 -2
  3. winipedia_utils/concurrent/multiprocessing.py +1 -2
  4. winipedia_utils/concurrent/multithreading.py +2 -2
  5. winipedia_utils/data/dataframe/cleaning.py +337 -100
  6. winipedia_utils/git/github/__init__.py +1 -0
  7. winipedia_utils/git/github/github.py +31 -0
  8. winipedia_utils/git/github/repo/__init__.py +1 -0
  9. winipedia_utils/git/github/repo/protect.py +103 -0
  10. winipedia_utils/git/github/repo/repo.py +205 -0
  11. winipedia_utils/git/github/workflows/base/__init__.py +1 -0
  12. winipedia_utils/git/github/workflows/base/base.py +889 -0
  13. winipedia_utils/git/github/workflows/health_check.py +69 -0
  14. winipedia_utils/git/github/workflows/publish.py +51 -0
  15. winipedia_utils/git/github/workflows/release.py +90 -0
  16. winipedia_utils/git/gitignore/config.py +77 -0
  17. winipedia_utils/git/gitignore/gitignore.py +5 -63
  18. winipedia_utils/git/pre_commit/config.py +49 -59
  19. winipedia_utils/git/pre_commit/hooks.py +46 -46
  20. winipedia_utils/git/pre_commit/run_hooks.py +19 -12
  21. winipedia_utils/iterating/iterate.py +63 -1
  22. winipedia_utils/modules/class_.py +69 -12
  23. winipedia_utils/modules/function.py +26 -3
  24. winipedia_utils/modules/inspection.py +56 -0
  25. winipedia_utils/modules/module.py +22 -28
  26. winipedia_utils/modules/package.py +116 -10
  27. winipedia_utils/projects/poetry/config.py +255 -112
  28. winipedia_utils/projects/poetry/poetry.py +230 -13
  29. winipedia_utils/projects/project.py +11 -42
  30. winipedia_utils/setup.py +11 -29
  31. winipedia_utils/testing/config.py +127 -0
  32. winipedia_utils/testing/create_tests.py +5 -19
  33. winipedia_utils/testing/skip.py +19 -0
  34. winipedia_utils/testing/tests/base/fixtures/fixture.py +36 -0
  35. winipedia_utils/testing/tests/base/fixtures/scopes/class_.py +3 -3
  36. winipedia_utils/testing/tests/base/fixtures/scopes/module.py +9 -6
  37. winipedia_utils/testing/tests/base/fixtures/scopes/session.py +27 -176
  38. winipedia_utils/testing/tests/base/utils/utils.py +27 -57
  39. winipedia_utils/text/config.py +250 -0
  40. winipedia_utils/text/string.py +30 -0
  41. winipedia_utils-0.6.6.dist-info/METADATA +390 -0
  42. {winipedia_utils-0.2.63.dist-info → winipedia_utils-0.6.6.dist-info}/RECORD +46 -34
  43. winipedia_utils/consts.py +0 -21
  44. winipedia_utils/git/workflows/base/base.py +0 -77
  45. winipedia_utils/git/workflows/publish.py +0 -79
  46. winipedia_utils/git/workflows/release.py +0 -91
  47. winipedia_utils-0.2.63.dist-info/METADATA +0 -738
  48. /winipedia_utils/{git/workflows/base → artifacts}/__init__.py +0 -0
  49. /winipedia_utils/git/{workflows → github/workflows}/__init__.py +0 -0
  50. {winipedia_utils-0.2.63.dist-info → winipedia_utils-0.6.6.dist-info}/WHEEL +0 -0
  51. {winipedia_utils-0.2.63.dist-info → winipedia_utils-0.6.6.dist-info}/licenses/LICENSE +0 -0
@@ -12,118 +12,201 @@ from typing import Any
 import polars as pl
 from polars.datatypes.classes import FloatType
 
-from winipedia_utils.data.structures.dicts import reverse_dict
 from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
 
 
 class CleaningDF(ABCLoggingMixin):
-    """Inherits from polars.DataFrame and ABCLoggingMixin.
-
-    This will be a base class for importing all kinds of Data to e.g. a database.
-    It will be used to import data from different sources an clean it
-    Bring the data into the correct format and name the columns correctly.
-    And the df takes over and does the rest, like cleaning the data, filling NAs, etc.
-
-    It is good practice to define col names as str constants in the child class.
-    E.g.
-    COL_NAME_1 = "col_name_1" so they can be reused and are easy to change.
-
-    This class defaults to nan_to_null=True when creating the dataframe for simplicity.
-
+    """A base class for cleaning and standardizing dataframes using Polars.
+
+    This class provides a comprehensive pipeline
+    for importing, cleaning, and standardizing
+    data from various sources before loading into databases or other systems.
+    It enforces data quality standards
+    through a series of configurable cleaning operations.
+
+    The cleaning pipeline executes in the following order:
+    1. Rename columns according to a standardized naming scheme
+    2. Drop columns not in the schema
+    3. Fill null values with specified defaults
+    4. Convert columns to correct data types and apply custom transformations
+    5. Drop rows where specified column subsets are entirely null
+    6. Handle duplicates by aggregating values and removing duplicates
+    7. Sort the dataframe by specified columns
+    8. Validate data quality
+       (correct dtypes, no nulls in required columns, no NaN values)
+
+    Child classes must implement abstract methods to define the cleaning configuration:
+    - get_rename_map(): Define column name mappings
+    - get_col_dtype_map(): Define expected data types for each column
+    - get_drop_null_subsets(): Define which column subsets trigger row deletion
+    - get_fill_null_map(): Define null value fill strategies
+    - get_sort_cols(): Define sort order
+    - get_unique_subsets(): Define duplicate detection criteria
+    - get_no_null_cols(): Define columns that cannot contain nulls
+    - get_col_converter_map(): Define custom column transformations
+    - get_add_on_duplicate_cols(): Define columns to aggregate when duplicates are found
+    - get_col_precision_map(): Define rounding precision for float columns
+
+    Best Practices:
+    - Define column names as string constants in child classes
+      for reusability and maintainability
+    - Use this class to build data cleaning pipelines that can be composed and extended
+    - The class automatically converts NaN to null for consistency
+
+    Example:
+        COL_NAME_1 = "col_name_1"
+        COL_NAME_2 = "col_name_2"
     """
 
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        """Initialize the CleaningDF."""
-        self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
-        self.clean()
-
     @classmethod
     @abstractmethod
     def get_rename_map(cls) -> dict[str, str]:
-        """Rename the columns.
+        """Define column name mappings for standardization.
 
-        This method must be implemented in the child class.
-        This will be done before any other cleaning operations.
-        Format: {new_name: old_name, ...}
-        ClenaingDF convention is to map the real col names to smth in all maps
+        This abstract method must be implemented in child classes to specify how
+        raw input column names should be renamed to standardized names. Renaming
+        is the first operation in the cleaning pipeline, executed before all other
+        cleaning operations.
+
+        The mapping format follows the CleaningDF convention of mapping standardized
+        names to raw input names, allowing the reverse mapping to be applied to the
+        dataframe.
 
         Returns:
-            dict[str, str]: Dictionary mapping old column names to new column names
-                Format: {new_name: old_name, ...}
+            dict[str, str]: Dictionary mapping standardized column names to raw input
+                column names. Format: {standardized_name: raw_name, ...}
+
+        Example:
+            return {
+                "user_id": "UserId",
+                "email": "Email_Address",
+                "created_at": "CreatedDate"
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
-        """Map the column names to the correct data type.
+        """Define the expected data type for each column in the cleaned dataframe.
 
-        This method must be implemented in the child class.
+        This abstract method must be implemented in child classes to specify the
+        target data types for all columns. The dataframe will be validated against
+        this schema after cleaning, and a TypeError will be raised if any column
+        has an incorrect type.
 
         Returns:
-            dict[str, type[pl.DataType]]: Dictionary mapping column names to their types
+            dict[str, type[pl.DataType]]: Dictionary mapping standardized column names
+                to their expected Polars data types.
+
+        Example:
+            return {
+                "user_id": pl.Int64,
+                "email": pl.Utf8,
+                "created_at": pl.Date,
+                "score": pl.Float64
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
-        """Drops rows where the subset of columns are all null.
+        """Define column subsets for dropping rows with all-null values.
 
-        Drops a row if all columns in the subset are null.
-        You can define several subsets to check.
-        Each returned tuple is one subset.
+        This abstract method specifies which column subsets should trigger row deletion.
+        A row is dropped if ALL columns in a subset are null. Multiple subsets can be
+        defined to apply different null-dropping rules. If no subsets are defined,
+        rows where all columns are null will be dropped.
 
         Returns:
-            tuple[tuple[str, ...], ...]: Tuple of tuples of column names
+            tuple[tuple[str, ...], ...]: Tuple of column name tuples, where each inner
+                tuple represents one subset. A row is dropped if all columns in any
+                subset are null.
+
+        Example:
+            return (
+                ("email", "phone"),  # Drop if both email and phone are null
+                ("address_line1",),  # Drop if address_line1 is null
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_fill_null_map(cls) -> dict[str, Any]:
-        """Fill null values with the specified value.
+        """Define null value fill strategies for each column.
 
-        This method must be implemented in the child class.
+        This abstract method specifies default values to fill null entries in each
+        column. This is applied early in the cleaning pipeline after column renaming.
 
         Returns:
-            dict[str, Any]: Dictionary mapping column names to their fill value
+            dict[str, Any]: Dictionary mapping column names to their fill values.
+                The fill value can be any type appropriate for the column.
+
+        Example:
+            return {
+                "email": "",
+                "phone": "",
+                "score": 0,
+                "status": "unknown"
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
-        """Sort the dataframe by the specified columns.
+        """Define the sort order for the cleaned dataframe.
 
-        This method must be implemented in the child class.
+        This abstract method specifies which columns to sort by and in what order
+        (ascending or descending). Sorting is applied near the end of the cleaning
+        pipeline, after all data transformations are complete.
 
         Returns:
-            tuple[tuple[str, bool], ...]: Tuple of tuples of column names and
-                how to sort, True for descending, False for ascending in polars
+            tuple[tuple[str, bool], ...]: Tuple of (column_name, is_descending) tuples.
+                Each tuple specifies a column and sort direction. Columns are sorted
+                in the order they appear. True = descending, False = ascending.
+
+        Example:
+            return (
+                ("created_at", True),  # Sort by created_at descending
+                ("user_id", False),    # Then by user_id ascending
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
-        """Drop duplicates based on the specified subsets.
+        """Define column subsets for duplicate detection and removal.
 
-        This method must be implemented in the child class.
-        E.g.
-        (
-        (("col1", "col2"), # subset 1
-        ("col3", "col4"), # subset 2
-        )
+        This abstract method specifies which column combinations define uniqueness.
+        Rows are considered duplicates if they have identical values in all columns
+        of a subset. When duplicates are found, values in columns specified by
+        get_add_on_duplicate_cols() are summed, and the first row is kept.
 
         Returns:
-            tuple[tuple[tuple[str, bool], ...], ...]: Tuple of tuples of column names
+            tuple[tuple[str, ...], ...]: Tuple of column name tuples, where each inner
+                tuple represents one uniqueness constraint. Duplicates are detected
+                and handled for each subset independently.
+
+        Example:
+            return (
+                ("user_id", "date"),  # Subset 1: unique by user_id and date
+                ("transaction_id",),  # Subset 2: unique by transaction_id
+            )
         """
 
     @classmethod
     @abstractmethod
     def get_no_null_cols(cls) -> tuple[str, ...]:
-        """Disallow null values in the specified columns.
+        """Define columns that must not contain null values.
 
-        This method must be implemented in the child class.
+        This abstract method specifies which columns are required to have non-null
+        values. A ValueError is raised during the final validation step if any of
+        these columns contain null values.
 
         Returns:
-            tuple[str, ...]: Tuple of column names
+            tuple[str, ...]: Tuple of column names that must not be null.
+
+        Example:
+            return ("user_id", "email", "created_at")
         """
 
     @classmethod
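
For orientation, here is a minimal sketch of what a concrete subclass could look like, based only on the abstract methods listed in the new class docstring. All column names and values are hypothetical, and the base class may impose requirements beyond what this hunk shows:

    from collections.abc import Callable
    from typing import Any

    import polars as pl

    from winipedia_utils.data.dataframe.cleaning import CleaningDF


    class SalesDF(CleaningDF):
        # Column names as string constants, per the docstring's best practices.
        USER_ID = "user_id"
        QUANTITY = "quantity"

        @classmethod
        def get_rename_map(cls) -> dict[str, str]:
            # Standardized name -> raw input name, per the documented convention.
            return {cls.USER_ID: "UserId", cls.QUANTITY: "Qty"}

        @classmethod
        def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
            return {cls.USER_ID: pl.Int64, cls.QUANTITY: pl.Float64}

        @classmethod
        def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
            return ((cls.USER_ID,),)

        @classmethod
        def get_fill_null_map(cls) -> dict[str, Any]:
            return {cls.USER_ID: 0, cls.QUANTITY: 0.0}

        @classmethod
        def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
            return ((cls.USER_ID, False),)  # False = ascending

        @classmethod
        def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
            return ((cls.USER_ID,),)

        @classmethod
        def get_no_null_cols(cls) -> tuple[str, ...]:
            return (cls.USER_ID,)

        @classmethod
        def get_col_converter_map(cls) -> dict[str, Callable[[pl.Series], pl.Series]]:
            # No custom conversion needed for either column.
            return {
                cls.USER_ID: cls.skip_col_converter,
                cls.QUANTITY: cls.skip_col_converter,
            }

        @classmethod
        def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
            return (cls.QUANTITY,)

        @classmethod
        def get_col_precision_map(cls) -> dict[str, int]:
            return {cls.QUANTITY: 2}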
@@ -131,49 +214,110 @@ class CleaningDF(ABCLoggingMixin):
     def get_col_converter_map(
         cls,
     ) -> dict[str, Callable[[pl.Series], pl.Series]]:
-        """Convert the column to the specified type.
+        """Define custom conversion functions for columns.
 
-        This method must be implemented in the child class.
-        It takes a polars series and returns a polars series.
-        Can be used to e.g. parse dates, or do a specific operation on a column.
+        This abstract method specifies custom transformations to apply to columns
+        after standard conversions (string stripping, float rounding). Each function
+        receives a Polars Series and returns a transformed Series. Use
+        skip_col_converter as a placeholder for columns that don't need custom
+        conversion.
 
         Returns:
             dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping column names
-                to their conversion function
+                to their conversion functions. Each function takes a Series and returns
+                a transformed Series.
+
+        Example:
+            return {
+                "email": lambda s: s.str.to_lowercase(),
+                "phone": self.parse_phone_number,
+                "created_at": self.skip_col_converter,  # No custom conversion
+            }
         """
 
     @classmethod
     @abstractmethod
     def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
-        """Adds the values of cols together when dupliactes of two rows are found.
+        """Define columns to aggregate when duplicate rows are found.
 
-        This method must be implemented in the child class.
-        duplicates are determined by the get_unique_subsets method.
+        This abstract method specifies which columns should have their values summed
+        when duplicate rows are detected (based on get_unique_subsets). The summed
+        values are kept in the first row, and duplicate rows are removed.
 
         Returns:
-            tuple[str, ...]: Tuple of column names
+            tuple[str, ...]: Tuple of column names whose values should be summed
+                when duplicates are found.
+
+        Example:
+            return ("quantity", "revenue", "impressions")
         """
 
     @classmethod
     @abstractmethod
     def get_col_precision_map(cls) -> dict[str, int]:
-        """Round the column to the specified precision.
+        """Define rounding precision for float columns.
 
-        This method must be implemented in the child class.
+        This abstract method specifies the number of decimal places to round float
+        columns to. Rounding is applied during the standard conversion phase and uses
+        Kahan summation to compensate for floating-point rounding errors.
 
         Returns:
-            dict[str, int]: Dictionary mapping column names to their precision
+            dict[str, int]: Dictionary mapping float column names to their precision
+                (number of decimal places).
+
+        Example:
+            return {
+                "price": 2,
+                "percentage": 4,
+                "score": 1,
+            }
+        """
+
+    def __init__(self, data: dict[str, list[Any]], **kwargs: Any) -> None:
+        """Initialize the CleaningDF and execute the cleaning pipeline.
+
+        Creates a Polars DataFrame with NaN values automatically converted to null,
+        then immediately executes the full cleaning pipeline.
+        nan_to_null is set to True to always
+        schema is set to the dtype map to always have the correct dtypes
+
+        Args:
+            data: Dictionary mapping column names to lists of values
+            **kwargs: Additional keyword arguments passed to pl.DataFrame constructor
         """
+        self.rename_cols(data)
+        self.drop_cols(data)
+        kwargs["nan_to_null"] = True
+        kwargs["schema"] = self.get_col_dtype_map()
+        self.df = pl.DataFrame(data=data, **kwargs)
+        self.clean()
 
     @classmethod
     def get_col_names(cls) -> tuple[str, ...]:
-        """Get the column names of the dataframe."""
+        """Get the standardized column names from the dtype map.
+
+        Returns the column names in the order they appear in get_col_dtype_map().
+
+        Returns:
+            tuple[str, ...]: Tuple of standardized column names.
+        """
         return tuple(cls.get_col_dtype_map().keys())
 
     def clean(self) -> None:
-        """Clean the dataframe."""
-        self.rename_cols()
-        self.drop_cols()
+        """Execute the complete data cleaning pipeline.
+
+        Applies all cleaning operations in the following order:
+        1. Rename columns to standardized names
+        2. Drop columns not in the schema
+        3. Fill null values with defaults
+        4. Convert columns to correct types and apply transformations
+        5. Drop rows with all-null column subsets
+        6. Handle duplicates by aggregating and removing
+        7. Sort the dataframe
+        8. Validate data quality
+
+        This method is automatically called during __init__.
+        """
         self.fill_nulls()
         self.convert_cols()
         self.drop_null_subsets()
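
Given a subclass like the SalesDF sketch above, construction runs the whole pipeline in one step. This hypothetical usage mirrors the new __init__, which mutates the input dict via rename_cols/drop_cols before building the DataFrame:

    raw = {
        "UserId": [2, 1, 1],
        "Qty": [5.0, 1.0, 2.0],
        "Ignored": ["x", "y", "z"],  # dropped: not in the dtype map
    }
    sales = SalesDF(raw)  # renames, drops, builds df, cleans, validates
    print(sales.df)       # user_id ascending; user 1's quantities summed to 3.0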
@@ -187,7 +331,18 @@
         map_func: Callable[..., dict[str, Any]],
         col_names: tuple[str, ...] | None = None,
     ) -> None:
-        """Raise a KeyError if the columns in the map are not in the dataframe."""
+        """Validate that all required columns are present in a configuration map.
+
+        Checks that the columns returned by map_func contain all columns in col_names.
+        Raises KeyError if any required columns are missing from the map.
+
+        Args:
+            map_func: A callable that returns a dict with column names as keys
+            col_names: Tuple of column names to check. If None, uses get_col_names()
+
+        Raises:
+            KeyError: If any required columns are missing from the map
+        """
         if col_names is None:
             col_names = cls.get_col_names()
         missing_cols = set(col_names) - set(map_func().keys())
@@ -195,17 +350,32 @@
             msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
             raise KeyError(msg)
 
-    def rename_cols(self) -> None:
-        """Rename the columns according to the rename map."""
+    def rename_cols(self, data: dict[str, list[Any]]) -> None:
+        """Rename columns from raw names to standardized names.
+
+        Applies the reverse of get_rename_map() to rename columns from their raw
+        input names to standardized names. Validates that all required columns are
+        present in the rename map.
+        """
         self.raise_on_missing_cols(self.get_rename_map)
-        self.df = self.df.rename(reverse_dict(self.get_rename_map()))
+        for std_name, raw_name in self.get_rename_map().items():
+            data[std_name] = data.pop(raw_name)
+
+    def drop_cols(self, data: dict[str, list[Any]]) -> None:
+        """Drop columns not in the schema.
 
-    def drop_cols(self) -> None:
-        """Drop columns that are not in the col_dtype_map."""
-        self.df = self.df.select(self.get_col_names())
+        Selects only the columns defined in get_col_names(), removing any extra
+        columns that may have been in the input data.
+        """
+        for col in set(data.keys()) - set(self.get_col_names()):
+            del data[col]
 
     def fill_nulls(self) -> None:
-        """Fill null values with the specified values from the fill null map."""
+        """Fill null values with defaults from the fill null map.
+
+        Replaces null values in each column with the corresponding fill value
+        from get_fill_null_map(). Validates that all columns are present in the map.
+        """
         self.raise_on_missing_cols(self.get_fill_null_map)
         self.df = self.df.with_columns(
             [
@@ -215,15 +385,22 @@
         )
 
     def convert_cols(self) -> None:
-        """Apply the conversion functions to the columns."""
+        """Apply standard and custom column conversions.
+
+        Orchestrates both standard conversions (string stripping, float rounding)
+        and custom conversions defined in get_col_converter_map(). Validates that
+        all columns are present in the converter map.
+        """
         self.raise_on_missing_cols(self.get_col_converter_map)
         self.standard_convert_cols()
         self.custom_convert_cols()
 
     def standard_convert_cols(self) -> None:
-        """Assumes some Data standards and converts cols accordingly.
+        """Apply standard conversions based on data type.
 
-        E.g. strips strings, rounds floats
+        Automatically applies standard transformations:
+        - Utf8 columns: strip leading/trailing whitespace
+        - Float64 columns: round to specified precision using Kahan summation
         """
         for col_name, dtype in self.get_col_dtype_map().items():
             if dtype == pl.Utf8:
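
A standalone illustration of the two standard conversions the new docstring names, using plain Polars expressions on hypothetical data (plain Series rounding here, without the Kahan compensation discussed further down):

    import polars as pl

    df = pl.DataFrame({"name": ["  Ada ", "Bob  "], "price": [1.005, 2.349]})
    df = df.with_columns(
        pl.col("name").str.strip_chars(),  # Utf8: strip whitespace
        pl.col("price").round(2),          # Float64: round to precision
    )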
@@ -237,7 +414,11 @@
                 )
 
     def custom_convert_cols(self) -> None:
-        """Apply the conversion functions to the columns."""
+        """Apply custom conversion functions to columns.
+
+        Applies custom transformations from get_col_converter_map() to each column,
+        skipping columns marked with skip_col_converter.
+        """
         self.df = self.df.with_columns(
             [
                 pl.col(col_name).map_batches(
@@ -250,12 +431,26 @@
 
     @classmethod
     def strip_col(cls, col: pl.Series) -> pl.Series:
-        """Strip the column of leading and trailing whitespace."""
+        """Remove leading and trailing whitespace from string column.
+
+        Args:
+            col: Polars Series of string type
+
+        Returns:
+            pl.Series: Series with whitespace stripped
+        """
         return col.str.strip_chars()
 
     @classmethod
     def lower_col(cls, col: pl.Series) -> pl.Series:
-        """Convert the column to lowercase."""
+        """Convert string column to lowercase.
+
+        Args:
+            col: Polars Series of string type
+
+        Returns:
+            pl.Series: Series with all characters converted to lowercase
+        """
         return col.str.to_lowercase()
 
     @classmethod
@@ -266,9 +461,19 @@
         *,
         compensate: bool = True,
     ) -> pl.Series:
-        """Round the column to the specified precision.
+        """Round float column to specified precision.
 
-        The precision is defined in the get_col_precision_map method.
+        Uses Kahan summation algorithm to compensate for floating-point rounding
+        errors when compensate=True, ensuring that the sum of rounded values
+        matches the rounded sum of original values.
+
+        Args:
+            col: Polars Series of float type
+            precision: Number of decimal places. If None, uses get_col_precision_map()
+            compensate: If True, use Kahan summation to reduce rounding errors
+
+        Returns:
+            pl.Series: Series with values rounded to specified precision
         """
         if precision is None:
             precision = cls.get_col_precision_map()[str(col.name)]
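
The compensation the docstring describes can be pictured as an error-feedback loop: each element's rounding error is carried into the next element, so the rounded column's total tracks the unrounded total. A minimal scalar sketch of the idea, not the library's vectorized implementation:

    def round_compensated(values: list[float], precision: int) -> list[float]:
        carry = 0.0  # accumulated rounding error, fed back into the next value
        out = []
        for v in values:
            adjusted = v + carry
            rounded = round(adjusted, precision)
            carry = adjusted - rounded
            out.append(rounded)
        return out

    # sum(round_compensated([0.004] * 200, 2)) stays near 0.8 (the true total),
    # whereas summing round(v, 2) for each v alone would give 0.0.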
@@ -288,9 +493,14 @@
 
     @classmethod
     def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
-        """Conversion is not needed for this column and will be skipped.
+        """Placeholder to skip custom conversion for a column.
+
+        Use this method in get_col_converter_map() to indicate that a column
+        should not have custom conversion applied. This method should never be
+        actually called - it's only used as a marker.
 
-        Function should not be invoked if col_name is in get_col_converter_map.
+        Raises:
+            NotImplementedError: Always raised if this method is called
         """
         msg = (
             "skip_col_converter is just a flag to skip conversion for a column "
@@ -299,9 +509,10 @@
         raise NotImplementedError(msg)
 
     def drop_null_subsets(self) -> None:
-        """Drop rows where the subset of columns are all null.
+        """Drop rows where all columns in a subset are null.
 
-        If no subsets are defined, drop all rows where all columns are null.
+        Applies null-dropping rules defined in get_drop_null_subsets(). If no
+        subsets are defined, drops rows where all columns are null.
         """
         subsets = self.get_drop_null_subsets()
         if not subsets:
@@ -311,12 +522,14 @@
             self.df = self.df.drop_nulls(subset=subset)
 
     def handle_duplicates(self) -> None:
-        """Drop duplicates based on the specified subsets.
+        """Remove duplicate rows and aggregate specified columns.
 
-        If add_on_duplicate_cols are defined, add the values of the cols together.
-        This func adds up the vals of the duplicates and keeps the first row.
-        E.g. if you have a df with two rows with the same subset
-        and value 1 and 2 in col1 the result will be 3 in col1 for the first row.
+        For each uniqueness subset defined in get_unique_subsets():
+        1. Sum values in columns specified by get_add_on_duplicate_cols()
+        2. Keep only the first row of each duplicate group
+
+        Example: If two rows have the same (user_id, date) and values 1 and 2
+        in the 'quantity' column, the result will have one row with quantity=3.
         """
         for subset in self.get_unique_subsets():
            for col in self.get_add_on_duplicate_cols():
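
The documented behavior maps naturally onto two Polars operations per subset. The hunk above confirms the unique(subset=..., keep="first") step; the per-column summing presumably uses a window aggregation along these lines, shown here on hypothetical data:

    import polars as pl

    df = pl.DataFrame({
        "user_id": [1, 1, 2],
        "date": ["2024-01-01", "2024-01-01", "2024-01-01"],
        "quantity": [1.0, 2.0, 5.0],
    })
    # Sum the aggregate column within each duplicate group...
    df = df.with_columns(pl.col("quantity").sum().over(["user_id", "date"]))
    # ...then keep one row per group: user 1 ends up with quantity 3.0.
    df = df.unique(subset=["user_id", "date"], keep="first")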
@@ -324,24 +537,40 @@
             self.df = self.df.unique(subset=subset, keep="first")
 
     def sort_cols(self) -> None:
-        """Sort the dataframe by the specified columns."""
+        """Sort the dataframe by columns and directions from get_sort_cols().
+
+        Applies multi-column sorting with per-column sort direction
+        (ascending/descending).
+        """
         cols, desc = zip(*self.get_sort_cols(), strict=True)
         if not cols:
             return
         self.df = self.df.sort(cols, descending=desc)
 
     def check(self) -> None:
-        """Check the data and some conditions.
+        """Validate data quality after cleaning.
+
+        Runs all validation checks:
+        - Correct data types for all columns
+        - No null values in required columns
+        - No NaN values in float columns
 
-        This method is called at the end of the clean method.
-        checks e.g. non null values in no_null_cols
+        Called automatically at the end of the clean() pipeline.
+
+        Raises:
+            TypeError: If any column has incorrect data type
+            ValueError: If required columns contain nulls or float columns contain NaN
         """
         self.check_correct_dtypes()
         self.check_no_null_cols()
         self.check_no_nan()
 
     def check_correct_dtypes(self) -> None:
-        """Check that all columns have the correct dtype."""
+        """Validate that all columns have their expected data types.
+
+        Raises:
+            TypeError: If any column's actual type doesn't match expected type
+        """
         schema = self.df.schema
         col_dtype_map = self.get_col_dtype_map()
         for col, dtype in col_dtype_map.items():
@@ -351,7 +580,11 @@
                 raise TypeError(msg)
 
     def check_no_null_cols(self) -> None:
-        """Check that there are no null values in the no null columns."""
+        """Validate that required columns contain no null values.
+
+        Raises:
+            ValueError: If any column in get_no_null_cols() contains null values
+        """
         no_null_cols = self.get_no_null_cols()
         # Use a single select to check all columns at once
         null_flags = self.df.select(
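
One plausible shape for the single-select check that the inline comment describes; the hunk truncates before the actual expressions, so the function name and details here are assumptions:

    import polars as pl

    def assert_no_nulls(df: pl.DataFrame, no_null_cols: tuple[str, ...]) -> None:
        # One select evaluates every column's null flag in a single pass.
        null_flags = df.select(
            [pl.col(c).is_null().any().alias(c) for c in no_null_cols]
        )
        offending = [c for c in no_null_cols if null_flags[0, c]]
        if offending:
            msg = f"Null values found in columns: {offending}"
            raise ValueError(msg)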
@@ -364,7 +597,11 @@
             raise ValueError(msg)
 
     def check_no_nan(self) -> None:
-        """Check that there are no nan values in the df."""
+        """Validate that float columns contain no NaN values.
+
+        Raises:
+            ValueError: If any float column contains NaN values
+        """
         float_cols = [
             col
             for col, dtype in self.get_col_dtype_map().items()
@@ -0,0 +1 @@
+"""__init__ module."""