winipedia-utils 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of winipedia-utils might be problematic. Click here for more details.

Files changed (90) hide show
  1. winipedia_utils/concurrent/concurrent.py +245 -245
  2. winipedia_utils/concurrent/multiprocessing.py +130 -130
  3. winipedia_utils/concurrent/multithreading.py +93 -93
  4. winipedia_utils/consts.py +21 -23
  5. winipedia_utils/data/__init__.py +1 -1
  6. winipedia_utils/data/dataframe/__init__.py +1 -1
  7. winipedia_utils/data/dataframe/cleaning.py +378 -378
  8. winipedia_utils/data/structures/__init__.py +1 -1
  9. winipedia_utils/data/structures/dicts.py +16 -16
  10. winipedia_utils/git/__init__.py +1 -1
  11. winipedia_utils/git/gitignore/__init__.py +1 -1
  12. winipedia_utils/git/gitignore/gitignore.py +136 -136
  13. winipedia_utils/git/pre_commit/__init__.py +1 -1
  14. winipedia_utils/git/pre_commit/config.py +70 -70
  15. winipedia_utils/git/pre_commit/hooks.py +109 -109
  16. winipedia_utils/git/pre_commit/run_hooks.py +49 -49
  17. winipedia_utils/iterating/__init__.py +1 -1
  18. winipedia_utils/iterating/iterate.py +29 -29
  19. winipedia_utils/logging/ansi.py +6 -6
  20. winipedia_utils/logging/config.py +64 -64
  21. winipedia_utils/logging/logger.py +26 -26
  22. winipedia_utils/modules/class_.py +119 -119
  23. winipedia_utils/modules/function.py +101 -101
  24. winipedia_utils/modules/module.py +379 -379
  25. winipedia_utils/modules/package.py +390 -390
  26. winipedia_utils/oop/mixins/meta.py +333 -333
  27. winipedia_utils/oop/mixins/mixin.py +37 -37
  28. winipedia_utils/os/__init__.py +1 -1
  29. winipedia_utils/os/os.py +63 -63
  30. winipedia_utils/projects/__init__.py +1 -1
  31. winipedia_utils/projects/poetry/__init__.py +1 -1
  32. winipedia_utils/projects/poetry/config.py +91 -91
  33. winipedia_utils/projects/poetry/poetry.py +31 -31
  34. winipedia_utils/projects/project.py +48 -48
  35. winipedia_utils/resources/__init__.py +1 -1
  36. winipedia_utils/resources/svgs/__init__.py +1 -1
  37. winipedia_utils/resources/svgs/download_arrow.svg +2 -2
  38. winipedia_utils/resources/svgs/exit_fullscreen_icon.svg +5 -5
  39. winipedia_utils/resources/svgs/fullscreen_icon.svg +2 -2
  40. winipedia_utils/resources/svgs/menu_icon.svg +3 -3
  41. winipedia_utils/resources/svgs/pause_icon.svg +3 -3
  42. winipedia_utils/resources/svgs/play_icon.svg +16 -16
  43. winipedia_utils/resources/svgs/plus_icon.svg +23 -23
  44. winipedia_utils/resources/svgs/svg.py +15 -15
  45. winipedia_utils/security/__init__.py +1 -1
  46. winipedia_utils/security/cryptography.py +29 -29
  47. winipedia_utils/security/keyring.py +70 -70
  48. winipedia_utils/setup.py +47 -47
  49. winipedia_utils/testing/assertions.py +23 -23
  50. winipedia_utils/testing/convention.py +177 -177
  51. winipedia_utils/testing/create_tests.py +291 -291
  52. winipedia_utils/testing/fixtures.py +28 -28
  53. winipedia_utils/testing/tests/base/fixtures/__init__.py +1 -1
  54. winipedia_utils/testing/tests/base/fixtures/fixture.py +6 -6
  55. winipedia_utils/testing/tests/base/fixtures/scopes/class_.py +33 -33
  56. winipedia_utils/testing/tests/base/fixtures/scopes/function.py +7 -7
  57. winipedia_utils/testing/tests/base/fixtures/scopes/module.py +31 -31
  58. winipedia_utils/testing/tests/base/fixtures/scopes/package.py +7 -7
  59. winipedia_utils/testing/tests/base/fixtures/scopes/session.py +312 -312
  60. winipedia_utils/testing/tests/base/utils/utils.py +82 -82
  61. winipedia_utils/testing/tests/conftest.py +32 -32
  62. winipedia_utils/text/string.py +126 -126
  63. {winipedia_utils-0.2.0.dist-info → winipedia_utils-0.2.1.dist-info}/METADATA +1 -4
  64. winipedia_utils-0.2.1.dist-info/RECORD +80 -0
  65. {winipedia_utils-0.2.0.dist-info → winipedia_utils-0.2.1.dist-info}/licenses/LICENSE +21 -21
  66. winipedia_utils/django/__init__.py +0 -24
  67. winipedia_utils/django/bulk.py +0 -538
  68. winipedia_utils/django/command.py +0 -334
  69. winipedia_utils/django/database.py +0 -289
  70. winipedia_utils/pyside/__init__.py +0 -1
  71. winipedia_utils/pyside/core/__init__.py +0 -1
  72. winipedia_utils/pyside/core/py_qiodevice.py +0 -476
  73. winipedia_utils/pyside/ui/__init__.py +0 -1
  74. winipedia_utils/pyside/ui/base/__init__.py +0 -1
  75. winipedia_utils/pyside/ui/base/base.py +0 -180
  76. winipedia_utils/pyside/ui/pages/__init__.py +0 -1
  77. winipedia_utils/pyside/ui/pages/base/__init__.py +0 -1
  78. winipedia_utils/pyside/ui/pages/base/base.py +0 -92
  79. winipedia_utils/pyside/ui/pages/browser.py +0 -26
  80. winipedia_utils/pyside/ui/pages/player.py +0 -85
  81. winipedia_utils/pyside/ui/widgets/__init__.py +0 -1
  82. winipedia_utils/pyside/ui/widgets/browser.py +0 -243
  83. winipedia_utils/pyside/ui/widgets/clickable_widget.py +0 -57
  84. winipedia_utils/pyside/ui/widgets/media_player.py +0 -430
  85. winipedia_utils/pyside/ui/widgets/notification.py +0 -78
  86. winipedia_utils/pyside/ui/windows/__init__.py +0 -1
  87. winipedia_utils/pyside/ui/windows/base/__init__.py +0 -1
  88. winipedia_utils/pyside/ui/windows/base/base.py +0 -49
  89. winipedia_utils-0.2.0.dist-info/RECORD +0 -103
  90. {winipedia_utils-0.2.0.dist-info → winipedia_utils-0.2.1.dist-info}/WHEEL +0 -0
@@ -1,378 +1,378 @@
1
- """A Cleaning DF class that streamlines common cleaning operations on dataframes.
2
-
3
- This is useful to build Pipelines and when extending the class you can add your own
4
- cleaning operations.
5
- This module uses polars for dataframe operations and assumes some standards on the data
6
- """
7
-
8
- from abc import abstractmethod
9
- from collections.abc import Callable
10
- from typing import Any
11
-
12
- import polars as pl
13
- from polars.datatypes.classes import FloatType
14
-
15
- from winipedia_utils.data.structures.dicts import reverse_dict
16
- from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
17
-
18
-
19
class CleaningDF(ABCLoggingMixin):
    """Base class that streamlines common cleaning operations on dataframes.

    The class wraps a polars DataFrame (available as ``self.df``) and runs a
    fixed cleaning pipeline on construction. It is meant as a base class for
    importing all kinds of data to e.g. a database: import data from different
    sources and clean it, bring the data into the correct format and name the
    columns correctly. The df takes over and does the rest, like cleaning the
    data, filling NAs, etc.

    It is good practice to define col names as str constants in the child class.
    E.g. ``COL_NAME_1 = "col_name_1"`` so they can be reused and are easy to
    change.

    This class defaults to nan_to_null=True when creating the dataframe for
    simplicity.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the CleaningDF and immediately run the cleaning pipeline.

        Args:
            *args: Positional arguments forwarded to ``pl.DataFrame``.
            **kwargs: Keyword arguments forwarded to ``pl.DataFrame``.
        """
        # nan_to_null=True so float NaNs are treated as missing values
        # consistently throughout the pipeline.
        self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
        self.clean()

    @classmethod
    @abstractmethod
    def get_rename_map(cls) -> dict[str, str]:
        """Map new column names to the original (source) column names.

        This method must be implemented in the child class.
        Renaming is done before any other cleaning operations.
        Format: {new_name: old_name, ...}
        CleaningDF convention is to map the real col names to smth in all maps.

        Returns:
            dict[str, str]: Dictionary in the format {new_name: old_name, ...}
        """

    @classmethod
    @abstractmethod
    def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
        """Map the column names to the correct data type.

        This method must be implemented in the child class.

        Returns:
            dict[str, type[pl.DataType]]: Dictionary mapping column names to
                their polars dtype
        """

    @classmethod
    @abstractmethod
    def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define subsets of columns used to drop all-null rows.

        A row is dropped if all columns in a subset are null.
        You can define several subsets to check.
        Each returned tuple is one subset.

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names
        """

    @classmethod
    @abstractmethod
    def get_fill_null_map(cls) -> dict[str, Any]:
        """Map column names to the value used to fill their null values.

        This method must be implemented in the child class.

        Returns:
            dict[str, Any]: Dictionary mapping column names to their fill value
        """

    @classmethod
    @abstractmethod
    def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
        """Define the columns (and direction) to sort the dataframe by.

        This method must be implemented in the child class.

        Returns:
            tuple[tuple[str, bool], ...]: Tuple of (column name, descending)
                pairs; True sorts descending, False ascending (polars semantics)
        """

    @classmethod
    @abstractmethod
    def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define the column subsets used to drop duplicate rows.

        This method must be implemented in the child class.
        E.g.
        (
            ("col1", "col2"),  # subset 1
            ("col3", "col4"),  # subset 2
        )

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names
        """

    @classmethod
    @abstractmethod
    def get_no_null_cols(cls) -> tuple[str, ...]:
        """Disallow null values in the specified columns.

        This method must be implemented in the child class.

        Returns:
            tuple[str, ...]: Tuple of column names
        """

    @classmethod
    @abstractmethod
    def get_col_converter_map(
        cls,
    ) -> dict[str, Callable[[pl.Series], pl.Series]]:
        """Map column names to custom conversion functions.

        This method must be implemented in the child class.
        Each converter takes a polars series and returns a polars series.
        Can be used to e.g. parse dates, or do a specific operation on a column.

        Returns:
            dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping
                column names to their conversion function
        """

    @classmethod
    @abstractmethod
    def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
        """Name the cols whose values are summed when duplicate rows are found.

        This method must be implemented in the child class.
        Duplicates are determined by the get_unique_subsets method.

        Returns:
            tuple[str, ...]: Tuple of column names
        """

    @classmethod
    @abstractmethod
    def get_col_precision_map(cls) -> dict[str, int]:
        """Map column names to the precision they are rounded to.

        This method must be implemented in the child class.

        Returns:
            dict[str, int]: Dictionary mapping column names to their precision
        """

    @classmethod
    def get_col_names(cls) -> tuple[str, ...]:
        """Get the column names of the dataframe."""
        return tuple(cls.get_col_dtype_map().keys())

    def clean(self) -> None:
        """Run the full cleaning pipeline in a fixed order."""
        self.rename_cols()
        self.drop_cols()
        self.fill_nulls()
        self.convert_cols()
        self.drop_null_subsets()
        self.handle_duplicates()
        self.sort_cols()
        self.check()

    @classmethod
    def raise_on_missing_cols(
        cls,
        map_func: Callable[..., dict[str, Any]],
        col_names: tuple[str, ...] | None = None,
    ) -> None:
        """Raise a KeyError if any expected column is missing from a map.

        Args:
            map_func: Zero-argument callable returning a column-keyed dict.
            col_names: Columns that must be present; defaults to all columns
                from get_col_names().

        Raises:
            KeyError: If any of col_names is not a key of map_func().
        """
        if col_names is None:
            col_names = cls.get_col_names()
        missing_cols = set(col_names) - set(map_func().keys())
        if missing_cols:
            msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
            raise KeyError(msg)

    def rename_cols(self) -> None:
        """Rename the columns according to the rename map."""
        self.raise_on_missing_cols(self.get_rename_map)
        # get_rename_map is {new: old}; polars rename expects {old: new}.
        self.df = self.df.rename(reverse_dict(self.get_rename_map()))

    def drop_cols(self) -> None:
        """Drop columns that are not in the col_dtype_map."""
        self.df = self.df.select(self.get_col_names())

    def fill_nulls(self) -> None:
        """Fill null values with the specified values from the fill null map."""
        self.raise_on_missing_cols(self.get_fill_null_map)
        self.df = self.df.with_columns(
            [
                pl.col(col_name).fill_null(fill_value)
                for col_name, fill_value in self.get_fill_null_map().items()
            ]
        )

    def convert_cols(self) -> None:
        """Apply standard and then custom conversion functions to the columns."""
        self.raise_on_missing_cols(self.get_col_converter_map)
        self.standard_convert_cols()
        self.custom_convert_cols()

    def standard_convert_cols(self) -> None:
        """Assumes some data standards and converts cols accordingly.

        E.g. strips strings, rounds floats.
        """
        for col_name, dtype in self.get_col_dtype_map().items():
            if dtype == pl.Utf8:
                converter = self.strip_col
            elif dtype == pl.Float64:
                converter = self.round_col
            else:
                continue
            self.df = self.df.with_columns(
                pl.col(col_name).map_batches(converter, return_dtype=dtype)
            )

    def custom_convert_cols(self) -> None:
        """Apply the custom conversion functions to the columns."""
        # Hoist the maps out of the comprehension so they are built only once
        # instead of once per converted column.
        dtype_map = self.get_col_dtype_map()
        skip_name = self.skip_col_converter.__name__
        self.df = self.df.with_columns(
            [
                pl.col(col_name).map_batches(
                    converter, return_dtype=dtype_map[col_name]
                )
                for col_name, converter in self.get_col_converter_map().items()
                if converter.__name__ != skip_name
            ]
        )

    @classmethod
    def strip_col(cls, col: pl.Series) -> pl.Series:
        """Strip the column of leading and trailing whitespace."""
        return col.str.strip_chars()

    @classmethod
    def lower_col(cls, col: pl.Series) -> pl.Series:
        """Convert the column to lowercase."""
        return col.str.to_lowercase()

    @classmethod
    def round_col(
        cls,
        col: pl.Series,
        precision: int | None = None,
        *,
        compensate: bool = True,
    ) -> pl.Series:
        """Round the column to the specified precision.

        The precision defaults to the one defined in get_col_precision_map.
        With compensate=True the rounding error of each value is carried over
        to the next value (Kahan-style) so the column total stays stable.
        """
        if precision is None:
            precision = cls.get_col_precision_map()[str(col.name)]
        if not compensate:
            return col.round(precision)

        # compensate for rounding errors with a Kahan-style running error
        error = 0.0
        values: list[float | None] = []
        for value in col.to_list():  # iterate over Python floats (or None)
            if value is None:
                # preserve nulls; they carry no rounding error
                values.append(None)
                continue
            corrected = value + error
            rounded = round(corrected, precision)
            error = corrected - rounded
            values.append(rounded)

        return pl.Series(name=col.name, values=values, dtype=col.dtype)

    @classmethod
    def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
        """Conversion is not needed for this column and will be skipped.

        Function should not be invoked if col_name is in get_col_converter_map.

        Raises:
            NotImplementedError: Always; this method is only a sentinel flag.
        """
        msg = (
            "skip_col_converter is just a flag to skip conversion for a column "
            "and should not be actually called."
        )
        raise NotImplementedError(msg)

    def drop_null_subsets(self) -> None:
        """Drop rows where the subset of columns are all null.

        If no subsets are defined, drop all rows where all columns are null.
        """
        subsets = self.get_drop_null_subsets()
        if not subsets:
            subsets = (tuple(self.df.columns),)
        for subset in subsets:
            # drop_nulls(subset) would drop rows with ANY null in the subset;
            # the documented contract is to drop only rows where ALL subset
            # columns are null, so filter on all_horizontal instead.
            self.df = self.df.filter(
                ~pl.all_horizontal([pl.col(c).is_null() for c in subset])
            )

    def handle_duplicates(self) -> None:
        """Drop duplicates based on the specified subsets.

        If add_on_duplicate_cols are defined, the values of those cols are
        summed across the duplicates and the first row is kept.
        E.g. if you have a df with two rows with the same subset
        and value 1 and 2 in col1 the result will be 3 in col1 for the
        surviving row.
        """
        for subset in self.get_unique_subsets():
            for col in self.get_add_on_duplicate_cols():
                self.df = self.df.with_columns(pl.col(col).sum().over(subset))
            self.df = self.df.unique(subset=subset, keep="first")

    def sort_cols(self) -> None:
        """Sort the dataframe by the specified columns."""
        sort_cols = self.get_sort_cols()
        if not sort_cols:
            # Guard before unpacking: zip(*()) would raise a ValueError.
            return
        cols, descending = zip(*sort_cols, strict=True)
        self.df = self.df.sort(cols, descending=descending)

    def check(self) -> None:
        """Check the data and some conditions.

        This method is called at the end of the clean method.
        Checks e.g. non null values in no_null_cols.
        """
        self.check_correct_dtypes()
        self.check_no_null_cols()
        self.check_no_nan()

    def check_correct_dtypes(self) -> None:
        """Check that all columns have the correct dtype.

        Raises:
            TypeError: If a column's dtype differs from get_col_dtype_map.
        """
        schema = self.df.schema
        col_dtype_map = self.get_col_dtype_map()
        for col, dtype in col_dtype_map.items():
            schema_dtype = schema[col]
            if schema_dtype != dtype:
                msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
                raise TypeError(msg)

    def check_no_null_cols(self) -> None:
        """Check that there are no null values in the no null columns.

        Raises:
            ValueError: If a no-null column contains a null value.
        """
        no_null_cols = self.get_no_null_cols()
        # Use a single select to check all columns at once
        null_flags = self.df.select(
            [pl.col(col).is_null().any() for col in no_null_cols]
        )
        # Iterate over columns and check if any have nulls
        for col in no_null_cols:
            if null_flags[col].item():
                msg = f"Null values found in column: {col}"
                raise ValueError(msg)

    def check_no_nan(self) -> None:
        """Check that there are no nan values in the df.

        Raises:
            ValueError: If any float column contains a NaN value.
        """
        float_cols = [
            col
            for col, dtype in self.get_col_dtype_map().items()
            if issubclass(dtype, FloatType)
        ]
        if not float_cols:
            # any_horizontal over zero expressions is invalid; nothing to check
            return
        has_nan = self.df.select(
            pl.any_horizontal(pl.col(float_cols).is_nan().any())
        ).item()
        if has_nan:
            msg = "NaN values found in the dataframe"
            raise ValueError(msg)
1
+ """A Cleaning DF class that streamlines common cleaning operations on dataframes.
2
+
3
+ This is useful to build Pipelines and when extending the class you can add your own
4
+ cleaning operations.
5
+ This module uses polars for dataframe operations and assumes some standards on the data
6
+ """
7
+
8
+ from abc import abstractmethod
9
+ from collections.abc import Callable
10
+ from typing import Any
11
+
12
+ import polars as pl
13
+ from polars.datatypes.classes import FloatType
14
+
15
+ from winipedia_utils.data.structures.dicts import reverse_dict
16
+ from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
17
+
18
+
19
class CleaningDF(ABCLoggingMixin):
    """Base class that streamlines common cleaning operations on dataframes.

    The class wraps a polars DataFrame (available as ``self.df``) and runs a
    fixed cleaning pipeline on construction. It is meant as a base class for
    importing all kinds of data to e.g. a database: import data from different
    sources and clean it, bring the data into the correct format and name the
    columns correctly. The df takes over and does the rest, like cleaning the
    data, filling NAs, etc.

    It is good practice to define col names as str constants in the child class.
    E.g. ``COL_NAME_1 = "col_name_1"`` so they can be reused and are easy to
    change.

    This class defaults to nan_to_null=True when creating the dataframe for
    simplicity.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the CleaningDF and immediately run the cleaning pipeline.

        Args:
            *args: Positional arguments forwarded to ``pl.DataFrame``.
            **kwargs: Keyword arguments forwarded to ``pl.DataFrame``.
        """
        # nan_to_null=True so float NaNs are treated as missing values
        # consistently throughout the pipeline.
        self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
        self.clean()

    @classmethod
    @abstractmethod
    def get_rename_map(cls) -> dict[str, str]:
        """Map new column names to the original (source) column names.

        This method must be implemented in the child class.
        Renaming is done before any other cleaning operations.
        Format: {new_name: old_name, ...}
        CleaningDF convention is to map the real col names to smth in all maps.

        Returns:
            dict[str, str]: Dictionary in the format {new_name: old_name, ...}
        """

    @classmethod
    @abstractmethod
    def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
        """Map the column names to the correct data type.

        This method must be implemented in the child class.

        Returns:
            dict[str, type[pl.DataType]]: Dictionary mapping column names to
                their polars dtype
        """

    @classmethod
    @abstractmethod
    def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define subsets of columns used to drop all-null rows.

        A row is dropped if all columns in a subset are null.
        You can define several subsets to check.
        Each returned tuple is one subset.

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names
        """

    @classmethod
    @abstractmethod
    def get_fill_null_map(cls) -> dict[str, Any]:
        """Map column names to the value used to fill their null values.

        This method must be implemented in the child class.

        Returns:
            dict[str, Any]: Dictionary mapping column names to their fill value
        """

    @classmethod
    @abstractmethod
    def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
        """Define the columns (and direction) to sort the dataframe by.

        This method must be implemented in the child class.

        Returns:
            tuple[tuple[str, bool], ...]: Tuple of (column name, descending)
                pairs; True sorts descending, False ascending (polars semantics)
        """

    @classmethod
    @abstractmethod
    def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define the column subsets used to drop duplicate rows.

        This method must be implemented in the child class.
        E.g.
        (
            ("col1", "col2"),  # subset 1
            ("col3", "col4"),  # subset 2
        )

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names
        """

    @classmethod
    @abstractmethod
    def get_no_null_cols(cls) -> tuple[str, ...]:
        """Disallow null values in the specified columns.

        This method must be implemented in the child class.

        Returns:
            tuple[str, ...]: Tuple of column names
        """

    @classmethod
    @abstractmethod
    def get_col_converter_map(
        cls,
    ) -> dict[str, Callable[[pl.Series], pl.Series]]:
        """Map column names to custom conversion functions.

        This method must be implemented in the child class.
        Each converter takes a polars series and returns a polars series.
        Can be used to e.g. parse dates, or do a specific operation on a column.

        Returns:
            dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping
                column names to their conversion function
        """

    @classmethod
    @abstractmethod
    def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
        """Name the cols whose values are summed when duplicate rows are found.

        This method must be implemented in the child class.
        Duplicates are determined by the get_unique_subsets method.

        Returns:
            tuple[str, ...]: Tuple of column names
        """

    @classmethod
    @abstractmethod
    def get_col_precision_map(cls) -> dict[str, int]:
        """Map column names to the precision they are rounded to.

        This method must be implemented in the child class.

        Returns:
            dict[str, int]: Dictionary mapping column names to their precision
        """

    @classmethod
    def get_col_names(cls) -> tuple[str, ...]:
        """Get the column names of the dataframe."""
        return tuple(cls.get_col_dtype_map().keys())

    def clean(self) -> None:
        """Run the full cleaning pipeline in a fixed order."""
        self.rename_cols()
        self.drop_cols()
        self.fill_nulls()
        self.convert_cols()
        self.drop_null_subsets()
        self.handle_duplicates()
        self.sort_cols()
        self.check()

    @classmethod
    def raise_on_missing_cols(
        cls,
        map_func: Callable[..., dict[str, Any]],
        col_names: tuple[str, ...] | None = None,
    ) -> None:
        """Raise a KeyError if any expected column is missing from a map.

        Args:
            map_func: Zero-argument callable returning a column-keyed dict.
            col_names: Columns that must be present; defaults to all columns
                from get_col_names().

        Raises:
            KeyError: If any of col_names is not a key of map_func().
        """
        if col_names is None:
            col_names = cls.get_col_names()
        missing_cols = set(col_names) - set(map_func().keys())
        if missing_cols:
            msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
            raise KeyError(msg)

    def rename_cols(self) -> None:
        """Rename the columns according to the rename map."""
        self.raise_on_missing_cols(self.get_rename_map)
        # get_rename_map is {new: old}; polars rename expects {old: new}.
        self.df = self.df.rename(reverse_dict(self.get_rename_map()))

    def drop_cols(self) -> None:
        """Drop columns that are not in the col_dtype_map."""
        self.df = self.df.select(self.get_col_names())

    def fill_nulls(self) -> None:
        """Fill null values with the specified values from the fill null map."""
        self.raise_on_missing_cols(self.get_fill_null_map)
        self.df = self.df.with_columns(
            [
                pl.col(col_name).fill_null(fill_value)
                for col_name, fill_value in self.get_fill_null_map().items()
            ]
        )

    def convert_cols(self) -> None:
        """Apply standard and then custom conversion functions to the columns."""
        self.raise_on_missing_cols(self.get_col_converter_map)
        self.standard_convert_cols()
        self.custom_convert_cols()

    def standard_convert_cols(self) -> None:
        """Assumes some data standards and converts cols accordingly.

        E.g. strips strings, rounds floats.
        """
        for col_name, dtype in self.get_col_dtype_map().items():
            if dtype == pl.Utf8:
                converter = self.strip_col
            elif dtype == pl.Float64:
                converter = self.round_col
            else:
                continue
            self.df = self.df.with_columns(
                pl.col(col_name).map_batches(converter, return_dtype=dtype)
            )

    def custom_convert_cols(self) -> None:
        """Apply the custom conversion functions to the columns."""
        # Hoist the maps out of the comprehension so they are built only once
        # instead of once per converted column.
        dtype_map = self.get_col_dtype_map()
        skip_name = self.skip_col_converter.__name__
        self.df = self.df.with_columns(
            [
                pl.col(col_name).map_batches(
                    converter, return_dtype=dtype_map[col_name]
                )
                for col_name, converter in self.get_col_converter_map().items()
                if converter.__name__ != skip_name
            ]
        )

    @classmethod
    def strip_col(cls, col: pl.Series) -> pl.Series:
        """Strip the column of leading and trailing whitespace."""
        return col.str.strip_chars()

    @classmethod
    def lower_col(cls, col: pl.Series) -> pl.Series:
        """Convert the column to lowercase."""
        return col.str.to_lowercase()

    @classmethod
    def round_col(
        cls,
        col: pl.Series,
        precision: int | None = None,
        *,
        compensate: bool = True,
    ) -> pl.Series:
        """Round the column to the specified precision.

        The precision defaults to the one defined in get_col_precision_map.
        With compensate=True the rounding error of each value is carried over
        to the next value (Kahan-style) so the column total stays stable.
        """
        if precision is None:
            precision = cls.get_col_precision_map()[str(col.name)]
        if not compensate:
            return col.round(precision)

        # compensate for rounding errors with a Kahan-style running error
        error = 0.0
        values: list[float | None] = []
        for value in col.to_list():  # iterate over Python floats (or None)
            if value is None:
                # preserve nulls; they carry no rounding error
                values.append(None)
                continue
            corrected = value + error
            rounded = round(corrected, precision)
            error = corrected - rounded
            values.append(rounded)

        return pl.Series(name=col.name, values=values, dtype=col.dtype)

    @classmethod
    def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
        """Conversion is not needed for this column and will be skipped.

        Function should not be invoked if col_name is in get_col_converter_map.

        Raises:
            NotImplementedError: Always; this method is only a sentinel flag.
        """
        msg = (
            "skip_col_converter is just a flag to skip conversion for a column "
            "and should not be actually called."
        )
        raise NotImplementedError(msg)

    def drop_null_subsets(self) -> None:
        """Drop rows where the subset of columns are all null.

        If no subsets are defined, drop all rows where all columns are null.
        """
        subsets = self.get_drop_null_subsets()
        if not subsets:
            subsets = (tuple(self.df.columns),)
        for subset in subsets:
            # drop_nulls(subset) would drop rows with ANY null in the subset;
            # the documented contract is to drop only rows where ALL subset
            # columns are null, so filter on all_horizontal instead.
            self.df = self.df.filter(
                ~pl.all_horizontal([pl.col(c).is_null() for c in subset])
            )

    def handle_duplicates(self) -> None:
        """Drop duplicates based on the specified subsets.

        If add_on_duplicate_cols are defined, the values of those cols are
        summed across the duplicates and the first row is kept.
        E.g. if you have a df with two rows with the same subset
        and value 1 and 2 in col1 the result will be 3 in col1 for the
        surviving row.
        """
        for subset in self.get_unique_subsets():
            for col in self.get_add_on_duplicate_cols():
                self.df = self.df.with_columns(pl.col(col).sum().over(subset))
            self.df = self.df.unique(subset=subset, keep="first")

    def sort_cols(self) -> None:
        """Sort the dataframe by the specified columns."""
        sort_cols = self.get_sort_cols()
        if not sort_cols:
            # Guard before unpacking: zip(*()) would raise a ValueError.
            return
        cols, descending = zip(*sort_cols, strict=True)
        self.df = self.df.sort(cols, descending=descending)

    def check(self) -> None:
        """Check the data and some conditions.

        This method is called at the end of the clean method.
        Checks e.g. non null values in no_null_cols.
        """
        self.check_correct_dtypes()
        self.check_no_null_cols()
        self.check_no_nan()

    def check_correct_dtypes(self) -> None:
        """Check that all columns have the correct dtype.

        Raises:
            TypeError: If a column's dtype differs from get_col_dtype_map.
        """
        schema = self.df.schema
        col_dtype_map = self.get_col_dtype_map()
        for col, dtype in col_dtype_map.items():
            schema_dtype = schema[col]
            if schema_dtype != dtype:
                msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
                raise TypeError(msg)

    def check_no_null_cols(self) -> None:
        """Check that there are no null values in the no null columns.

        Raises:
            ValueError: If a no-null column contains a null value.
        """
        no_null_cols = self.get_no_null_cols()
        # Use a single select to check all columns at once
        null_flags = self.df.select(
            [pl.col(col).is_null().any() for col in no_null_cols]
        )
        # Iterate over columns and check if any have nulls
        for col in no_null_cols:
            if null_flags[col].item():
                msg = f"Null values found in column: {col}"
                raise ValueError(msg)

    def check_no_nan(self) -> None:
        """Check that there are no nan values in the df.

        Raises:
            ValueError: If any float column contains a NaN value.
        """
        float_cols = [
            col
            for col, dtype in self.get_col_dtype_map().items()
            if issubclass(dtype, FloatType)
        ]
        if not float_cols:
            # any_horizontal over zero expressions is invalid; nothing to check
            return
        has_nan = self.df.select(
            pl.any_horizontal(pl.col(float_cols).is_nan().any())
        ).item()
        if has_nan:
            msg = "NaN values found in the dataframe"
            raise ValueError(msg)