dragon-ml-toolbox 9.2.0__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 9.2.0
+ Version: 10.0.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,6 +1,7 @@
- dragon_ml_toolbox-9.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-9.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
- ml_tools/ETL_engineering.py,sha256=xagI_Gaxt9nHz8XfEPuQlOZAhr2PMV8MILQ2IDPx-KM,46718
+ dragon_ml_toolbox-10.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-10.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ ml_tools/ETL_cleaning.py,sha256=NJj1Iw-94D9MQvSkX1ce7wPbNM5b_1-NUMffZfod7VI,14957
+ ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
  ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
@@ -28,8 +29,8 @@ ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,1400
  ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
  ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
- ml_tools/utilities.py,sha256=zzfYR7SUSb2rZILTNoCjl_pfLlPdHf4263atXuEb3iE,19341
- dragon_ml_toolbox-9.2.0.dist-info/METADATA,sha256=ZMonhfZYz7qRdUjTsB_K-M4oAAHB55YaWOrWcuKUWrw,6941
- dragon_ml_toolbox-9.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-9.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-9.2.0.dist-info/RECORD,,
+ ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
+ dragon_ml_toolbox-10.0.0.dist-info/METADATA,sha256=QvDD6uzokGUUKjj8s5wziNLu6QLGldCVSsTm1qc8-7w,6942
+ dragon_ml_toolbox-10.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-10.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-10.0.0.dist-info/RECORD,,
@@ -0,0 +1,372 @@
+ import polars as pl
+ import pandas as pd
+ import re
+ from pathlib import Path
+ from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+ from .path_manager import sanitize_filename, make_fullpath
+ from .utilities import save_dataframe, load_dataframe
+ from ._script_info import _script_info
+ from ._logger import _LOGGER
+
+
+ __all__ = [
+     "save_unique_values",
+     "basic_clean",
+     "ColumnCleaner",
+     "DataFrameCleaner"
+ ]
+
+
+ ################ Unique Values per column #################
+ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+     """
+     Loads a CSV file, then analyzes it and saves the unique non-null values
+     from each column into a separate text file exactly as they appear.
+
+     This is useful for understanding the raw categories or range of values
+     within a dataset before cleaning.
+
+     Args:
+         csv_path (Union[str, Path]):
+             The file path to the input CSV file.
+         output_dir (Union[str, Path]):
+             The path to the directory where the .txt files will be saved.
+             The directory will be created if it does not exist.
+     """
+     # --- 1. Input Validation ---
+     csv_path = make_fullpath(input_path=csv_path, enforce="file")
+     output_dir = make_fullpath(input_path=output_dir, make=True)
+
+     # --- 2. Load Data ---
+     try:
+         # Load all columns as strings to preserve original formatting
+         df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
+     except FileNotFoundError as e:
+         _LOGGER.error(f"The file was not found at '{csv_path}'.")
+         raise e
+     except Exception as e2:
+         _LOGGER.error(f"An error occurred while reading the CSV file.")
+         raise e2
+     else:
+         _LOGGER.info(f"Data loaded from '{csv_path}'")
+
+     # --- 3. Process Each Column ---
+     for i, column_name in enumerate(df.columns):
+         # _LOGGER.info(f"Processing column: '{column_name}'...")
+
+         # --- Get unique values AS IS ---
+         try:
+             # Drop nulls, get unique values, and sort them.
+             # The values are preserved exactly as they are in the cells.
+             unique_values = df[column_name].dropna().unique()
+             sorted_uniques = sorted(unique_values)
+         except Exception:
+             _LOGGER.exception(f"Could not process column '{column_name}'.")
+             continue
+
+         if not sorted_uniques:
+             _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+             continue
+
+         # --- Sanitize column name to create a valid filename ---
+         sanitized_name = sanitize_filename(column_name)
+         if not sanitized_name.strip('_'):
+             sanitized_name = f'column_{i}'
+         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+         # --- Write to file ---
+         try:
+             with open(file_path, 'w', encoding='utf-8') as f:
+                 f.write(f"# Unique values for column: '{column_name}'\n")
+                 f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                 f.write("-" * 30 + "\n")
+                 for value in sorted_uniques:
+                     f.write(f"{value}\n")
+                 f.write("-" * 30 + "\n")
+         except IOError:
+             _LOGGER.exception(f"Error writing to file {file_path}.")
+         else:
+             _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
+
+     _LOGGER.info("Process complete.")
+
+
+ ########## Basic df cleaner #############
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
+     """
+     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+     The cleaning process includes:
+     - Normalizing full-width and typographical punctuation to standard equivalents.
+     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+     - Stripping any leading or trailing whitespace.
+     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+     - Converting strings that become empty after cleaning into true null values.
+     - Normalizing all text to lowercase.
+
+     Args:
+         input_filepath (Union[str, Path]):
+             The path to the source CSV file to be cleaned.
+         output_filepath (Union[str, Path, None], optional):
+             The path to save the cleaned CSV file. If None (default),
+             the original input file will be overwritten.
+     """
+     # Handle paths
+     input_path = make_fullpath(input_filepath, enforce="file")
+
+     # Unless explicitly defined, overwrite file.
+     if output_filepath is not None:
+         parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
+         output_path = parent_dir / Path(output_filepath).name
+     else:
+         output_path = input_path
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # Cleaning rules
+     cleaning_rules = {
+         # 1. Comprehensive Punctuation & Symbol Normalization
+         # Remove invisible control characters
+         r'\p{C}+': '',
+
+         # Full-width to half-width
+         '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
+         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
+         '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
+
+         # Others
+         '©': '',
+         '®': '',
+         '™': '',
+
+         # Collapse repeating punctuation (explicit method)
+         r'\.{2,}': '.', # Replace two or more dots with a single dot
+         r'\?{2,}': '?', # Replace two or more question marks with a single question mark
+         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
+
+         # Typographical standardization
+         # Unify various dashes and hyphens to a standard hyphen-minus
+         r'[—–―]': '-',
+         # Unify various quote types to standard single quotes
+         r'[“”]': "'",
+         r'[‘’]': "'",
+
+         # 2. Internal Whitespace Consolidation
+         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
+         r'\s+': ' ',
+
+         # 3. Leading/Trailing Whitespace Removal
+         # Strip any whitespace from the beginning or end of the string
+         r'^\s+|\s+$': '',
+
+         # 4. Textual Null Standardization (New Step)
+         # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
+         r'^(N/A|NA|NULL|NONE|NIL|)$': None,
+
+         # 5. Final Nullification of Empty Strings
+         # After all cleaning, if a string is now empty, convert it to a null
+         r'^$': None
+     }
+
+     # Clean data
+     try:
+         # Create a cleaner for every column in the dataframe
+         all_columns = df.columns
+         column_cleaners = [
+             ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
+         ]
+
+         # Instantiate and run the main dataframe cleaner
+         df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
+         df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
+
+         # apply lowercase to all string columns
+         df_final = df_cleaned.with_columns(
+             pl.col(pl.String).str.to_lowercase()
+         )
+     except Exception as e:
+         _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+         raise e
+
+     # Save cleaned dataframe
+     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+     _LOGGER.info(f"Successfully cleaned and saved data to '{output_path.name}'.")
+
+
+ ########## EXTRACT and CLEAN ##########
+ class ColumnCleaner:
+     """
+     A configuration object that defines cleaning rules for a single Polars DataFrame column.
+
+     This class holds a dictionary of regex-to-replacement rules, the target column name,
+     and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
+
+     Notes:
+         - Define rules from most specific to more general to create a fallback system.
+         - Beware of chain replacements (rules matching strings that have already been
+           changed by a previous rule in the same cleaner).
+
+     Args:
+         column_name (str):
+             The name of the column to be cleaned.
+         rules (Dict[str, str]):
+             A dictionary of regex patterns to replacement strings. Can use
+             backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
+             uses a '$' prefix for backreferences.
+         case_insensitive (bool):
+             If True (default), regex matching ignores case.
+
+     ## Usage Example
+
+     ```python
+     id_rules = {
+         # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
+         r'ID[- ](\\d+)': r'ID:$1'
+     }
+
+     id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
+     # This object would then be passed to a DataFrameCleaner.
+     ```
+     """
+     def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
+         if not isinstance(column_name, str) or not column_name:
+             _LOGGER.error("The 'column_name' must be a non-empty string.")
+             raise TypeError()
+         if not isinstance(rules, dict):
+             _LOGGER.error("The 'rules' argument must be a dictionary.")
+             raise TypeError()
+
+         # Validate each regex pattern for correctness
+         for pattern in rules.keys():
+             try:
+                 re.compile(pattern)
+             except re.error:
+                 _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
+                 raise
+
+         self.column_name = column_name
+         self.rules = rules
+         self.case_insensitive = case_insensitive
+
+
+ class DataFrameCleaner:
+     """
+     Orchestrates cleaning multiple columns in a Polars DataFrame.
+
+     This class takes a list of ColumnCleaner objects and applies their defined
+     rules to the corresponding columns of a DataFrame using high-performance
+     Polars expressions.
+
+     Args:
+         cleaners (List[ColumnCleaner]):
+             A list of ColumnCleaner configuration objects.
+
+     Raises:
+         TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
+         ValueError: If multiple ColumnCleaner objects target the same column.
+     """
+     def __init__(self, cleaners: List[ColumnCleaner]):
+         if not isinstance(cleaners, list):
+             _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+             raise TypeError()
+
+         seen_columns = set()
+         for cleaner in cleaners:
+             if not isinstance(cleaner, ColumnCleaner):
+                 _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
+                 raise TypeError()
+             if cleaner.column_name in seen_columns:
+                 _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
+                 raise ValueError()
+             seen_columns.add(cleaner.column_name)
+
+         self.cleaners = cleaners
+
+     def clean(self, df: pl.DataFrame, clone_df: bool=True) -> pl.DataFrame:
+         """
+         Applies all defined cleaning rules to the Polars DataFrame.
+
+         Args:
+             df (pl.DataFrame): The Polars DataFrame to clean.
+             clone_df (bool): Whether to work on a clone to prevent undesired changes.
+
+         Returns:
+             pl.DataFrame: A new, cleaned Polars DataFrame.
+
+         Raises:
+             ValueError: If any columns specified in the cleaners are not found
+                 in the input DataFrame.
+         """
+         rule_columns = {c.column_name for c in self.cleaners}
+         df_columns = set(df.columns)
+         missing_columns = rule_columns - df_columns
+
+         if missing_columns:
+             _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
+             for miss_col in sorted(list(missing_columns)):
+                 print(f"\t- {miss_col}")
+             raise ValueError()
+
+         if clone_df:
+             df_cleaned = df.clone()
+         else:
+             df_cleaned = df
+
+         # Build and apply a series of expressions for each column
+         for cleaner in self.cleaners:
+             col_name = cleaner.column_name
+
+             # Start with the column, cast to String for replacement operations
+             col_expr = pl.col(col_name).cast(pl.String)
+
+             # Sequentially chain 'replace_all' expressions for each rule
+             for pattern, replacement in cleaner.rules.items():
+                 final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
+
+                 if replacement is None:
+                     # If replacement is None, use a when/then expression to set matching values to null
+                     col_expr = pl.when(col_expr.str.contains(final_pattern)) \
+                         .then(None) \
+                         .otherwise(col_expr)
+                 else:
+                     col_expr = col_expr.str.replace_all(final_pattern, replacement)
+
+             # Execute the expression chain for the column
+             df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
+
+         _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")
+
+         return df_cleaned
+
+     def load_clean_save(self, input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+         """
+         This convenience method encapsulates the entire cleaning process into a
+         single call. It loads a DataFrame from a specified file, applies all
+         cleaning rules configured in the `DataFrameCleaner` instance, and saves
+         the resulting cleaned DataFrame to a new file.
+
+         The method ensures that all data is loaded as string types to prevent
+         unintended type inference issues before cleaning operations are applied.
+
+         Args:
+             input_filepath (Union[str, Path]):
+                 The path to the input data file.
+             output_filepath (Union[str, Path]):
+                 The full path, where the cleaned data file will be saved.
+         """
+         df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)
+
+         df_clean = self.clean(df=df, clone_df=False)
+
+         if isinstance(output_filepath, str):
+             output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
+
+         save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
+
+         return None
+
+
+ def info():
+     _script_info(__all__)
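The hunk above is the new ml_tools/ETL_cleaning.py module listed in RECORD. A minimal usage sketch of its public helpers, assuming the `ml_tools.ETL_cleaning` import path shown in the diff; the file paths and regex rules below are illustrative, not taken from the package:

```python
from pathlib import Path
from ml_tools.ETL_cleaning import save_unique_values, basic_clean, ColumnCleaner, DataFrameCleaner

# Dump the raw unique values of each column to inspect them before writing rules
save_unique_values(csv_path="data/raw.csv", output_dir="reports/unique_values")

# One-shot standardized cleaning; an explicit output path avoids overwriting the input
basic_clean(input_filepath="data/raw.csv", output_filepath="data/raw_clean.csv")

# Targeted per-column rules on top of the basic pass (illustrative column and patterns)
status_cleaner = ColumnCleaner(
    column_name="status",
    rules={r"^(yes|y|true)$": "1", r"^(no|n|false)$": "0"},
    case_insensitive=True,
)
cleaner = DataFrameCleaner(cleaners=[status_cleaner])
cleaner.load_clean_save(
    input_filepath="data/raw_clean.csv",
    output_filepath=Path("data/raw_final.csv"),
)
```

Per the docstring, `basic_clean` overwrites its input when `output_filepath` is None, which is why an explicit output path is passed here.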
@@ -1,18 +1,11 @@
  import polars as pl
- import pandas as pd
  import re
- from pathlib import Path
  from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
- from .path_manager import sanitize_filename, make_fullpath
- from .utilities import save_dataframe, load_dataframe
  from ._script_info import _script_info
  from ._logger import _LOGGER


  __all__ = [
-     "save_unique_values",
-     "ColumnCleaner",
-     "DataFrameCleaner",
      "TransformationRecipe",
      "DataProcessor",
      "BinaryTransformer",
@@ -28,253 +21,6 @@ __all__ = [
      "DateFeatureExtractor"
  ]

- ################ Unique Values per column #################
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
-     """
-     Loads a CSV file, then analyzes it and saves the unique non-null values
-     from each column into a separate text file exactly as they appear.
-
-     This is useful for understanding the raw categories or range of values
-     within a dataset before cleaning.
-
-     Args:
-         csv_path (Union[str, Path]):
-             The file path to the input CSV file.
-         output_dir (Union[str, Path]):
-             The path to the directory where the .txt files will be saved.
-             The directory will be created if it does not exist.
-     """
-     # --- 1. Input Validation ---
-     csv_path = make_fullpath(input_path=csv_path, enforce="file")
-     output_dir = make_fullpath(input_path=output_dir, make=True)
-
-     # --- 2. Load Data ---
-     try:
-         # Load all columns as strings to preserve original formatting
-         df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
-     except FileNotFoundError as e:
-         _LOGGER.error(f"The file was not found at '{csv_path}'.")
-         raise e
-     except Exception as e2:
-         _LOGGER.error(f"An error occurred while reading the CSV file.")
-         raise e2
-     else:
-         _LOGGER.info(f"Data loaded from '{csv_path}'")
-
-     # --- 3. Process Each Column ---
-     for i, column_name in enumerate(df.columns):
-         # _LOGGER.info(f"Processing column: '{column_name}'...")
-
-         # --- Get unique values AS IS ---
-         try:
-             # Drop nulls, get unique values, and sort them.
-             # The values are preserved exactly as they are in the cells.
-             unique_values = df[column_name].dropna().unique()
-             sorted_uniques = sorted(unique_values)
-         except Exception:
-             _LOGGER.exception(f"Could not process column '{column_name}'.")
-             continue
-
-         if not sorted_uniques:
-             _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
-             continue
-
-         # --- Sanitize column name to create a valid filename ---
-         sanitized_name = sanitize_filename(column_name)
-         if not sanitized_name.strip('_'):
-             sanitized_name = f'column_{i}'
-         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
-
-         # --- Write to file ---
-         try:
-             with open(file_path, 'w', encoding='utf-8') as f:
-                 f.write(f"# Unique values for column: '{column_name}'\n")
-                 f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
-                 f.write("-" * 30 + "\n")
-                 for value in sorted_uniques:
-                     f.write(f"{value}\n")
-                 f.write("-" * 30 + "\n")
-         except IOError:
-             _LOGGER.exception(f"Error writing to file {file_path}.")
-         else:
-             _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
-
-     _LOGGER.info("Process complete.")
-
-
- ########## EXTRACT and CLEAN ##########
- class ColumnCleaner:
-     """
-     A configuration object that defines cleaning rules for a single Polars DataFrame column.
-
-     This class holds a dictionary of regex-to-replacement rules, the target column name,
-     and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
-
-     Notes:
-         - Define rules from most specific to more general to create a fallback system.
-         - Beware of chain replacements (rules matching strings that have already been
-           changed by a previous rule in the same cleaner).
-
-     Args:
-         column_name (str):
-             The name of the column to be cleaned.
-         rules (Dict[str, str]):
-             A dictionary of regex patterns to replacement strings. Can use
-             backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
-             uses a '$' prefix for backreferences.
-         case_insensitive (bool):
-             If True (default), regex matching ignores case.
-
-     ## Usage Example
-
-     ```python
-     id_rules = {
-         # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
-         r'ID[- ](\\d+)': r'ID:$1'
-     }
-
-     id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
-     # This object would then be passed to a DataFrameCleaner.
-     ```
-     """
-     def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
-         if not isinstance(column_name, str) or not column_name:
-             _LOGGER.error("The 'column_name' must be a non-empty string.")
-             raise TypeError()
-         if not isinstance(rules, dict):
-             _LOGGER.error("The 'rules' argument must be a dictionary.")
-             raise TypeError()
-
-         # Validate each regex pattern for correctness
-         for pattern in rules.keys():
-             try:
-                 re.compile(pattern)
-             except re.error:
-                 _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
-                 raise
-
-         self.column_name = column_name
-         self.rules = rules
-         self.case_insensitive = case_insensitive
-
-
- class DataFrameCleaner:
-     """
-     Orchestrates cleaning multiple columns in a Polars DataFrame.
-
-     This class takes a list of ColumnCleaner objects and applies their defined
-     rules to the corresponding columns of a DataFrame using high-performance
-     Polars expressions.
-
-     Args:
-         cleaners (List[ColumnCleaner]):
-             A list of ColumnCleaner configuration objects.
-
-     Raises:
-         TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
-         ValueError: If multiple ColumnCleaner objects target the same column.
-     """
-     def __init__(self, cleaners: List[ColumnCleaner]):
-         if not isinstance(cleaners, list):
-             _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
-             raise TypeError()
-
-         seen_columns = set()
-         for cleaner in cleaners:
-             if not isinstance(cleaner, ColumnCleaner):
-                 _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
-                 raise TypeError()
-             if cleaner.column_name in seen_columns:
-                 _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
-                 raise ValueError()
-             seen_columns.add(cleaner.column_name)
-
-         self.cleaners = cleaners
-
-     def clean(self, df: pl.DataFrame, clone_df: bool=True) -> pl.DataFrame:
-         """
-         Applies all defined cleaning rules to the Polars DataFrame.
-
-         Args:
-             df (pl.DataFrame): The Polars DataFrame to clean.
-             clone_df (bool): Whether to work on a clone to prevent undesired changes.
-
-         Returns:
-             pl.DataFrame: A new, cleaned Polars DataFrame.
-
-         Raises:
-             ValueError: If any columns specified in the cleaners are not found
-                 in the input DataFrame.
-         """
-         rule_columns = {c.column_name for c in self.cleaners}
-         df_columns = set(df.columns)
-         missing_columns = rule_columns - df_columns
-
-         if missing_columns:
-             _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
-             for miss_col in sorted(list(missing_columns)):
-                 print(f"\t- {miss_col}")
-             raise ValueError()
-
-         if clone_df:
-             df_cleaned = df.clone()
-         else:
-             df_cleaned = df
-
-         # Build and apply a series of expressions for each column
-         for cleaner in self.cleaners:
-             col_name = cleaner.column_name
-
-             # Start with the column, cast to String for replacement operations
-             col_expr = pl.col(col_name).cast(pl.String)
-
-             # Sequentially chain 'replace_all' expressions for each rule
-             for pattern, replacement in cleaner.rules.items():
-                 final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
-
-                 if replacement is None:
-                     # If replacement is None, use a when/then expression to set matching values to null
-                     col_expr = pl.when(col_expr.str.contains(final_pattern)) \
-                         .then(None) \
-                         .otherwise(col_expr)
-                 else:
-                     col_expr = col_expr.str.replace_all(final_pattern, replacement)
-
-             # Execute the expression chain for the column
-             df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
-
-         _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")
-
-         return df_cleaned
-
-     def load_clean_save(self, input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
-         """
-         This convenience method encapsulates the entire cleaning process into a
-         single call. It loads a DataFrame from a specified file, applies all
-         cleaning rules configured in the `DataFrameCleaner` instance, and saves
-         the resulting cleaned DataFrame to a new file.
-
-         The method ensures that all data is loaded as string types to prevent
-         unintended type inference issues before cleaning operations are applied.
-
-         Args:
-             input_filepath (Union[str, Path]):
-                 The path to the input data file.
-             output_filepath (Union[str, Path]):
-                 The full path, where the cleaned data file will be saved.
-         """
-         df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)
-
-         df_clean = self.clean(df=df, clone_df=False)
-
-         if isinstance(output_filepath, str):
-             output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
-
-         save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
-
-         return None
-
-
  ############ TRANSFORM MAIN ####################

  # Magic word for rename-only transformation
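The removal above mirrors the new file: `save_unique_values`, `ColumnCleaner`, and `DataFrameCleaner` move out of ml_tools/ETL_engineering.py into ml_tools/ETL_cleaning.py rather than being dropped. Downstream code should only need its import path updated; a sketch of the change, with module paths taken from this diff:

```python
# dragon-ml-toolbox 9.2.0
# from ml_tools.ETL_engineering import save_unique_values, ColumnCleaner, DataFrameCleaner

# dragon-ml-toolbox 10.0.0
from ml_tools.ETL_cleaning import save_unique_values, ColumnCleaner, DataFrameCleaner
```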
@@ -631,7 +377,7 @@ class MultiBinaryDummifier:
              )
              output_expressions.append(expr)

-         return pl.select(output_expressions)
+         return pl.select(output_expressions) # type: ignore


  class KeywordDummifier:
ml_tools/utilities.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
  import pandas as pd
  import polars as pl
  from pathlib import Path
- from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+ from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, overload
  import joblib
  from joblib.externals.loky.process_executor import TerminatedWorkerError
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
@@ -28,12 +28,32 @@ __all__ = [
  ]


+ # Overload 1: When kind='pandas'
+ @overload
+ def load_dataframe(
+     df_path: Union[str, Path],
+     kind: Literal["pandas"] = "pandas",
+     all_strings: bool = False,
+     verbose: bool = True
+ ) -> Tuple[pd.DataFrame, str]:
+     ... # for overload stubs
+
+ # Overload 2: When kind='polars'
+ @overload
+ def load_dataframe(
+     df_path: Union[str, Path],
+     kind: Literal["polars"],
+     all_strings: bool = False,
+     verbose: bool = True
+ ) -> Tuple[pl.DataFrame, str]:
+     ... # for overload stubs
+
  def load_dataframe(
      df_path: Union[str, Path],
      kind: Literal["pandas", "polars"] = "pandas",
      all_strings: bool = False,
      verbose: bool = True
- ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
+ ) -> Union[Tuple[pd.DataFrame, str], Tuple[pl.DataFrame, str]]:
      """
      Load a CSV file into a DataFrame and extract its base name.

@@ -41,13 +61,13 @@ def load_dataframe(
      columns as string types to prevent type inference errors.

      Args:
-         df_path (Union[str, Path]):
+         df_path (str, Path):
              The path to the CSV file.
-         kind (Literal["pandas", "polars"], optional):
+         kind ("pandas", "polars"):
              The type of DataFrame to load. Defaults to "pandas".
-         all_strings (bool, optional):
+         all_strings (bool):
              If True, loads all columns as string data types. This is useful for
-             ETL tasks and to avoid type-inference errors. Defaults to False.
+             ETL tasks and to avoid type-inference errors.

      Returns:
          (Tuple[DataFrameType, str]):
@@ -87,7 +107,7 @@ def load_dataframe(
      if verbose:
          _LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

-     return df, df_name
+     return df, df_name # type: ignore


  def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
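The `typing.overload` stubs added to utilities.py let a static type checker infer the concrete DataFrame type from the `kind` literal instead of the previous union return type. A small sketch of the intended effect; the paths are illustrative and the exact inference depends on the checker:

```python
from ml_tools.utilities import load_dataframe

# A checker that understands the overloads infers Tuple[pl.DataFrame, str] here...
pl_df, name = load_dataframe("data/input.csv", kind="polars", all_strings=True)

# ...and Tuple[pd.DataFrame, str] here (kind defaults to "pandas"), rather than
# Tuple[Union[pd.DataFrame, pl.DataFrame], str] for both calls as in 9.2.0.
pd_df, name = load_dataframe("data/input.csv")
```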