dragon-ml-toolbox 9.2.0__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Note: this version of dragon-ml-toolbox has been flagged as a potentially problematic release.
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/RECORD +9 -8
- ml_tools/ETL_cleaning.py +372 -0
- ml_tools/ETL_engineering.py +1 -255
- ml_tools/utilities.py +27 -7
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-9.2.0.dist-info → dragon_ml_toolbox-10.0.0.dist-info}/RECORD
CHANGED

@@ -1,6 +1,7 @@
-dragon_ml_toolbox-…
-dragon_ml_toolbox-…
-ml_tools/…
+dragon_ml_toolbox-10.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-10.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_cleaning.py,sha256=NJj1Iw-94D9MQvSkX1ce7wPbNM5b_1-NUMffZfod7VI,14957
+ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
 ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
 ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
@@ -28,8 +29,8 @@ ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,1400
 ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
 ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
-ml_tools/utilities.py,sha256=…
-dragon_ml_toolbox-…
-dragon_ml_toolbox-…
-dragon_ml_toolbox-…
-dragon_ml_toolbox-…
+ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
+dragon_ml_toolbox-10.0.0.dist-info/METADATA,sha256=QvDD6uzokGUUKjj8s5wziNLu6QLGldCVSsTm1qc8-7w,6942
+dragon_ml_toolbox-10.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.0.0.dist-info/RECORD,,
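Each RECORD entry is `path,hash,size`, with the hash recorded as `sha256=` followed by the urlsafe-base64 digest with `=` padding stripped, per the wheel RECORD convention. A minimal sketch of checking one of the entries above against a locally unpacked wheel; the `unpacked/` directory is a hypothetical extraction location:

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: Path) -> str:
    """Return the sha256=<urlsafe-b64, unpadded> digest used in wheel RECORD files."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Entry taken verbatim from the RECORD diff above.
entry = "ml_tools/ETL_cleaning.py,sha256=NJj1Iw-94D9MQvSkX1ce7wPbNM5b_1-NUMffZfod7VI,14957"
rel_path, expected_hash, size = entry.split(",")

# Hypothetical: the wheel has already been unzipped into ./unpacked
actual_hash = record_hash(Path("unpacked") / rel_path)
print(actual_hash == expected_hash)
```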
ml_tools/ETL_cleaning.py
ADDED
@@ -0,0 +1,372 @@
+import polars as pl
+import pandas as pd
+import re
+from pathlib import Path
+from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from .path_manager import sanitize_filename, make_fullpath
+from .utilities import save_dataframe, load_dataframe
+from ._script_info import _script_info
+from ._logger import _LOGGER
+
+
+__all__ = [
+    "save_unique_values",
+    "basic_clean",
+    "ColumnCleaner",
+    "DataFrameCleaner"
+]
+
+
+################ Unique Values per column #################
+def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+    """
+    Loads a CSV file, then analyzes it and saves the unique non-null values
+    from each column into a separate text file exactly as they appear.
+
+    This is useful for understanding the raw categories or range of values
+    within a dataset before cleaning.
+
+    Args:
+        csv_path (Union[str, Path]):
+            The file path to the input CSV file.
+        output_dir (Union[str, Path]):
+            The path to the directory where the .txt files will be saved.
+            The directory will be created if it does not exist.
+    """
+    # --- 1. Input Validation ---
+    csv_path = make_fullpath(input_path=csv_path, enforce="file")
+    output_dir = make_fullpath(input_path=output_dir, make=True)
+
+    # --- 2. Load Data ---
+    try:
+        # Load all columns as strings to preserve original formatting
+        df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
+    except FileNotFoundError as e:
+        _LOGGER.error(f"The file was not found at '{csv_path}'.")
+        raise e
+    except Exception as e2:
+        _LOGGER.error(f"An error occurred while reading the CSV file.")
+        raise e2
+    else:
+        _LOGGER.info(f"Data loaded from '{csv_path}'")
+
+    # --- 3. Process Each Column ---
+    for i, column_name in enumerate(df.columns):
+        # _LOGGER.info(f"Processing column: '{column_name}'...")
+
+        # --- Get unique values AS IS ---
+        try:
+            # Drop nulls, get unique values, and sort them.
+            # The values are preserved exactly as they are in the cells.
+            unique_values = df[column_name].dropna().unique()
+            sorted_uniques = sorted(unique_values)
+        except Exception:
+            _LOGGER.exception(f"Could not process column '{column_name}'.")
+            continue
+
+        if not sorted_uniques:
+            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+            continue
+
+        # --- Sanitize column name to create a valid filename ---
+        sanitized_name = sanitize_filename(column_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+        file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+        # --- Write to file ---
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Unique values for column: '{column_name}'\n")
+                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                f.write("-" * 30 + "\n")
+                for value in sorted_uniques:
+                    f.write(f"{value}\n")
+                f.write("-" * 30 + "\n")
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
+
+    _LOGGER.info("Process complete.")
+
+
+########## Basic df cleaner #############
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
+    """
+    Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+    The cleaning process includes:
+    - Normalizing full-width and typographical punctuation to standard equivalents.
+    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+    - Stripping any leading or trailing whitespace.
+    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+    - Converting strings that become empty after cleaning into true null values.
+    - Normalizing all text to lowercase.
+
+    Args:
+        input_filepath (Union[str, Path]):
+            The path to the source CSV file to be cleaned.
+        output_filepath (Union[str, Path, None], optional):
+            The path to save the cleaned CSV file. If None (default),
+            the original input file will be overwritten.
+    """
+    # Handle paths
+    input_path = make_fullpath(input_filepath, enforce="file")
+
+    # Unless explicitly defined, overwrite file.
+    if output_filepath is not None:
+        parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
+        output_path = parent_dir / Path(output_filepath).name
+    else:
+        output_path = input_path
+
+    # load polars df
+    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+    # Cleaning rules
+    cleaning_rules = {
+        # 1. Comprehensive Punctuation & Symbol Normalization
+        # Remove invisible control characters
+        r'\p{C}+': '',
+
+        # Full-width to half-width
+        '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
+        '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
+        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
+
+        # Others
+        '©': '',
+        '®': '',
+        '™': '',
+
+        # Collapse repeating punctuation (explicit method)
+        r'\.{2,}': '.',  # Replace two or more dots with a single dot
+        r'\?{2,}': '?',  # Replace two or more question marks with a single question mark
+        r'!{2,}': '!',  # Replace two or more exclamation marks with a single one
+
+        # Typographical standardization
+        # Unify various dashes and hyphens to a standard hyphen-minus
+        r'[—–―]': '-',
+        # Unify various quote types to standard single quotes
+        r'[“”]': "'",
+        r'[‘’]': "'",
+
+        # 2. Internal Whitespace Consolidation
+        # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
+        r'\s+': ' ',
+
+        # 3. Leading/Trailing Whitespace Removal
+        # Strip any whitespace from the beginning or end of the string
+        r'^\s+|\s+$': '',
+
+        # 4. Textual Null Standardization (New Step)
+        # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
+        r'^(N/A|NA|NULL|NONE|NIL|)$': None,
+
+        # 5. Final Nullification of Empty Strings
+        # After all cleaning, if a string is now empty, convert it to a null
+        r'^$': None
+    }
+
+    # Clean data
+    try:
+        # Create a cleaner for every column in the dataframe
+        all_columns = df.columns
+        column_cleaners = [
+            ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
+        ]
+
+        # Instantiate and run the main dataframe cleaner
+        df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
+        df_cleaned = df_cleaner.clean(df, clone_df=False)  # Use clone_df=False for efficiency
+
+        # apply lowercase to all string columns
+        df_final = df_cleaned.with_columns(
+            pl.col(pl.String).str.to_lowercase()
+        )
+    except Exception as e:
+        _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+        raise e
+
+    # Save cleaned dataframe
+    save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+    _LOGGER.info(f"Successfully cleaned and saved data to '{output_path.name}'.")
+
+
+########## EXTRACT and CLEAN ##########
+class ColumnCleaner:
+    """
+    A configuration object that defines cleaning rules for a single Polars DataFrame column.
+
+    This class holds a dictionary of regex-to-replacement rules, the target column name,
+    and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
+
+    Notes:
+        - Define rules from most specific to more general to create a fallback system.
+        - Beware of chain replacements (rules matching strings that have already been
+          changed by a previous rule in the same cleaner).
+
+    Args:
+        column_name (str):
+            The name of the column to be cleaned.
+        rules (Dict[str, str]):
+            A dictionary of regex patterns to replacement strings. Can use
+            backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
+            uses a '$' prefix for backreferences.
+        case_insensitive (bool):
+            If True (default), regex matching ignores case.
+
+    ## Usage Example
+
+    ```python
+    id_rules = {
+        # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
+        r'ID[- ](\\d+)': r'ID:$1'
+    }
+
+    id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
+    # This object would then be passed to a DataFrameCleaner.
+    ```
+    """
+    def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
+        if not isinstance(column_name, str) or not column_name:
+            _LOGGER.error("The 'column_name' must be a non-empty string.")
+            raise TypeError()
+        if not isinstance(rules, dict):
+            _LOGGER.error("The 'rules' argument must be a dictionary.")
+            raise TypeError()
+
+        # Validate each regex pattern for correctness
+        for pattern in rules.keys():
+            try:
+                re.compile(pattern)
+            except re.error:
+                _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
+                raise
+
+        self.column_name = column_name
+        self.rules = rules
+        self.case_insensitive = case_insensitive
+
+
+class DataFrameCleaner:
+    """
+    Orchestrates cleaning multiple columns in a Polars DataFrame.
+
+    This class takes a list of ColumnCleaner objects and applies their defined
+    rules to the corresponding columns of a DataFrame using high-performance
+    Polars expressions.
+
+    Args:
+        cleaners (List[ColumnCleaner]):
+            A list of ColumnCleaner configuration objects.
+
+    Raises:
+        TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
+        ValueError: If multiple ColumnCleaner objects target the same column.
+    """
+    def __init__(self, cleaners: List[ColumnCleaner]):
+        if not isinstance(cleaners, list):
+            _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            raise TypeError()
+
+        seen_columns = set()
+        for cleaner in cleaners:
+            if not isinstance(cleaner, ColumnCleaner):
+                _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
+                raise TypeError()
+            if cleaner.column_name in seen_columns:
+                _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
+                raise ValueError()
+            seen_columns.add(cleaner.column_name)
+
+        self.cleaners = cleaners
+
+    def clean(self, df: pl.DataFrame, clone_df: bool=True) -> pl.DataFrame:
+        """
+        Applies all defined cleaning rules to the Polars DataFrame.
+
+        Args:
+            df (pl.DataFrame): The Polars DataFrame to clean.
+            clone_df (bool): Whether to work on a clone to prevent undesired changes.
+
+        Returns:
+            pl.DataFrame: A new, cleaned Polars DataFrame.
+
+        Raises:
+            ValueError: If any columns specified in the cleaners are not found
+                in the input DataFrame.
+        """
+        rule_columns = {c.column_name for c in self.cleaners}
+        df_columns = set(df.columns)
+        missing_columns = rule_columns - df_columns
+
+        if missing_columns:
+            _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
+            for miss_col in sorted(list(missing_columns)):
+                print(f"\t- {miss_col}")
+            raise ValueError()
+
+        if clone_df:
+            df_cleaned = df.clone()
+        else:
+            df_cleaned = df
+
+        # Build and apply a series of expressions for each column
+        for cleaner in self.cleaners:
+            col_name = cleaner.column_name
+
+            # Start with the column, cast to String for replacement operations
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # Sequentially chain 'replace_all' expressions for each rule
+            for pattern, replacement in cleaner.rules.items():
+                final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
+
+                if replacement is None:
+                    # If replacement is None, use a when/then expression to set matching values to null
+                    col_expr = pl.when(col_expr.str.contains(final_pattern)) \
+                                 .then(None) \
+                                 .otherwise(col_expr)
+                else:
+                    col_expr = col_expr.str.replace_all(final_pattern, replacement)
+
+            # Execute the expression chain for the column
+            df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
+
+        _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")
+
+        return df_cleaned
+
+    def load_clean_save(self, input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+        """
+        This convenience method encapsulates the entire cleaning process into a
+        single call. It loads a DataFrame from a specified file, applies all
+        cleaning rules configured in the `DataFrameCleaner` instance, and saves
+        the resulting cleaned DataFrame to a new file.
+
+        The method ensures that all data is loaded as string types to prevent
+        unintended type inference issues before cleaning operations are applied.
+
+        Args:
+            input_filepath (Union[str, Path]):
+                The path to the input data file.
+            output_filepath (Union[str, Path]):
+                The full path, where the cleaned data file will be saved.
+        """
+        df, _ = load_dataframe(df_path=input_filepath, kind="polars", all_strings=True)
+
+        df_clean = self.clean(df=df, clone_df=False)
+
+        if isinstance(output_filepath, str):
+            output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
+
+        save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
+
+        return None
+
+
+def info():
+    _script_info(__all__)
ml_tools/ETL_engineering.py
CHANGED
@@ -1,18 +1,11 @@
 import polars as pl
-import pandas as pd
 import re
-from pathlib import Path
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
-from .path_manager import sanitize_filename, make_fullpath
-from .utilities import save_dataframe, load_dataframe
 from ._script_info import _script_info
 from ._logger import _LOGGER
 
 
 __all__ = [
-    "save_unique_values",
-    "ColumnCleaner",
-    "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
     "BinaryTransformer",
@@ -28,253 +21,6 @@ __all__ = [
     "DateFeatureExtractor"
 ]
 
[... 247 removed lines omitted here: the save_unique_values function and the ColumnCleaner and DataFrameCleaner classes, moved verbatim into ml_tools/ETL_cleaning.py shown above ...]
 ############ TRANSFORM MAIN ####################
 
 # Magic word for rename-only transformation
@@ -631,7 +377,7 @@ class MultiBinaryDummifier:
             )
             output_expressions.append(expr)
 
-        return pl.select(output_expressions)
+        return pl.select(output_expressions) # type: ignore
 
 
 class KeywordDummifier:
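The practical consequence of this change is that the cleaning helpers no longer live next to the transformation classes, so imports that previously pointed at `ETL_engineering` need to move. A sketch of the likely migration, assuming code imported these names directly:

```python
# Before (9.2.0): cleaning helpers were exported from ETL_engineering.
# from ml_tools.ETL_engineering import save_unique_values, ColumnCleaner, DataFrameCleaner

# After (10.0.0): they are exported from the new ETL_cleaning module instead,
# while the transformation classes stay where they were.
from ml_tools.ETL_cleaning import save_unique_values, ColumnCleaner, DataFrameCleaner
from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor
```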
ml_tools/utilities.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
 import pandas as pd
 import polars as pl
 from pathlib import Path
-from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, overload
 import joblib
 from joblib.externals.loky.process_executor import TerminatedWorkerError
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
@@ -28,12 +28,32 @@ __all__ = [
 ]
 
 
+# Overload 1: When kind='pandas'
+@overload
+def load_dataframe(
+    df_path: Union[str, Path],
+    kind: Literal["pandas"] = "pandas",
+    all_strings: bool = False,
+    verbose: bool = True
+) -> Tuple[pd.DataFrame, str]:
+    ... # for overload stubs
+
+# Overload 2: When kind='polars'
+@overload
+def load_dataframe(
+    df_path: Union[str, Path],
+    kind: Literal["polars"],
+    all_strings: bool = False,
+    verbose: bool = True
+) -> Tuple[pl.DataFrame, str]:
+    ... # for overload stubs
+
 def load_dataframe(
     df_path: Union[str, Path],
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
-) -> Tuple[…
+) -> Union[Tuple[pd.DataFrame, str], Tuple[pl.DataFrame, str]]:
     """
     Load a CSV file into a DataFrame and extract its base name.
 
@@ -41,13 +61,13 @@ def load_dataframe(
     columns as string types to prevent type inference errors.
 
     Args:
-        df_path (…
+        df_path (str, Path):
             The path to the CSV file.
-        kind (…
+        kind ("pandas", "polars"):
             The type of DataFrame to load. Defaults to "pandas".
-        all_strings (bool…
+        all_strings (bool):
             If True, loads all columns as string data types. This is useful for
-            ETL tasks and to avoid type-inference errors.
+            ETL tasks and to avoid type-inference errors.
 
     Returns:
         (Tuple[DataFrameType, str]):
@@ -87,7 +107,7 @@ def load_dataframe(
     if verbose:
         _LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
 
-    return df, df_name
+    return df, df_name # type: ignore
 
 
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
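The new `@overload` stubs mainly benefit static type checkers: calls with `kind="polars"` are narrowed to `Tuple[pl.DataFrame, str]` and calls with the default `kind="pandas"` to `Tuple[pd.DataFrame, str]`, instead of the broad union the implementation now declares. A small sketch of what that enables at call sites; the CSV path is hypothetical:

```python
import pandas as pd
import polars as pl

from ml_tools.utilities import load_dataframe

# Type checkers now see df_pd as pd.DataFrame ...
df_pd, name_pd = load_dataframe("data/example.csv")  # kind defaults to "pandas"
assert isinstance(df_pd, pd.DataFrame)

# ... and df_pl as pl.DataFrame, so Polars-only attributes type-check cleanly.
df_pl, name_pl = load_dataframe("data/example.csv", kind="polars", all_strings=True)
assert isinstance(df_pl, pl.DataFrame)
print(df_pl.schema)
```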
The remaining files (WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt) are unchanged between the two versions.