dragon-ml-toolbox 8.1.0__tar.gz → 9.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-8.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-9.0.0}/PKG-INFO +5 -1
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO +5 -1
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -1
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/requires.txt +4 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ETL_engineering.py +216 -81
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/GUI_tools.py +5 -5
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/MICE_imputation.py +12 -8
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_callbacks.py +6 -3
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_datasetmaster.py +37 -20
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_evaluation.py +4 -4
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_evaluation_multi.py +26 -17
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_inference.py +30 -23
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_models.py +14 -14
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_optimization.py +4 -3
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_scaler.py +7 -7
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_trainer.py +17 -15
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/PSO_optimization.py +16 -8
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/RNN_forecast.py +1 -1
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/SQL.py +22 -13
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/VIF_factor.py +7 -6
- dragon_ml_toolbox-9.0.0/ml_tools/_logger.py +134 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/custom_logger.py +12 -8
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/data_exploration.py +20 -15
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_evaluation.py +10 -6
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_inference.py +18 -18
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_learning.py +8 -5
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/handle_excel.py +15 -11
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/optimization_tools.py +3 -4
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/path_manager.py +21 -15
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/utilities.py +35 -26
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/pyproject.toml +7 -3
- dragon_ml_toolbox-8.1.0/ml_tools/_ML_optimization_multi.py +0 -231
- dragon_ml_toolbox-8.1.0/ml_tools/_logger.py +0 -36
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/LICENSE +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/README.md +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/setup.cfg +0 -0
{dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 8.1.0
+Version: 9.0.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -17,6 +17,7 @@ Requires-Dist: pandas; extra == "base"
 Requires-Dist: numpy; extra == "base"
 Requires-Dist: polars; extra == "base"
 Requires-Dist: joblib; extra == "base"
+Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -37,6 +38,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -48,6 +50,7 @@ Requires-Dist: matplotlib; extra == "mice"
 Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
+Requires-Dist: colorlog; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -59,6 +62,7 @@ Requires-Dist: ipykernel; extra == "excel"
 Requires-Dist: notebook; extra == "excel"
 Requires-Dist: jupyterlab; extra == "excel"
 Requires-Dist: ipywidgets; extra == "excel"
+Requires-Dist: colorlog; extra == "excel"
 Provides-Extra: gui-boost
 Requires-Dist: numpy; extra == "gui-boost"
 Requires-Dist: joblib; extra == "gui-boost"
{dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/PKG-INFO

Identical changes to the top-level PKG-INFO above: the Version field is bumped to 9.0.0 and "Requires-Dist: colorlog" is added under the "base", "ml", "mice", and "excel" extras (+5 -1).
{dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/requires.txt

@@ -19,12 +19,14 @@ shap
 tqdm
 Pillow
 evotorch
+colorlog
 
 [base]
 pandas
 numpy
 polars
 joblib
+colorlog
 
 [excel]
 pandas
@@ -34,6 +36,7 @@ ipykernel
 notebook
 jupyterlab
 ipywidgets
+colorlog
 
 [gui-boost]
 numpy
@@ -57,6 +60,7 @@ matplotlib
 statsmodels
 lightgbm<=4.5.0
 shap
+colorlog
 
 [nuitka]
 nuitka
{dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ETL_engineering.py

@@ -1,18 +1,22 @@
 import polars as pl
+import pandas as pd
 import re
+from pathlib import Path
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import warnings
 
 
 __all__ = [
+    "save_unique_values",
     "ColumnCleaner",
     "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
     "BinaryTransformer",
     "MultiBinaryDummifier",
+    "AutoDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -23,6 +27,80 @@ __all__ = [
     "DateFeatureExtractor"
 ]
 
+################ Unique Values per column #################
+def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+    """
+    Loads a CSV file, then analyzes it and saves the unique non-null values
+    from each column into a separate text file exactly as they appear.
+
+    This is useful for understanding the raw categories or range of values
+    within a dataset before cleaning.
+
+    Args:
+        csv_path (Union[str, Path]):
+            The file path to the input CSV file.
+        output_dir (Union[str, Path]):
+            The path to the directory where the .txt files will be saved.
+            The directory will be created if it does not exist.
+    """
+    # --- 1. Input Validation ---
+    csv_path = make_fullpath(input_path=csv_path, enforce="file")
+    output_dir = make_fullpath(input_path=output_dir, make=True)
+
+    # --- 2. Load Data ---
+    try:
+        # Load all columns as strings to preserve original formatting
+        df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
+    except FileNotFoundError as e:
+        _LOGGER.error(f"The file was not found at '{csv_path}'.")
+        raise e
+    except Exception as e2:
+        _LOGGER.error(f"An error occurred while reading the CSV file.")
+        raise e2
+    else:
+        _LOGGER.info(f"Data loaded from '{csv_path}'")
+
+    # --- 3. Process Each Column ---
+    for i, column_name in enumerate(df.columns):
+        _LOGGER.info(f"Processing column: '{column_name}'...")
+
+        # --- Get unique values AS IS ---
+        try:
+            # Drop nulls, get unique values, and sort them.
+            # The values are preserved exactly as they are in the cells.
+            unique_values = df[column_name].dropna().unique()
+            sorted_uniques = sorted(unique_values)
+        except Exception:
+            _LOGGER.exception(f"Could not process column '{column_name}'.")
+            continue
+
+        if not sorted_uniques:
+            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+            continue
+
+        # --- Sanitize column name to create a valid filename ---
+        sanitized_name = sanitize_filename(column_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+        file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+        # --- Write to file ---
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Unique values for column: '{column_name}'\n")
+                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                f.write("-" * 30 + "\n")
+                for value in sorted_uniques:
+                    f.write(f"{value}\n")
+                f.write("-" * 30 + "\n")
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values to '{file_path}'")
+
+    _LOGGER.info("Process complete.")
+
+
 ########## EXTRACT and CLEAN ##########
 class ColumnCleaner:
     """
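For quick reference outside the diff: a minimal usage sketch of the new save_unique_values helper added in the hunk above (the CSV and output paths are placeholders):

    from ml_tools.ETL_engineering import save_unique_values

    # Writes one '<column>_unique_values.txt' file per CSV column; all cells
    # are read as strings so values are preserved exactly as they appear.
    save_unique_values(csv_path="data.csv", output_dir="unique_value_reports")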
@@ -60,16 +138,19 @@ class ColumnCleaner:
     """
     def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
         if not isinstance(column_name, str) or not column_name:
-            raise TypeError("The 'column_name' must be a non-empty string.")
+            _LOGGER.error("The 'column_name' must be a non-empty string.")
+            raise TypeError()
         if not isinstance(rules, dict):
-            raise TypeError("The 'rules' argument must be a dictionary.")
+            _LOGGER.error("The 'rules' argument must be a dictionary.")
+            raise TypeError()
 
         # Validate each regex pattern for correctness
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
-            except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{pattern}': {e}")
+            except re.error:
+                _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
+                raise
 
         self.column_name = column_name
         self.rules = rules
@@ -94,20 +175,17 @@ class DataFrameCleaner:
     """
     def __init__(self, cleaners: List[ColumnCleaner]):
         if not isinstance(cleaners, list):
-            raise TypeError("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            raise TypeError()
 
         seen_columns = set()
         for cleaner in cleaners:
             if not isinstance(cleaner, ColumnCleaner):
-                raise TypeError(
-                    "All items in 'cleaners' list must be ColumnCleaner objects, "
-                    f"but found an object of type {type(cleaner).__name__}."
-                )
+                _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
+                raise TypeError()
             if cleaner.column_name in seen_columns:
-                raise ValueError(
-                    f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. "
-                    "Each column should only have one cleaner."
-                )
+                _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
+                raise ValueError()
             seen_columns.add(cleaner.column_name)
 
         self.cleaners = cleaners
@@ -131,10 +209,10 @@ class DataFrameCleaner:
         missing_columns = rule_columns - df_columns
 
         if missing_columns:
-            raise ValueError(
-                "The following columns specified in cleaning rules were not found "
-                f"in the DataFrame: {sorted(list(missing_columns))}"
-            )
+            _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
+            for miss_col in sorted(list(missing_columns)):
+                print(f"\t- {miss_col}")
+            raise ValueError()
 
         df_cleaned = df.clone()
 
@@ -153,7 +231,7 @@ class DataFrameCleaner:
             # Execute the expression chain for the column
            df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
 
-        print(f"Cleaned {len(self.cleaners)} columns.")
+        _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")
 
         return df_cleaned
 
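A sketch of how the two cleaning classes compose after this change. Only the constructor signatures above are verbatim; the regex-pattern-to-replacement semantics of `rules` and the `clean` method name are assumptions for illustration:

    import polars as pl
    from ml_tools.ETL_engineering import ColumnCleaner, DataFrameCleaner

    # Each rule maps a regex pattern to a replacement string (assumed semantics).
    price_cleaner = ColumnCleaner(
        column_name="price",
        rules={r"\$": "", r",": ""},  # strip currency symbols and thousands separators
        case_insensitive=True,
    )

    df = pl.DataFrame({"price": ["$1,200", "$350"]})
    cleaned = DataFrameCleaner(cleaners=[price_cleaner]).clean(df)  # 'clean' is a hypothetical method name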
@@ -199,16 +277,20 @@ class TransformationRecipe:
     """
     # --- Validation ---
     if not isinstance(input_col_name, str) or not input_col_name:
-        raise TypeError("'input_col' must be a non-empty string.")
+        _LOGGER.error("'input_col' must be a non-empty string.")
+        raise TypeError()
 
     if transform == _RENAME:
         if not isinstance(output_col_names, str):
-            raise TypeError("For a RENAME operation, 'output_col' must be a string.")
+            _LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
+            raise TypeError()
     elif not isinstance(transform, Callable):
-        raise TypeError(f"'transform' must be a callable function or the string '{_RENAME}'.")
+        _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
+        raise TypeError()
 
     if isinstance(output_col_names, list) and transform == _RENAME:
-        raise ValueError("A RENAME operation cannot have a list of output columns.")
+        _LOGGER.error("A RENAME operation cannot have a list of output columns.")
+        raise ValueError()
 
     # --- Add Step ---
     step = {
@@ -243,9 +325,11 @@ class DataProcessor:
         been populated with transformation steps.
         """
         if not isinstance(recipe, TransformationRecipe):
-            raise TypeError("The recipe must be an instance of TransformationRecipe.")
+            _LOGGER.error("The recipe must be an instance of TransformationRecipe.")
+            raise TypeError()
         if len(recipe) == 0:
-            raise ValueError("The recipe cannot be empty.")
+            _LOGGER.error("The recipe cannot be empty.")
+            raise ValueError()
         self._recipe = recipe
 
     def transform(self, df: pl.DataFrame) -> pl.DataFrame:
@@ -260,7 +344,8 @@ class DataProcessor:
             transform_action = step["transform"]
 
             if input_col_name not in df.columns:
-                raise ValueError(f"Input column '{input_col_name}' not found in DataFrame.")
+                _LOGGER.error(f"Input column '{input_col_name}' not found in DataFrame.")
+                raise ValueError()
 
             input_series = df.get_column(input_col_name)
 
@@ -273,29 +358,44 @@ class DataProcessor:
 
                 if isinstance(result, pl.Series):
                     if not isinstance(output_col_spec, str):
-                        raise TypeError(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                        raise TypeError()
                     processed_columns.append(result.alias(output_col_spec))
 
                 elif isinstance(result, pl.DataFrame):
-                    if not isinstance(output_col_spec, list):
-                        raise TypeError(f"Function for '{input_col_name}' returned a DataFrame but 'output_col' is not a list.")
-                    if len(result.columns) != len(output_col_spec):
-                        raise ValueError(
-                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
-                            f"but recipe specifies {len(output_col_spec)} output names."
-                        )
+                    # 1. Handle list-based renaming
+                    if isinstance(output_col_spec, list):
+                        if len(result.columns) != len(output_col_spec):
+                            _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
+                            raise ValueError()
+
+                        renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                        processed_columns.extend(renamed_df.get_columns())
+
+                    # 2. Handle a string prefix for AutoDummifier
+                    elif isinstance(output_col_spec, str):
+                        prefix = output_col_spec
+                        # Replace the original name part with the desired prefix.
+                        new_names = {
+                            col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
+                        }
+                        renamed_df = result.rename(new_names)
+                        processed_columns.extend(renamed_df.get_columns())
 
-                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
-                    processed_columns.extend(renamed_df.get_columns())
+                    else:
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                        raise TypeError()
 
                 else:
-                    raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+                    _LOGGER.error(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+                    raise TypeError()
 
-            else: # This case is
-                raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+            else: # This case is unlikely due to builder validation.
+                _LOGGER.error(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+                raise TypeError()
 
         if not processed_columns:
-            _LOGGER.warning("The transformation resulted in an empty DataFrame.")
+            _LOGGER.error("The transformation resulted in an empty DataFrame.")
             return pl.DataFrame()
 
         return pl.DataFrame(processed_columns)
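The hunk above is the consumer side of the new one-to-many behavior: a transform that returns a DataFrame may now name its outputs either exhaustively (a list) or through a single string prefix. A sketch of the intended flow; the recipe-building method name `add` is a hypothetical stand-in, while the 'input_col'/'output_col' parameter names come from the validation messages:

    import polars as pl
    from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor, AutoDummifier

    recipe = TransformationRecipe()
    # AutoDummifier returns a DataFrame, so 'output_col' can now be a single
    # string prefix for the generated dummy columns instead of a list of names.
    recipe.add(input_col="color", output_col="color", transform=AutoDummifier())  # 'add' is hypothetical

    df = pl.DataFrame({"color": ["red", "blue", "red"]})
    df_out = DataProcessor(recipe).transform(df)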
@@ -365,18 +465,17 @@ class BinaryTransformer:
     ):
         # --- Validation: Enforce one and only one option ---
         if true_keywords is not None and false_keywords is not None:
-            raise ValueError(
-                "Provide either 'true_keywords' or 'false_keywords', but not both."
-            )
+            _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
+            raise ValueError()
         if true_keywords is None and false_keywords is None:
-            raise ValueError(
-                "You must provide either 'true_keywords' or 'false_keywords'."
-            )
+            _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+            raise ValueError()
 
         # --- Configuration ---
         self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
         if not self.keywords:
-            raise ValueError("Keyword list cannot be empty.")
+            _LOGGER.error("Keyword list cannot be empty.")
+            raise ValueError()
 
         self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
 
@@ -413,6 +512,27 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)
 
 
+class AutoDummifier:
+    """
+    A transformer that performs one-hot encoding on a categorical column,
+    automatically detecting the unique categories from the data.
+    """
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+                Column names are auto-generated by Polars as
+                '{original_col_name}_{category_value}'.
+        """
+        # Ensure the column is treated as a string before creating dummies
+        return column.cast(pl.Utf8).to_dummies()
+
+
 class MultiBinaryDummifier:
     """
     A one-to-many transformer that creates multiple binary columns from a single
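Called directly, the new AutoDummifier is a thin wrapper over Polars' Series.to_dummies, so its behavior can be checked in isolation:

    import polars as pl
    from ml_tools.ETL_engineering import AutoDummifier

    s = pl.Series("color", ["red", "blue", "red"])
    dummies = AutoDummifier()(s)
    print(dummies.columns)  # e.g. ['color_red', 'color_blue'], one UInt8 column per category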
@@ -431,9 +551,11 @@ class MultiBinaryDummifier:
     """
     def __init__(self, keywords: List[str], case_insensitive: bool = True):
         if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
-            raise TypeError("The 'keywords' argument must be a list of strings.")
+            _LOGGER.error("The 'keywords' argument must be a list of strings.")
+            raise TypeError()
         if not keywords:
-            raise ValueError("The 'keywords' list cannot be empty.")
+            _LOGGER.error("The 'keywords' list cannot be empty.")
+            raise ValueError()
 
         self.keywords = keywords
         self.case_insensitive = case_insensitive
@@ -493,7 +615,8 @@ class KeywordDummifier:
     """
     def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
         if len(group_names) != len(group_keywords):
-            raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+            _LOGGER.error("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+            raise ValueError()
 
         self.group_names = group_names
         self.group_keywords = group_keywords
@@ -573,23 +696,28 @@ class NumberExtractor:
     ):
         # --- Validation ---
         if not isinstance(regex_pattern, str):
-            raise TypeError("regex_pattern must be a string.")
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()
 
         # Validate that the regex has exactly one capturing group
         try:
             if re.compile(regex_pattern).groups != 1:
-                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
         except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}")
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
         if dtype not in ["float", "int"]:
-            raise ValueError("dtype must be either 'float' or 'int'.")
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()
 
         if round_digits is not None:
             if not isinstance(round_digits, int):
-                raise TypeError("round_digits must be an integer.")
+                _LOGGER.error("round_digits must be an integer.")
+                raise TypeError()
             if dtype == "int":
-                _LOGGER.warning(f"'round_digits' is ignored when dtype is 'int'.")
+                _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
 
         self.regex_pattern = regex_pattern
         self.dtype = dtype
@@ -647,21 +775,26 @@ class MultiNumberExtractor:
     ):
         # --- Validation ---
         if not isinstance(num_outputs, int) or num_outputs <= 0:
-            raise ValueError("num_outputs must be a positive integer.")
+            _LOGGER.error("num_outputs must be a positive integer.")
+            raise ValueError()
 
         if not isinstance(regex_pattern, str):
-            raise TypeError("regex_pattern must be a string.")
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()
 
         # Validate that the regex has exactly one capturing group
         try:
             if re.compile(regex_pattern).groups != 1:
-                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
         except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}")
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
         # Validate dtype
         if dtype not in ["float", "int"]:
-            raise ValueError("dtype must be either 'float' or 'int'.")
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()
 
         self.num_outputs = num_outputs
         self.regex_pattern = regex_pattern
@@ -714,17 +847,14 @@ class RatioCalculator:
         try:
             compiled_pattern = re.compile(regex_pattern)
             if compiled_pattern.groups != 2:
-                raise ValueError(
-                    "RatioCalculator regex_pattern must contain exactly two "
-                    "capturing groups '(...)'."
-                )
+                _LOGGER.error("RatioCalculator regex_pattern must contain exactly two capturing groups '(...)'.")
+                raise ValueError()
             if compiled_pattern.groupindex:
-                raise ValueError(
-                    "RatioCalculator must be initialized with unnamed capturing groups "
-                    "(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
-                )
+                _LOGGER.error("RatioCalculator must be initialized with unnamed capturing groups (e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)').")
+                raise ValueError()
         except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}")
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
         self.regex_pattern = regex_pattern
 
@@ -768,7 +898,8 @@ class CategoryMapper:
         unseen_value: Optional[Union[int, float]] = None,
     ):
         if not isinstance(mapping, dict):
-            raise TypeError("The 'mapping' argument must be a dictionary.")
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()
 
         self.mapping = mapping
         self.default_value = unseen_value
@@ -829,7 +960,8 @@ class RegexMapper:
     ):
         # --- Validation ---
         if not isinstance(mapping, dict):
-            raise TypeError("The 'mapping' argument must be a dictionary.")
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()
 
         self.unseen_value = unseen_value
 
@@ -843,9 +975,11 @@ class RegexMapper:
             try:
                 re.compile(final_pattern)
             except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}")
+                _LOGGER.error(f"Invalid regex pattern '{final_pattern}': {e}")
+                raise ValueError()
             if not isinstance(value, (int, float)):
-                raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+                _LOGGER.error(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+                raise TypeError()
 
             self.processed_mapping.append((final_pattern, value))
 
@@ -900,11 +1034,13 @@ class ValueBinner:
     ):
         # --- Validation ---
         if not isinstance(breaks, list) or len(breaks) < 2:
-            raise ValueError("The 'breaks' argument must be a list of at least two numbers.")
+            _LOGGER.error("The 'breaks' argument must be a list of at least two numbers.")
+            raise ValueError()
 
         # Check if the list is sorted
         if not all(breaks[i] <= breaks[i+1] for i in range(len(breaks)-1)):
-            raise ValueError("The 'breaks' list must be sorted in ascending order.")
+            _LOGGER.error("The 'breaks' list must be sorted in ascending order.")
+            raise ValueError()
 
         self.breaks = breaks
         self.left_closed = left_closed
@@ -964,14 +1100,13 @@ class DateFeatureExtractor:
     ):
         # --- Validation ---
         if not isinstance(features, list) or not features:
-            raise ValueError("'features' must be a non-empty list of strings.")
+            _LOGGER.error("'features' must be a non-empty list of strings.")
+            raise ValueError()
 
         for feature in features:
             if feature not in self.ALLOWED_FEATURES:
-                raise ValueError(
-                    f"Feature '{feature}' is not supported. "
-                    f"Allowed features are: {self.ALLOWED_FEATURES}"
-                )
+                _LOGGER.error(f"Feature '{feature}' is not supported. Allowed features are: {self.ALLOWED_FEATURES}")
+                raise ValueError()
 
         self.features = features
         self.format = format