dragon-ml-toolbox 8.1.0.tar.gz → 9.0.0.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of dragon-ml-toolbox has been flagged as potentially problematic.

Files changed (42):
  1. {dragon_ml_toolbox-8.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-9.0.0}/PKG-INFO +5 -1
  2. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO +5 -1
  3. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -1
  4. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/requires.txt +4 -0
  5. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ETL_engineering.py +216 -81
  6. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/GUI_tools.py +5 -5
  7. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/MICE_imputation.py +12 -8
  8. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_callbacks.py +6 -3
  9. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_datasetmaster.py +37 -20
  10. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_evaluation.py +4 -4
  11. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_evaluation_multi.py +26 -17
  12. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_inference.py +30 -23
  13. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_models.py +14 -14
  14. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_optimization.py +4 -3
  15. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_scaler.py +7 -7
  16. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ML_trainer.py +17 -15
  17. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/PSO_optimization.py +16 -8
  18. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/RNN_forecast.py +1 -1
  19. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/SQL.py +22 -13
  20. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/VIF_factor.py +7 -6
  21. dragon_ml_toolbox-9.0.0/ml_tools/_logger.py +134 -0
  22. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/custom_logger.py +12 -8
  23. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/data_exploration.py +20 -15
  24. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_evaluation.py +10 -6
  25. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_inference.py +18 -18
  26. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/ensemble_learning.py +8 -5
  27. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/handle_excel.py +15 -11
  28. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/optimization_tools.py +3 -4
  29. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/path_manager.py +21 -15
  30. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/utilities.py +35 -26
  31. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/pyproject.toml +7 -3
  32. dragon_ml_toolbox-8.1.0/ml_tools/_ML_optimization_multi.py +0 -231
  33. dragon_ml_toolbox-8.1.0/ml_tools/_logger.py +0 -36
  34. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/LICENSE +0 -0
  35. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/LICENSE-THIRD-PARTY.md +0 -0
  36. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/README.md +0 -0
  37. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  38. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  39. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/__init__.py +0 -0
  40. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/_script_info.py +0 -0
  41. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/ml_tools/keys.py +0 -0
  42. {dragon_ml_toolbox-8.1.0 → dragon_ml_toolbox-9.0.0}/setup.cfg +0 -0
--- dragon_ml_toolbox-8.1.0/dragon_ml_toolbox.egg-info/PKG-INFO
+++ dragon_ml_toolbox-9.0.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 8.1.0
+Version: 9.0.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -17,6 +17,7 @@ Requires-Dist: pandas; extra == "base"
 Requires-Dist: numpy; extra == "base"
 Requires-Dist: polars; extra == "base"
 Requires-Dist: joblib; extra == "base"
+Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -37,6 +38,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -48,6 +50,7 @@ Requires-Dist: matplotlib; extra == "mice"
 Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
+Requires-Dist: colorlog; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -59,6 +62,7 @@ Requires-Dist: ipykernel; extra == "excel"
 Requires-Dist: notebook; extra == "excel"
 Requires-Dist: jupyterlab; extra == "excel"
 Requires-Dist: ipywidgets; extra == "excel"
+Requires-Dist: colorlog; extra == "excel"
 Provides-Extra: gui-boost
 Requires-Dist: numpy; extra == "gui-boost"
 Requires-Dist: joblib; extra == "gui-boost"

--- dragon_ml_toolbox-8.1.0/PKG-INFO
+++ dragon_ml_toolbox-9.0.0/dragon_ml_toolbox.egg-info/PKG-INFO

(Identical to the PKG-INFO diff above: the same version bump and the same four colorlog additions.)

--- dragon_ml_toolbox-8.1.0/dragon_ml_toolbox.egg-info/SOURCES.txt
+++ dragon_ml_toolbox-9.0.0/dragon_ml_toolbox.egg-info/SOURCES.txt
@@ -23,7 +23,6 @@ ml_tools/PSO_optimization.py
 ml_tools/RNN_forecast.py
 ml_tools/SQL.py
 ml_tools/VIF_factor.py
-ml_tools/_ML_optimization_multi.py
 ml_tools/__init__.py
 ml_tools/_logger.py
 ml_tools/_script_info.py

--- dragon_ml_toolbox-8.1.0/dragon_ml_toolbox.egg-info/requires.txt
+++ dragon_ml_toolbox-9.0.0/dragon_ml_toolbox.egg-info/requires.txt
@@ -19,12 +19,14 @@ shap
 tqdm
 Pillow
 evotorch
+colorlog

 [base]
 pandas
 numpy
 polars
 joblib
+colorlog

 [excel]
 pandas
@@ -34,6 +36,7 @@ ipykernel
 notebook
 jupyterlab
 ipywidgets
+colorlog

 [gui-boost]
 numpy
@@ -57,6 +60,7 @@ matplotlib
 statsmodels
 lightgbm<=4.5.0
 shap
+colorlog

 [nuitka]
 nuitka
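
The only new dependency in this release is colorlog, added to the base, ml, mice, and excel extras. A quick way to verify what an installed copy declares, using only the standard library (a sketch, not part of the diff):

from importlib.metadata import requires

# Each entry is a raw requirement string, including environment markers,
# e.g. 'colorlog; extra == "base"'.
for requirement in requires("dragon-ml-toolbox") or []:
    print(requirement)
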

--- dragon_ml_toolbox-8.1.0/ml_tools/ETL_engineering.py
+++ dragon_ml_toolbox-9.0.0/ml_tools/ETL_engineering.py
@@ -1,18 +1,22 @@
 import polars as pl
+import pandas as pd
 import re
+from pathlib import Path
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import warnings


 __all__ = [
+    "save_unique_values",
     "ColumnCleaner",
     "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
     "BinaryTransformer",
     "MultiBinaryDummifier",
+    "AutoDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -23,6 +27,80 @@ __all__ = [
     "DateFeatureExtractor"
 ]

+################ Unique Values per column #################
+def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+    """
+    Loads a CSV file, then analyzes it and saves the unique non-null values
+    from each column into a separate text file exactly as they appear.
+
+    This is useful for understanding the raw categories or range of values
+    within a dataset before cleaning.
+
+    Args:
+        csv_path (Union[str, Path]):
+            The file path to the input CSV file.
+        output_dir (Union[str, Path]):
+            The path to the directory where the .txt files will be saved.
+            The directory will be created if it does not exist.
+    """
+    # --- 1. Input Validation ---
+    csv_path = make_fullpath(input_path=csv_path, enforce="file")
+    output_dir = make_fullpath(input_path=output_dir, make=True)
+
+    # --- 2. Load Data ---
+    try:
+        # Load all columns as strings to preserve original formatting
+        df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
+    except FileNotFoundError as e:
+        _LOGGER.error(f"The file was not found at '{csv_path}'.")
+        raise e
+    except Exception as e2:
+        _LOGGER.error(f"An error occurred while reading the CSV file.")
+        raise e2
+    else:
+        _LOGGER.info(f"Data loaded from '{csv_path}'")
+
+    # --- 3. Process Each Column ---
+    for i, column_name in enumerate(df.columns):
+        _LOGGER.info(f"Processing column: '{column_name}'...")
+
+        # --- Get unique values AS IS ---
+        try:
+            # Drop nulls, get unique values, and sort them.
+            # The values are preserved exactly as they are in the cells.
+            unique_values = df[column_name].dropna().unique()
+            sorted_uniques = sorted(unique_values)
+        except Exception:
+            _LOGGER.exception(f"Could not process column '{column_name}'.")
+            continue
+
+        if not sorted_uniques:
+            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+            continue
+
+        # --- Sanitize column name to create a valid filename ---
+        sanitized_name = sanitize_filename(column_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+        file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+        # --- Write to file ---
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Unique values for column: '{column_name}'\n")
+                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                f.write("-" * 30 + "\n")
+                for value in sorted_uniques:
+                    f.write(f"{value}\n")
+                f.write("-" * 30 + "\n")
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values to '{file_path}'")
+
+    _LOGGER.info("Process complete.")
+
+
 ########## EXTRACT and CLEAN ##########
 class ColumnCleaner:
     """
@@ -60,16 +138,19 @@ class ColumnCleaner:
     """
     def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
         if not isinstance(column_name, str) or not column_name:
-            raise TypeError("The 'column_name' must be a non-empty string.")
+            _LOGGER.error("The 'column_name' must be a non-empty string.")
+            raise TypeError()
         if not isinstance(rules, dict):
-            raise TypeError("The 'rules' argument must be a dictionary.")
+            _LOGGER.error("The 'rules' argument must be a dictionary.")
+            raise TypeError()

         # Validate each regex pattern for correctness
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
-            except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+            except re.error:
+                _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
+                raise

         self.column_name = column_name
         self.rules = rules
@@ -94,20 +175,17 @@ class DataFrameCleaner:
     """
     def __init__(self, cleaners: List[ColumnCleaner]):
         if not isinstance(cleaners, list):
-            raise TypeError("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            raise TypeError()

         seen_columns = set()
         for cleaner in cleaners:
             if not isinstance(cleaner, ColumnCleaner):
-                raise TypeError(
-                    f"All items in 'cleaners' list must be ColumnCleaner objects, "
-                    f"but found an object of type {type(cleaner).__name__}."
-                )
+                _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
+                raise TypeError()
             if cleaner.column_name in seen_columns:
-                raise ValueError(
-                    f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. "
-                    "Each column should only have one cleaner."
-                )
+                _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
+                raise ValueError()
             seen_columns.add(cleaner.column_name)

         self.cleaners = cleaners
@@ -131,10 +209,10 @@ class DataFrameCleaner:
         missing_columns = rule_columns - df_columns

         if missing_columns:
-            raise ValueError(
-                f"The following columns specified in cleaning rules "
-                f"were not found in the DataFrame: {sorted(list(missing_columns))}"
-            )
+            _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
+            for miss_col in sorted(list(missing_columns)):
+                print(f"\t- {miss_col}")
+            raise ValueError()

         df_cleaned = df.clone()
@@ -153,7 +231,7 @@
             # Execute the expression chain for the column
             df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))

-        print(f"Cleaned {len(self.cleaners)} columns.")
+        _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")

         return df_cleaned
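For context, a sketch of how the two cleaners compose. The regex-to-replacement rules follow the Dict[str, str] signature shown above; the clean() entry point is an assumption, since this diff never shows the name of the method these hunks live in:

import polars as pl
from ml_tools.ETL_engineering import ColumnCleaner, DataFrameCleaner

# Each rule maps a regex pattern to its replacement string.
status_cleaner = ColumnCleaner(
    column_name="status",
    rules={r"^ok$": "OK", r"^fail(ed)?$": "FAILED"},
    case_insensitive=True,
)

df = pl.DataFrame({"status": ["ok", "failed", "OK"]})
cleaned = DataFrameCleaner(cleaners=[status_cleaner]).clean(df)  # assumed entry point
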
@@ -199,16 +277,20 @@ class TransformationRecipe:
        """
        # --- Validation ---
        if not isinstance(input_col_name, str) or not input_col_name:
-            raise TypeError("'input_col' must be a non-empty string.")
+            _LOGGER.error("'input_col' must be a non-empty string.")
+            raise TypeError()

        if transform == _RENAME:
            if not isinstance(output_col_names, str):
-                raise TypeError("For a RENAME operation, 'output_col' must be a string.")
+                _LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
+                raise TypeError()
        elif not isinstance(transform, Callable):
-            raise TypeError(f"'transform' must be a callable function or the string '{_RENAME}'.")
+            _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
+            raise TypeError()

        if isinstance(output_col_names, list) and transform == _RENAME:
-            raise ValueError("A RENAME operation cannot have a list of output columns.")
+            _LOGGER.error("A RENAME operation cannot have a list of output columns.")
+            raise ValueError()

        # --- Add Step ---
        step = {
@@ -243,9 +325,11 @@ class DataProcessor:
        been populated with transformation steps.
        """
        if not isinstance(recipe, TransformationRecipe):
-            raise TypeError("The recipe must be an instance of TransformationRecipe.")
+            _LOGGER.error("The recipe must be an instance of TransformationRecipe.")
+            raise TypeError()
        if len(recipe) == 0:
-            raise ValueError("The recipe cannot be empty.")
+            _LOGGER.error("The recipe cannot be empty.")
+            raise ValueError()
        self._recipe = recipe

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
@@ -260,7 +344,8 @@ class DataProcessor:
            transform_action = step["transform"]

            if input_col_name not in df.columns:
-                raise ValueError(f"Input column '{input_col_name}' not found in DataFrame.")
+                _LOGGER.error(f"Input column '{input_col_name}' not found in DataFrame.")
+                raise ValueError()

            input_series = df.get_column(input_col_name)
@@ -273,29 +358,44 @@

                if isinstance(result, pl.Series):
                    if not isinstance(output_col_spec, str):
-                        raise TypeError(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                        raise TypeError()
                    processed_columns.append(result.alias(output_col_spec))

                elif isinstance(result, pl.DataFrame):
-                    if not isinstance(output_col_spec, list):
-                        raise TypeError(f"Function for '{input_col_name}' returned a DataFrame but 'output_col' is not a list.")
-                    if len(result.columns) != len(output_col_spec):
-                        raise ValueError(
-                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
-                            f"but recipe specifies {len(output_col_spec)} output names."
-                        )
+                    # 1. Handle list-based renaming
+                    if isinstance(output_col_spec, list):
+                        if len(result.columns) != len(output_col_spec):
+                            _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
+                            raise ValueError()
+
+                        renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                        processed_columns.extend(renamed_df.get_columns())
+
+                    # 2. Handle a string prefix for AutoDummifier
+                    elif isinstance(output_col_spec, str):
+                        prefix = output_col_spec
+                        # Replace the original name part with the desired prefix.
+                        new_names = {
+                            col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
+                        }
+                        renamed_df = result.rename(new_names)
+                        processed_columns.extend(renamed_df.get_columns())

-                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
-                    processed_columns.extend(renamed_df.get_columns())
+                    else:
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                        raise TypeError()

                else:
-                    raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+                    _LOGGER.error(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+                    raise TypeError()

-            else: # This case is now unlikely due to builder validation.
-                raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+            else: # This case is unlikely due to builder validation.
+                _LOGGER.error(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+                raise TypeError()

        if not processed_columns:
-            _LOGGER.warning("⚠️ The transformation resulted in an empty DataFrame.")
+            _LOGGER.error("The transformation resulted in an empty DataFrame.")
            return pl.DataFrame()

        return pl.DataFrame(processed_columns)
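
The new string-prefix branch only swaps the input column's name for the given prefix in each dummy column produced by a one-to-many transform such as AutoDummifier. Its renaming rule, isolated as plain Polars:

import polars as pl

input_col_name = "color"
prefix = "hue"

result = pl.Series(input_col_name, ["red", "blue", "red"]).to_dummies()
# Rename 'color_blue'/'color_red' to 'hue_blue'/'hue_red'.
new_names = {col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns}
print(result.rename(new_names).columns)
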
@@ -365,18 +465,17 @@ class BinaryTransformer:
    ):
        # --- Validation: Enforce one and only one option ---
        if true_keywords is not None and false_keywords is not None:
-            raise ValueError(
-                "Provide either 'true_keywords' or 'false_keywords', but not both."
-            )
+            _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
+            raise ValueError()
        if true_keywords is None and false_keywords is None:
-            raise ValueError(
-                "You must provide either 'true_keywords' or 'false_keywords'."
-            )
+            _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+            raise ValueError()

        # --- Configuration ---
        self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
        if not self.keywords:
-            raise ValueError("Keyword list cannot be empty.")
+            _LOGGER.error("Keyword list cannot be empty.")
+            raise ValueError()

        self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
@@ -413,6 +512,27 @@
        return (~contains_keyword).cast(pl.UInt8)


+class AutoDummifier:
+    """
+    A transformer that performs one-hot encoding on a categorical column,
+    automatically detecting the unique categories from the data.
+    """
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+                Column names are auto-generated by Polars as
+                '{original_col_name}_{category_value}'.
+        """
+        # Ensure the column is treated as a string before creating dummies
+        return column.cast(pl.Utf8).to_dummies()
+
+
 class MultiBinaryDummifier:
    """
    A one-to-many transformer that creates multiple binary columns from a single
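
The new AutoDummifier in isolation (column and category names are illustrative):

import polars as pl
from ml_tools.ETL_engineering import AutoDummifier

dummies = AutoDummifier()(pl.Series("color", ["red", "blue", "red"]))
# -> one-hot DataFrame with columns 'color_blue' and 'color_red'
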
@@ -431,9 +551,11 @@ class MultiBinaryDummifier:
    """
    def __init__(self, keywords: List[str], case_insensitive: bool = True):
        if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
-            raise TypeError("The 'keywords' argument must be a list of strings.")
+            _LOGGER.error("The 'keywords' argument must be a list of strings.")
+            raise TypeError()
        if not keywords:
-            raise ValueError("The 'keywords' list cannot be empty.")
+            _LOGGER.error("The 'keywords' list cannot be empty.")
+            raise ValueError()

        self.keywords = keywords
        self.case_insensitive = case_insensitive
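
By contrast, MultiBinaryDummifier takes an explicit keyword list; the keywords and the callable invocation below are illustrative assumptions:

import polars as pl
from ml_tools.ETL_engineering import MultiBinaryDummifier

dummifier = MultiBinaryDummifier(keywords=["wifi", "parking", "pool"])
amenities = dummifier(pl.Series("amenities", ["wifi and pool", "parking"]))
# -> assumed: one 0/1 output column per keyword
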
@@ -493,7 +615,8 @@ class KeywordDummifier:
    """
    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
        if len(group_names) != len(group_keywords):
-            raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+            _LOGGER.error("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+            raise ValueError()

        self.group_names = group_names
        self.group_keywords = group_keywords
@@ -573,23 +696,28 @@ class NumberExtractor:
    ):
        # --- Validation ---
        if not isinstance(regex_pattern, str):
-            raise TypeError("regex_pattern must be a string.")
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()

        # Validate that the regex has exactly one capturing group
        try:
            if re.compile(regex_pattern).groups != 1:
-                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
        except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()

        if dtype not in ["float", "int"]:
-            raise ValueError("dtype must be either 'float' or 'int'.")
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()

        if round_digits is not None:
            if not isinstance(round_digits, int):
-                raise TypeError("round_digits must be an integer.")
+                _LOGGER.error("round_digits must be an integer.")
+                raise TypeError()
            if dtype == "int":
-                _LOGGER.warning(f"⚠️ 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
+                _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")

        self.regex_pattern = regex_pattern
        self.dtype = dtype
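
A construction sketch using the keyword arguments validated above (the pattern and the invocation are illustrative assumptions):

import polars as pl
from ml_tools.ETL_engineering import NumberExtractor

# Exactly one capturing group, as the validation requires.
extract_mg = NumberExtractor(regex_pattern=r"(\d+\.?\d*)\s*mg", dtype="float", round_digits=1)
doses = extract_mg(pl.Series("dose", ["12.5 mg", "5 mg", "n/a"]))
# -> assumed: Float64 series 12.5, 5.0, null
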
@@ -647,21 +775,26 @@ class MultiNumberExtractor:
    ):
        # --- Validation ---
        if not isinstance(num_outputs, int) or num_outputs <= 0:
-            raise ValueError("num_outputs must be a positive integer.")
+            _LOGGER.error("num_outputs must be a positive integer.")
+            raise ValueError()

        if not isinstance(regex_pattern, str):
-            raise TypeError("regex_pattern must be a string.")
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()

        # Validate that the regex has exactly one capturing group
        try:
            if re.compile(regex_pattern).groups != 1:
-                raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
        except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()

        # Validate dtype
        if dtype not in ["float", "int"]:
-            raise ValueError("dtype must be either 'float' or 'int'.")
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()

        self.num_outputs = num_outputs
        self.regex_pattern = regex_pattern
@@ -714,17 +847,14 @@ class RatioCalculator:
        try:
            compiled_pattern = re.compile(regex_pattern)
            if compiled_pattern.groups != 2:
-                raise ValueError(
-                    "RatioCalculator regex_pattern must contain exactly two "
-                    "capturing groups '(...)'."
-                )
+                _LOGGER.error("RatioCalculator regex_pattern must contain exactly two capturing groups '(...)'.")
+                raise ValueError()
            if compiled_pattern.groupindex:
-                raise ValueError(
-                    "RatioCalculator must be initialized with unnamed capturing groups "
-                    "(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
-                )
+                _LOGGER.error("RatioCalculator must be initialized with unnamed capturing groups (e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)').")
+                raise ValueError()
        except re.error as e:
-            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()

        self.regex_pattern = regex_pattern
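A sketch with two unnamed capturing groups, as the validation requires (the invocation and the computed ratios are assumptions):

import polars as pl
from ml_tools.ETL_engineering import RatioCalculator

ratio = RatioCalculator(regex_pattern=r"(\d+)\s*[:/]\s*(\d+)")
mix = ratio(pl.Series("mix", ["3:4", "1/2", "no ratio"]))
# -> assumed: 0.75, 0.5, null
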
@@ -768,7 +898,8 @@ class CategoryMapper:
        unseen_value: Optional[Union[int, float]] = None,
    ):
        if not isinstance(mapping, dict):
-            raise TypeError("The 'mapping' argument must be a dictionary.")
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()

        self.mapping = mapping
        self.default_value = unseen_value
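
A construction sketch using the two parameters shown above (mapping values are illustrative; the invocation is assumed):

import polars as pl
from ml_tools.ETL_engineering import CategoryMapper

mapper = CategoryMapper(mapping={"low": 0, "medium": 1, "high": 2}, unseen_value=-1)
levels = mapper(pl.Series("level", ["low", "high", "extreme"]))
# -> assumed: 0, 2, -1 (unmapped categories fall back to unseen_value)
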
@@ -829,7 +960,8 @@ class RegexMapper:
    ):
        # --- Validation ---
        if not isinstance(mapping, dict):
-            raise TypeError("The 'mapping' argument must be a dictionary.")
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()

        self.unseen_value = unseen_value
@@ -843,9 +975,11 @@ class RegexMapper:
            try:
                re.compile(final_pattern)
            except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
+                _LOGGER.error(f"Invalid regex pattern '{final_pattern}': {e}")
+                raise ValueError()
            if not isinstance(value, (int, float)):
-                raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+                _LOGGER.error(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+                raise TypeError()

            self.processed_mapping.append((final_pattern, value))
@@ -900,11 +1034,13 @@ class ValueBinner:
    ):
        # --- Validation ---
        if not isinstance(breaks, list) or len(breaks) < 2:
-            raise ValueError("The 'breaks' argument must be a list of at least two numbers.")
+            _LOGGER.error("The 'breaks' argument must be a list of at least two numbers.")
+            raise ValueError()

        # Check if the list is sorted
        if not all(breaks[i] <= breaks[i+1] for i in range(len(breaks)-1)):
-            raise ValueError("The 'breaks' list must be sorted in ascending order.")
+            _LOGGER.error("The 'breaks' list must be sorted in ascending order.")
+            raise ValueError()

        self.breaks = breaks
        self.left_closed = left_closed
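
A construction sketch; breaks must be sorted ascending and presumably define len(breaks) - 1 bins:

from ml_tools.ETL_engineering import ValueBinner

# Four edges -> three bins, e.g. [0, 18), [18, 65), [65, 120) when left-closed.
binner = ValueBinner(breaks=[0, 18, 65, 120], left_closed=True)
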
@@ -964,14 +1100,13 @@ class DateFeatureExtractor:
    ):
        # --- Validation ---
        if not isinstance(features, list) or not features:
-            raise ValueError("'features' must be a non-empty list of strings.")
+            _LOGGER.error("'features' must be a non-empty list of strings.")
+            raise ValueError()

        for feature in features:
            if feature not in self.ALLOWED_FEATURES:
-                raise ValueError(
-                    f"Feature '{feature}' is not supported. "
-                    f"Allowed features are: {self.ALLOWED_FEATURES}"
-                )
+                _LOGGER.error(f"Feature '{feature}' is not supported. Allowed features are: {self.ALLOWED_FEATURES}")
+                raise ValueError()

        self.features = features
        self.format = format
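
Finally, a construction sketch for DateFeatureExtractor; 'year' and 'month' are assumed to be members of ALLOWED_FEATURES, and the format string is illustrative:

from ml_tools.ETL_engineering import DateFeatureExtractor

# 'features' must come from DateFeatureExtractor.ALLOWED_FEATURES.
extractor = DateFeatureExtractor(features=["year", "month"], format="%Y-%m-%d")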