dragon-ml-toolbox 8.2.0__py3-none-any.whl → 9.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/METADATA +5 -1
- dragon_ml_toolbox-9.0.0.dist-info/RECORD +35 -0
- ml_tools/ETL_engineering.py +177 -79
- ml_tools/GUI_tools.py +5 -5
- ml_tools/MICE_imputation.py +12 -8
- ml_tools/ML_callbacks.py +6 -3
- ml_tools/ML_datasetmaster.py +37 -20
- ml_tools/ML_evaluation.py +4 -4
- ml_tools/ML_evaluation_multi.py +26 -17
- ml_tools/ML_inference.py +30 -23
- ml_tools/ML_models.py +14 -14
- ml_tools/ML_optimization.py +4 -3
- ml_tools/ML_scaler.py +7 -7
- ml_tools/ML_trainer.py +17 -15
- ml_tools/PSO_optimization.py +16 -8
- ml_tools/RNN_forecast.py +1 -1
- ml_tools/SQL.py +22 -13
- ml_tools/VIF_factor.py +7 -6
- ml_tools/_logger.py +105 -7
- ml_tools/custom_logger.py +12 -8
- ml_tools/data_exploration.py +20 -15
- ml_tools/ensemble_evaluation.py +10 -6
- ml_tools/ensemble_inference.py +18 -18
- ml_tools/ensemble_learning.py +8 -5
- ml_tools/handle_excel.py +15 -11
- ml_tools/optimization_tools.py +3 -4
- ml_tools/path_manager.py +21 -15
- ml_tools/utilities.py +35 -26
- dragon_ml_toolbox-8.2.0.dist-info/RECORD +0 -36
- ml_tools/_ML_optimization_multi.py +0 -231
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 8.2.0
+Version: 9.0.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -17,6 +17,7 @@ Requires-Dist: pandas; extra == "base"
 Requires-Dist: numpy; extra == "base"
 Requires-Dist: polars; extra == "base"
 Requires-Dist: joblib; extra == "base"
+Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -37,6 +38,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -48,6 +50,7 @@ Requires-Dist: matplotlib; extra == "mice"
 Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
+Requires-Dist: colorlog; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -59,6 +62,7 @@ Requires-Dist: ipykernel; extra == "excel"
 Requires-Dist: notebook; extra == "excel"
 Requires-Dist: jupyterlab; extra == "excel"
 Requires-Dist: ipywidgets; extra == "excel"
+Requires-Dist: colorlog; extra == "excel"
 Provides-Extra: gui-boost
 Requires-Dist: numpy; extra == "gui-boost"
 Requires-Dist: joblib; extra == "gui-boost"
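
Note: colorlog is added to every dependency group above, which matches the rewrite of ml_tools/_logger.py (+105 -7) listed in the file summary. The diff does not show the new logger implementation itself; the following is a minimal sketch of a colorlog-backed module logger, assuming a hypothetical logger name, format, and color scheme:

import logging
import colorlog

# Sketch only: the handler format, colors, and logger name below are
# illustrative assumptions, not the actual contents of ml_tools/_logger.py.
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
    "%(log_color)s[%(levelname)s]%(reset)s %(message)s",
    log_colors={
        "DEBUG": "cyan",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "bold_red",
    },
))
_LOGGER = logging.getLogger("dragon_ml_toolbox")  # hypothetical name
_LOGGER.addHandler(handler)
_LOGGER.setLevel(logging.INFO)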
dragon_ml_toolbox-9.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,35 @@
+dragon_ml_toolbox-9.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-9.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=SH8b9BSR79cib49YpIixjayaruD0qftnW7FV3xskoOs,44876
+ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
+ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
+ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
+ml_tools/ML_datasetmaster.py,sha256=CBZFpvm0qiY-8gP89iKTkd7jvU-rGQcJwk-_mBJmRSg,29273
+ml_tools/ML_evaluation.py,sha256=28JJ2M71p4pxniwav2Hv3b1a5dsvaoIYNLm-UJQuXvY,16002
+ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
+ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
+ml_tools/ML_models.py,sha256=Dl2mTMgVCtnNCSRlyqvMnInsKJVldS7vnBPimD-TnHo,27999
+ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
+ml_tools/ML_scaler.py,sha256=O8JzHr2551zPpKRRReEIMvq0lNAAPau6hV59KUMAySg,7420
+ml_tools/ML_trainer.py,sha256=xM-o-gbPhWXm2lOVXbeaTFotgJSDRSHyE7H0-9OOij4,23712
+ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
+ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
+ml_tools/SQL.py,sha256=WDgdZUYuLBUpv-4Am9XjVY_Aq_jxBWdLrbcgAIEwefI,10704
+ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
+ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
+ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
+ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
+ml_tools/data_exploration.py,sha256=hKA_3U-piJ8TtDWhzX_T2Awkg-25e0DC5E8qloqPo6w,27206
+ml_tools/ensemble_evaluation.py,sha256=xMEMfXJ5MjTkTfr1LkFOeD7iUtnVDCW3S9lm3zT-6tY,24778
+ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
+ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
+ml_tools/handle_excel.py,sha256=p5BpBS9vhBhz3lqkk_WQ9Ef7EGedf2dp2cl0yekeRy4,13065
+ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
+ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
+ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
+ml_tools/utilities.py,sha256=zzfYR7SUSb2rZILTNoCjl_pfLlPdHf4263atXuEb3iE,19341
+dragon_ml_toolbox-9.0.0.dist-info/METADATA,sha256=FWDN8U9RARbPxbCBVrv4ZHqJys-LVo7M3dlyVwKdh74,6941
+dragon_ml_toolbox-9.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-9.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-9.0.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py
CHANGED
@@ -1,11 +1,15 @@
 import polars as pl
+import pandas as pd
 import re
+from pathlib import Path
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
 
 
 __all__ = [
+    "save_unique_values",
     "ColumnCleaner",
     "DataFrameCleaner",
     "TransformationRecipe",
@@ -23,6 +27,80 @@ __all__ = [
     "DateFeatureExtractor"
 ]
 
+################ Unique Values per column #################
+def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+    """
+    Loads a CSV file, then analyzes it and saves the unique non-null values
+    from each column into a separate text file exactly as they appear.
+
+    This is useful for understanding the raw categories or range of values
+    within a dataset before cleaning.
+
+    Args:
+        csv_path (Union[str, Path]):
+            The file path to the input CSV file.
+        output_dir (Union[str, Path]):
+            The path to the directory where the .txt files will be saved.
+            The directory will be created if it does not exist.
+    """
+    # --- 1. Input Validation ---
+    csv_path = make_fullpath(input_path=csv_path, enforce="file")
+    output_dir = make_fullpath(input_path=output_dir, make=True)
+
+    # --- 2. Load Data ---
+    try:
+        # Load all columns as strings to preserve original formatting
+        df = pd.read_csv(csv_path, dtype=str, encoding='utf-8')
+    except FileNotFoundError as e:
+        _LOGGER.error(f"The file was not found at '{csv_path}'.")
+        raise e
+    except Exception as e2:
+        _LOGGER.error(f"An error occurred while reading the CSV file.")
+        raise e2
+    else:
+        _LOGGER.info(f"Data loaded from '{csv_path}'")
+
+    # --- 3. Process Each Column ---
+    for i, column_name in enumerate(df.columns):
+        _LOGGER.info(f"Processing column: '{column_name}'...")
+
+        # --- Get unique values AS IS ---
+        try:
+            # Drop nulls, get unique values, and sort them.
+            # The values are preserved exactly as they are in the cells.
+            unique_values = df[column_name].dropna().unique()
+            sorted_uniques = sorted(unique_values)
+        except Exception:
+            _LOGGER.exception(f"Could not process column '{column_name}'.")
+            continue
+
+        if not sorted_uniques:
+            _LOGGER.warning(f"Column '{column_name}' has no unique non-null values. Skipping.")
+            continue
+
+        # --- Sanitize column name to create a valid filename ---
+        sanitized_name = sanitize_filename(column_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+        file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+        # --- Write to file ---
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Unique values for column: '{column_name}'\n")
+                f.write(f"# Total unique non-null values: {len(sorted_uniques)}\n")
+                f.write("-" * 30 + "\n")
+                for value in sorted_uniques:
+                    f.write(f"{value}\n")
+                f.write("-" * 30 + "\n")
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values to '{file_path}'")
+
+    _LOGGER.info("Process complete.")
+
+
 ########## EXTRACT and CLEAN ##########
 class ColumnCleaner:
     """
@@ -60,16 +138,19 @@ class ColumnCleaner:
     """
     def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
        if not isinstance(column_name, str) or not column_name:
-
+            _LOGGER.error("The 'column_name' must be a non-empty string.")
+            raise TypeError()
         if not isinstance(rules, dict):
-
+            _LOGGER.error("The 'rules' argument must be a dictionary.")
+            raise TypeError()
 
         # Validate each regex pattern for correctness
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
-            except re.error
-
+            except re.error:
+                _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
+                raise
 
         self.column_name = column_name
         self.rules = rules
@@ -94,20 +175,17 @@ class DataFrameCleaner:
     """
     def __init__(self, cleaners: List[ColumnCleaner]):
         if not isinstance(cleaners, list):
-
+            _LOGGER.error("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+            raise TypeError()
 
         seen_columns = set()
         for cleaner in cleaners:
             if not isinstance(cleaner, ColumnCleaner):
-
-
-                    f"but found an object of type {type(cleaner).__name__}."
-                )
+                _LOGGER.error(f"All items in 'cleaners' list must be ColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
+                raise TypeError()
             if cleaner.column_name in seen_columns:
-
-
-                    "Each column should only have one cleaner."
-                )
+                _LOGGER.error(f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
+                raise ValueError()
             seen_columns.add(cleaner.column_name)
 
         self.cleaners = cleaners
@@ -131,10 +209,10 @@ class DataFrameCleaner:
         missing_columns = rule_columns - df_columns
 
         if missing_columns:
-
-
-                f"
-            )
+            _LOGGER.error("The following columns specified in cleaning rules were not found in the DataFrame:")
+            for miss_col in sorted(list(missing_columns)):
+                print(f"\t- {miss_col}")
+            raise ValueError()
 
         df_cleaned = df.clone()
 
@@ -153,7 +231,7 @@ class DataFrameCleaner:
             # Execute the expression chain for the column
             df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
 
-
+        _LOGGER.info(f"Cleaned {len(self.cleaners)} columns.")
 
         return df_cleaned
 
@@ -199,16 +277,20 @@ class TransformationRecipe:
         """
         # --- Validation ---
         if not isinstance(input_col_name, str) or not input_col_name:
-
+            _LOGGER.error("'input_col' must be a non-empty string.")
+            raise TypeError()
 
         if transform == _RENAME:
            if not isinstance(output_col_names, str):
-
+                _LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
+                raise TypeError()
         elif not isinstance(transform, Callable):
-
+            _LOGGER.error(f"'transform' must be a callable function or the string '{_RENAME}'.")
+            raise TypeError()
 
         if isinstance(output_col_names, list) and transform == _RENAME:
-
+            _LOGGER.error("A RENAME operation cannot have a list of output columns.")
+            raise ValueError()
 
         # --- Add Step ---
         step = {
@@ -243,9 +325,11 @@ class DataProcessor:
        been populated with transformation steps.
        """
        if not isinstance(recipe, TransformationRecipe):
-
+            _LOGGER.error("The recipe must be an instance of TransformationRecipe.")
+            raise TypeError()
        if len(recipe) == 0:
-
+            _LOGGER.error("The recipe cannot be empty.")
+            raise ValueError()
        self._recipe = recipe
 
    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
@@ -260,7 +344,8 @@ class DataProcessor:
            transform_action = step["transform"]
 
            if input_col_name not in df.columns:
-
+                _LOGGER.error(f"Input column '{input_col_name}' not found in DataFrame.")
+                raise ValueError()
 
            input_series = df.get_column(input_col_name)
 
@@ -273,17 +358,16 @@ class DataProcessor:
 
                if isinstance(result, pl.Series):
                    if not isinstance(output_col_spec, str):
-
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
+                        raise TypeError()
                    processed_columns.append(result.alias(output_col_spec))
 
                elif isinstance(result, pl.DataFrame):
                    # 1. Handle list-based renaming
                    if isinstance(output_col_spec, list):
                        if len(result.columns) != len(output_col_spec):
-
-
-                                f"but recipe specifies {len(output_col_spec)} output names."
-                            )
+                            _LOGGER.error(f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, but recipe specifies {len(output_col_spec)} output names.")
+                            raise ValueError()
 
                        renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
                        processed_columns.extend(renamed_df.get_columns())
@@ -299,19 +383,19 @@ class DataProcessor:
                        processed_columns.extend(renamed_df.get_columns())
 
                    else:
-
-
-                            f"so 'output_col' must be a list of names or a string prefix."
-                        )
+                        _LOGGER.error(f"Function for '{input_col_name}' returned a DataFrame, so 'output_col' must be a list of names or a string prefix.")
+                        raise TypeError()
 
                else:
-
+                    _LOGGER.error(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
+                    raise TypeError()
 
-            else: # This case is
-
+            else: # This case is unlikely due to builder validation.
+                _LOGGER.error(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
+                raise TypeError()
 
        if not processed_columns:
-            _LOGGER.
+            _LOGGER.error("The transformation resulted in an empty DataFrame.")
            return pl.DataFrame()
 
        return pl.DataFrame(processed_columns)
@@ -381,18 +465,17 @@ class BinaryTransformer:
    ):
        # --- Validation: Enforce one and only one option ---
        if true_keywords is not None and false_keywords is not None:
-
-
-            )
+            _LOGGER.error("Provide either 'true_keywords' or 'false_keywords', but not both.")
+            raise ValueError()
        if true_keywords is None and false_keywords is None:
-
-
-            )
+            _LOGGER.error("You must provide either 'true_keywords' or 'false_keywords'.")
+            raise ValueError()
 
        # --- Configuration ---
        self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
        if not self.keywords:
-
+            _LOGGER.error("Keyword list cannot be empty.")
+            raise ValueError()
 
        self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
 
@@ -468,9 +551,11 @@ class MultiBinaryDummifier:
    """
    def __init__(self, keywords: List[str], case_insensitive: bool = True):
        if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
-
+            _LOGGER.error("The 'keywords' argument must be a list of strings.")
+            raise TypeError()
        if not keywords:
-
+            _LOGGER.error("The 'keywords' list cannot be empty.")
+            raise ValueError()
 
        self.keywords = keywords
        self.case_insensitive = case_insensitive
@@ -530,7 +615,8 @@ class KeywordDummifier:
    """
    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
        if len(group_names) != len(group_keywords):
-
+            _LOGGER.error("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
+            raise ValueError()
 
        self.group_names = group_names
        self.group_keywords = group_keywords
@@ -610,23 +696,28 @@ class NumberExtractor:
    ):
        # --- Validation ---
        if not isinstance(regex_pattern, str):
-
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()
 
        # Validate that the regex has exactly one capturing group
        try:
            if re.compile(regex_pattern).groups != 1:
-
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
        except re.error as e:
-
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
        if dtype not in ["float", "int"]:
-
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()
 
        if round_digits is not None:
            if not isinstance(round_digits, int):
-
+                _LOGGER.error("round_digits must be an integer.")
+                raise TypeError()
            if dtype == "int":
-                _LOGGER.warning(f"
+                _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
 
        self.regex_pattern = regex_pattern
        self.dtype = dtype
@@ -684,21 +775,26 @@ class MultiNumberExtractor:
    ):
        # --- Validation ---
        if not isinstance(num_outputs, int) or num_outputs <= 0:
-
+            _LOGGER.error("num_outputs must be a positive integer.")
+            raise ValueError()
 
        if not isinstance(regex_pattern, str):
-
+            _LOGGER.error("regex_pattern must be a string.")
+            raise TypeError()
 
        # Validate that the regex has exactly one capturing group
        try:
            if re.compile(regex_pattern).groups != 1:
-
+                _LOGGER.error("regex_pattern must contain exactly one capturing group '(...)'")
+                raise ValueError()
        except re.error as e:
-
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
        # Validate dtype
        if dtype not in ["float", "int"]:
-
+            _LOGGER.error("dtype must be either 'float' or 'int'.")
+            raise ValueError()
 
        self.num_outputs = num_outputs
        self.regex_pattern = regex_pattern
@@ -751,17 +847,14 @@ class RatioCalculator:
        try:
            compiled_pattern = re.compile(regex_pattern)
            if compiled_pattern.groups != 2:
-
-
-                    "capturing groups '(...)'."
-                )
+                _LOGGER.error("RatioCalculator regex_pattern must contain exactly two capturing groups '(...)'.")
+                raise ValueError()
            if compiled_pattern.groupindex:
-
-
-                    "(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
-                )
+                _LOGGER.error("RatioCalculator must be initialized with unnamed capturing groups (e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)').")
+                raise ValueError()
        except re.error as e:
-
+            _LOGGER.error(f"Invalid regex pattern provided: {e}")
+            raise ValueError()
 
        self.regex_pattern = regex_pattern
 
@@ -805,7 +898,8 @@ class CategoryMapper:
        unseen_value: Optional[Union[int, float]] = None,
    ):
        if not isinstance(mapping, dict):
-
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()
 
        self.mapping = mapping
        self.default_value = unseen_value
@@ -866,7 +960,8 @@ class RegexMapper:
    ):
        # --- Validation ---
        if not isinstance(mapping, dict):
-
+            _LOGGER.error("The 'mapping' argument must be a dictionary.")
+            raise TypeError()
 
        self.unseen_value = unseen_value
 
@@ -880,9 +975,11 @@ class RegexMapper:
            try:
                re.compile(final_pattern)
            except re.error as e:
-
+                _LOGGER.error(f"Invalid regex pattern '{final_pattern}': {e}")
+                raise ValueError()
            if not isinstance(value, (int, float)):
-
+                _LOGGER.error(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+                raise TypeError()
 
            self.processed_mapping.append((final_pattern, value))
 
@@ -937,11 +1034,13 @@ class ValueBinner:
    ):
        # --- Validation ---
        if not isinstance(breaks, list) or len(breaks) < 2:
-
+            _LOGGER.error("The 'breaks' argument must be a list of at least two numbers.")
+            raise ValueError()
 
        # Check if the list is sorted
        if not all(breaks[i] <= breaks[i+1] for i in range(len(breaks)-1)):
-
+            _LOGGER.error("The 'breaks' list must be sorted in ascending order.")
+            raise ValueError()
 
        self.breaks = breaks
        self.left_closed = left_closed
@@ -1001,14 +1100,13 @@ class DateFeatureExtractor:
    ):
        # --- Validation ---
        if not isinstance(features, list) or not features:
-
+            _LOGGER.error("'features' must be a non-empty list of strings.")
+            raise ValueError()
 
        for feature in features:
            if feature not in self.ALLOWED_FEATURES:
-
-
-                    f"Allowed features are: {self.ALLOWED_FEATURES}"
-                )
+                _LOGGER.error(f"Feature '{feature}' is not supported. Allowed features are: {self.ALLOWED_FEATURES}")
+                raise ValueError()
 
        self.features = features
        self.format = format
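
The new save_unique_values helper above is self-contained; a minimal usage sketch follows (the file paths are hypothetical):

from pathlib import Path
from ml_tools.ETL_engineering import save_unique_values

# Writes one '<column>_unique_values.txt' file per column, listing the raw
# unique non-null values exactly as they appear in the CSV.
save_unique_values(
    csv_path=Path("data/raw_survey.csv"),      # hypothetical input file
    output_dir=Path("reports/unique_values"),  # created if it does not exist
)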
ml_tools/GUI_tools.py
CHANGED
@@ -88,7 +88,7 @@ class ConfigManager:
 
        path = Path(file_path)
        if path.exists():
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"Configuration file already exists at {path}, or wrong path provided. Aborting.")
            return
 
        config = configparser.ConfigParser()
@@ -150,7 +150,7 @@ class ConfigManager:
 
        with open(path, 'w') as configfile:
            config.write(configfile)
-        _LOGGER.info(f"
+        _LOGGER.info(f"Successfully generated config template at: '{path}'")
 
 
 # --- GUI Factory ---
@@ -442,14 +442,14 @@ def catch_exceptions(show_popup: bool = True):
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
-            except Exception
+            except Exception:
                # Format the full traceback to give detailed error info
-                error_msg = traceback.format_exc()
                if show_popup:
+                    error_msg = traceback.format_exc()
                    sg.popup_error("An error occurred:", error_msg, title="Error")
                else:
                    # Fallback for non-GUI contexts or if popup is disabled
-                    _LOGGER.error
+                    _LOGGER.exception("An error occurred.")
        return wrapper
    return decorator
 
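
With this change, the traceback is only formatted when a popup will actually display it; otherwise the decorator defers to _LOGGER.exception, which records the traceback itself. A minimal usage sketch (the decorated function is hypothetical):

from ml_tools.GUI_tools import catch_exceptions

@catch_exceptions(show_popup=False)  # headless mode: errors are logged, no popup
def risky_update():
    return 1 / 0  # deliberately fails

risky_update()  # the ZeroDivisionError is caught and logged with its traceback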
ml_tools/MICE_imputation.py
CHANGED
@@ -38,13 +38,14 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
    imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
 
    if imputed_datasets is None or len(imputed_datasets) == 0:
-
+        _LOGGER.error("No imputed datasets were generated. Check the MICE process.")
+        raise ValueError()
 
    # threshold binary columns
    if binary_columns is not None:
        invalid_binary_columns = set(binary_columns) - set(df.columns)
        if invalid_binary_columns:
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"These 'binary columns' are not in the dataset:")
            for invalid_binary_col in invalid_binary_columns:
                print(f" - {invalid_binary_col}")
            valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
@@ -63,7 +64,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
        assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
        # print("✅ All imputed datasets match the original DataFrame indexes.")
 
-    _LOGGER.info("
+    _LOGGER.info("MICE imputation complete.")
 
    return kernel, imputed_datasets, imputed_dataset_names
 
@@ -95,7 +96,8 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
    dataset_count = kernel.num_datasets
 
    if dataset_count != len(imputed_dataset_names):
-
+        _LOGGER.error(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+        raise ValueError()
 
    # Check path
    root_path = make_fullpath(root_dir, make=True)
@@ -133,7 +135,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
        plt.savefig(save_path, bbox_inches='tight', format="svg")
        plt.close()
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"{dataset_file_dir} process completed.")
 
 
 # Imputed distributions
@@ -157,7 +159,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
        """Helper function to add labels and legends to a figure"""
 
        if not isinstance(fig, ggplot):
-
+            _LOGGER.error(f"Expected a plotnine.ggplot object, received {type(fig)}.")
+            raise TypeError()
 
        # Edit labels and title
        fig = fig + theme(
@@ -171,7 +174,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
        fig = fig.draw()
 
        if not hasattr(fig, 'axes') or len(fig.axes) == 0:
-
+            _LOGGER.error("Rendered figure has no axes to modify.")
+            raise RuntimeError()
 
        if filename == "Combined_Distributions":
            custom_xlabel = "Feature Values"
@@ -218,7 +222,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
        fig = kernel.plot_imputed_distributions(variables=[feature])
        _process_figure(fig, feature)
 
-    _LOGGER.info(f"
+    _LOGGER.info(f"{local_dir_name} completed.")
 
 
 def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
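
The validation changes above surface through the normal entry points. A minimal usage sketch of apply_mice, based only on the parameters and return values visible in this diff (other parameters may exist, and the input file is hypothetical):

import pandas as pd
from ml_tools.MICE_imputation import apply_mice

df = pd.read_csv("data/study.csv")  # hypothetical dataset
kernel, imputed_datasets, imputed_dataset_names = apply_mice(
    df=df,
    df_name="study",
    # Columns thresholded back to binary after imputation; a warning is
    # logged for any listed column that is missing from df.
    binary_columns=["smoker", "treated"],
)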
|