dragon-ml-toolbox 5.2.2__py3-none-any.whl → 5.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/METADATA +2 -2
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/RECORD +12 -12
- ml_tools/MICE_imputation.py +4 -0
- ml_tools/ML_callbacks.py +5 -5
- ml_tools/ML_datasetmaster.py +20 -12
- ml_tools/ML_optimization.py +1 -0
- ml_tools/data_exploration.py +21 -20
- ml_tools/utilities.py +11 -10
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 5.2.2
+Version: 5.3.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -18,7 +18,7 @@ Requires-Dist: numpy; extra == "base"
 Requires-Dist: polars; extra == "base"
 Requires-Dist: joblib; extra == "base"
 Provides-Extra: ml
-Requires-Dist: numpy; extra == "ml"
+Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
 Requires-Dist: polars; extra == "ml"
 Requires-Dist: joblib; extra == "ml"
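The tightened numpy>=2.0 pin on the "ml" extra lines up with the np.inf edits in ml_tools/ML_callbacks.py further down: NumPy 2.0 removed the capitalized np.Inf/np.NaN aliases, leaving only the lowercase constants. A minimal check, independent of the package, of the spelling this release standardizes on:

```python
import numpy as np

# NumPy 2.0 keeps only the lowercase constants; the np.Inf alias is gone.
best = np.inf                      # sentinel for "no best value seen yet"
print(best == float("inf"))        # True
print(np.isinf(-np.inf))           # True
```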
{dragon_ml_toolbox-5.2.2.dist-info → dragon_ml_toolbox-5.3.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-dragon_ml_toolbox-5.
-dragon_ml_toolbox-5.
+dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
-ml_tools/MICE_imputation.py,sha256=
-ml_tools/ML_callbacks.py,sha256=
-ml_tools/ML_datasetmaster.py,sha256=
+ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
+ml_tools/ML_callbacks.py,sha256=eOCSc-1_e5vC2dQN1ydHGKDLeJ3DqB-eLRLuXp2DpFM,13257
+ml_tools/ML_datasetmaster.py,sha256=bbKCNA_b_uDIfxP9YIYKZm-VSfUSD15LvegFxpE9DIQ,34315
 ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
 ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
 ml_tools/ML_models.py,sha256=SJhKHGAN2VTBqzcHUOpFWuVZ2Y7U1M4P_axG_LNYWcI,6460
-ml_tools/ML_optimization.py,sha256=
+ml_tools/ML_optimization.py,sha256=zGKpWW4SL1-3iiHglDP-dkuADL73T0kxs3Dc-Lyishs,9671
 ml_tools/ML_trainer.py,sha256=t58Ka6ryaYm0Fi5xje-e-fkmz9DwDLIeJLbh04n_gDg,15034
 ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
 ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
@@ -18,15 +18,15 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
 ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
 ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
 ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
 ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
 ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
-ml_tools/utilities.py,sha256=
-dragon_ml_toolbox-5.
-dragon_ml_toolbox-5.
-dragon_ml_toolbox-5.
-dragon_ml_toolbox-5.
+ml_tools/utilities.py,sha256=T5xbxzBr14odUj7KncSeg-tJzqjmSDLOOmxEaGYLLi4,18447
+dragon_ml_toolbox-5.3.0.dist-info/METADATA,sha256=Lu_JBMfkCPssLk-a2v4b-oZu86cFK1OIB4HtHspVRIk,6643
+dragon_ml_toolbox-5.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-5.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-5.3.0.dist-info/RECORD,,
ml_tools/MICE_imputation.py
CHANGED
@@ -29,6 +29,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
 random_state=random_state
 )
 
+_LOGGER.info("➡️ MICE imputation running...")
+
 # Perform MICE with n iterations per dataset
 kernel.mice(iterations)
 
@@ -61,6 +63,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
 assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
 # print("✅ All imputed datasets match the original DataFrame indexes.")
 
+_LOGGER.info("✅ MICE imputation complete.")
+
 return kernel, imputed_datasets, imputed_dataset_names
 
 
ml_tools/ML_callbacks.py
CHANGED
@@ -148,13 +148,13 @@ class EarlyStopping(Callback):
 else: # Default to min mode for loss or other metrics
 self.monitor_op = np.less
 
-self.best = np.
+self.best = np.inf if self.monitor_op == np.less else -np.inf
 
 def on_train_begin(self, logs=None):
 # Reset state at the beginning of training
 self.wait = 0
 self.stopped_epoch = 0
-self.best = np.
+self.best = np.inf if self.monitor_op == np.less else -np.inf
 
 def on_epoch_end(self, epoch, logs=None):
 current = logs.get(self.monitor) # type: ignore
@@ -228,11 +228,11 @@ class ModelCheckpoint(Callback):
 else:
 self.monitor_op = np.less if 'loss' in self.monitor else np.greater
 
-self.best = np.
+self.best = np.inf if self.monitor_op == np.less else -np.inf
 
 def on_train_begin(self, logs=None):
 """Reset state when training starts."""
-self.best = np.
+self.best = np.inf if self.monitor_op == np.less else -np.inf
 self.saved_checkpoints = []
 self.last_best_filepath = None
 
@@ -251,7 +251,7 @@ class ModelCheckpoint(Callback):
 return
 
 if self.monitor_op(current, self.best):
-old_best_str = f"{self.best:.4f}" if self.best not in [np.
+old_best_str = f"{self.best:.4f}" if self.best not in [np.inf, -np.inf] else "inf"
 
 # Create a descriptive filename
 filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
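A standalone sketch, not the library's full callback classes, of the best-value bookkeeping that these hunks now seed with np.inf / -np.inf:

```python
import numpy as np

# When the monitored metric should decrease (e.g. a loss), start from +inf so the
# first epoch always registers as an improvement; for "max" metrics start from -inf.
monitor_op = np.less                                  # "min" mode
best = np.inf if monitor_op == np.less else -np.inf

for epoch, current in enumerate([0.90, 0.75, 0.80]):
    if monitor_op(current, best):                     # strictly better than the best so far
        print(f"epoch {epoch}: improved {best:.4f} -> {current:.4f}")
        best = current
```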
ml_tools/ML_datasetmaster.py
CHANGED
@@ -128,17 +128,18 @@ class DatasetMaker(_BaseMaker):
 - Automated (single call):
 ```python
 maker = DatasetMaker(df, label_col='target')
-maker.
+maker.auto_process() # uses simplified arguments
 train_ds, test_ds = maker.get_datasets()
 ```
 """
-def __init__(self, pandas_df: pandas.DataFrame, label_col: str):
+def __init__(self, pandas_df: pandas.DataFrame, label_col: str, kind: Literal["regression", "classification"]):
 super().__init__()
 if not isinstance(pandas_df, pandas.DataFrame):
 raise TypeError("Input must be a pandas.DataFrame.")
 if label_col not in pandas_df.columns:
 raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
+
+self.kind = kind
 self.labels = pandas_df[label_col]
 self.features = pandas_df.drop(columns=label_col)
 self.labels_map = None
@@ -277,7 +278,7 @@ class DatasetMaker(_BaseMaker):
 _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
 return self
 
-def
+def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
 balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
 """Runs a standard, fully automated preprocessing pipeline."""
 _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
@@ -334,8 +335,10 @@ class DatasetMaker(_BaseMaker):
 if not self._is_split:
 raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
 
-
-
+label_dtype = torch.float32 if self.kind == "regression" else torch.int64
+
+self._train_dataset = _PytorchDataset(self.features_train, self.labels_train, labels_dtype=label_dtype) # type: ignore
+self._test_dataset = _PytorchDataset(self.features_test, self.labels_test, labels_dtype=label_dtype) # type: ignore
 
 return self._train_dataset, self._test_dataset
 
@@ -382,12 +385,13 @@ class SimpleDatasetMaker:
 
 Args:
 pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
 test_size (float): The proportion of the dataset to allocate to the
 test split.
 random_state (int): The seed for the random number generator for
 reproducibility.
 """
-def __init__(self, pandas_df: pandas.DataFrame, test_size: float = 0.2, random_state: int = 42):
+def __init__(self, pandas_df: pandas.DataFrame, kind: Literal["regression", "classification"], test_size: float = 0.2, random_state: int = 42):
 """
 Attributes:
 `train_dataset` -> PyTorch Dataset
@@ -398,9 +402,11 @@ class SimpleDatasetMaker:
 
 The ID can be manually set to any string if needed, it is `None` by default.
 """
-
+# Validation
 if not isinstance(pandas_df, pandas.DataFrame):
-raise TypeError("Input must be a pandas.DataFrame.")
+raise TypeError("Input must be a pandas.DataFrame.")
+if kind not in ["regression", "classification"]:
+raise ValueError("`kind` must be 'regression' or 'classification'.")
 
 # 1. Identify features and target
 features = pandas_df.iloc[:, :-1]
@@ -422,9 +428,11 @@ class SimpleDatasetMaker:
 self._y_train_shape = y_train.shape
 self._y_test_shape = y_test.shape
 
-# 3. Convert to PyTorch Datasets
-
-
+# 3. Convert to PyTorch Datasets with the correct label dtype
+label_dtype = torch.float32 if kind == "regression" else torch.int64
+
+self._train_ds = _PytorchDataset(X_train.values, y_train.values, labels_dtype=label_dtype)
+self._test_ds = _PytorchDataset(X_test.values, y_test.values, labels_dtype=label_dtype)
 
 @property
 def train_dataset(self) -> Dataset:
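A hypothetical usage sketch of the new `kind` argument added to DatasetMaker and SimpleDatasetMaker; the constructor calls are commented out because the import path and remaining defaults are not shown in this diff, while the dtype rule itself is the one introduced above:

```python
import pandas as pd
import torch

df = pd.DataFrame({"x1": [0.1, 0.5, 0.9, 0.2],
                   "x2": [1.0, 0.0, 1.0, 0.0],
                   "target": [0, 1, 1, 0]})

# from ml_tools.ML_datasetmaster import SimpleDatasetMaker   # assumed import path
# maker = SimpleDatasetMaker(df, kind="classification")      # labels cast to torch.int64
# maker = SimpleDatasetMaker(df, kind="regression")          # labels cast to torch.float32

# The label-dtype rule added in 5.3.0, shown on its own:
kind = "classification"
label_dtype = torch.float32 if kind == "regression" else torch.int64
print(label_dtype)  # torch.int64
```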
ml_tools/ML_optimization.py
CHANGED
ml_tools/data_exploration.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Union, Literal, Dict, Tuple, List, Optional
 from pathlib import Path
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
+from ._logger import _LOGGER
 import re
 
 
@@ -55,7 +56,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
 ].round(round_digits)
 summary = summary.join(summary_numeric, how='left')
 
-print(f"Shape: {df.shape}")
+print(f"DataFrame Shape: {df.shape}")
 return summary
 
 
@@ -98,7 +99,7 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
 
 dropped_columns = original_columns - set(cols_to_keep)
 if verbose:
-
+_LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
 if dropped_columns:
 for dropped_column in dropped_columns:
 print(f"  {dropped_column}")
@@ -129,10 +130,10 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
 valid_targets = _validate_columns(df_clean, targets)
 target_na = df_clean[valid_targets].isnull().all(axis=1)
 if target_na.any():
-
+_LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
 df_clean = df_clean[~target_na]
 else:
-
+_LOGGER.info("✅ No rows with all targets missing.")
 else:
 valid_targets = []
 
@@ -142,12 +143,12 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
 feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
 rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
 if len(rows_to_drop) > 0:
-
+_LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
 df_clean = df_clean.drop(index=rows_to_drop)
 else:
-
+_LOGGER.info(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
 else:
-
+_LOGGER.warning("⚠️ No feature columns available to evaluate.")
 
 return df_clean
 
@@ -207,7 +208,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
 cols_to_drop = missing_fraction[missing_fraction > threshold].index
 
 if len(cols_to_drop) > 0:
-
+_LOGGER.info(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
 print(list(cols_to_drop))
 
 result_df = df.drop(columns=cols_to_drop)
@@ -216,7 +217,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
 
 return result_df
 else:
-
+_LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
 return df
 
 
@@ -311,7 +312,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
 """
 numeric_df = df.select_dtypes(include='number')
 if numeric_df.empty:
-
+_LOGGER.warning("⚠️ No numeric columns found. Heatmap not generated.")
 return
 
 corr = numeric_df.corr(method=method)
@@ -348,7 +349,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
 full_path = save_path / plot_title
 
 plt.savefig(full_path, bbox_inches="tight", format='svg')
-
+_LOGGER.info(f"Saved correlation heatmap: '{plot_title}'")
 
 plt.show()
 plt.close()
@@ -454,7 +455,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_t
 _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
 _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")
 
-
+_LOGGER.info(f"Saved {saved_plots} value distribution plots.")
 
 
 def clip_outliers_single(
@@ -479,17 +480,17 @@ def clip_outliers_single(
 None: if a problem with the dataframe column occurred.
 """
 if column not in df.columns:
-
+_LOGGER.warning(f"⚠️ Column '{column}' not found in DataFrame.")
 return None
 
 if not pd.api.types.is_numeric_dtype(df[column]):
-
+_LOGGER.warning(f"⚠️ Column '{column}' must be numeric.")
 return None
 
 new_df = df.copy(deep=True)
 new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)
 
-
+_LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
 return new_df
 
 
@@ -539,10 +540,10 @@ def clip_outliers_multi(
 skipped_columns.append((col, str(e)))
 continue
 
-
+_LOGGER.info(f"Clipped {clipped_columns} columns.")
 
 if skipped_columns:
-
+_LOGGER.warning("⚠️ Skipped columns:")
 for col, msg in skipped_columns:
 print(f" - {col}: {msg}")
 
@@ -574,7 +575,7 @@ def match_and_filter_columns_by_regex(
 matched_columns = df.columns[mask].to_list()
 filtered_df = df.loc[:, mask]
 
-
+_LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")
 
 return filtered_df, matched_columns
 
@@ -628,11 +629,11 @@ def standardize_percentages(
 for col in columns:
 # --- Robustness Checks ---
 if col not in df_copy.columns:
-
+_LOGGER.warning(f"⚠️ Column '{col}' not found. Skipping.")
 continue
 
 if not is_numeric_dtype(df_copy[col]):
-
+_LOGGER.warning(f"⚠️ Column '{col}' is not numeric. Skipping.")
 continue
 
 # --- Applying the Logic ---
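The bare print calls replaced throughout this module now route through the shared _LOGGER imported from ml_tools._logger. A minimal stand-in (the real module's handler and format configuration are not shown in this diff) illustrating the pattern:

```python
import logging

# Stand-in for ml_tools._logger._LOGGER; the actual configuration may differ.
logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
_LOGGER = logging.getLogger("ml_tools")

dropped_columns = {"const_a", "const_b"}
_LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")   # routine progress
_LOGGER.warning("⚠️ No feature columns available to evaluate.")        # recoverable problems
```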
ml_tools/utilities.py
CHANGED
@@ -8,6 +8,7 @@ import joblib
 from joblib.externals.loky.process_executor import TerminatedWorkerError
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
+from ._logger import _LOGGER
 
 
 # Keep track of available tools
@@ -81,7 +82,7 @@ def load_dataframe(
 raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")
 
 if verbose:
-
+_LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
 
 return df, df_name
 
@@ -166,7 +167,7 @@ def merge_dataframes(
 merged_df = merged_df.reset_index(drop=True)
 
 if verbose:
-
+_LOGGER.info(f"✅ Merged DataFrame shape: {merged_df.shape}")
 
 return merged_df
 
@@ -185,7 +186,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
 """
 # This check works for both pandas and polars
 if df.shape[0] == 0:
-
+_LOGGER.warning(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
 return
 
 # Create the directory if it doesn't exist
@@ -207,7 +208,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
 # This error handles cases where an unsupported type is passed
 raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
 
-
+_LOGGER.info(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
 
 
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -382,11 +383,11 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
 if raise_on_error:
 raise Exception(message)
 else:
-
+_LOGGER.warning(message)
 return None
 else:
 if verbose:
-
+_LOGGER.info(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
 return None
 
 
@@ -409,11 +410,11 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
 if raise_on_error:
 raise Exception(message)
 else:
-
+_LOGGER.warning(message)
 return None
 else:
 if verbose:
-
+_LOGGER.info(f"✅ Loaded object of type '{type(obj)}'")
 return obj
 
 
@@ -500,10 +501,10 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
 save_dataframe(df=df, save_dir=save_dir, filename=filename)
 total_saved += 1
 except Exception as e:
-
+_LOGGER.warning(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
 continue
 
-
+_LOGGER.info(f"✅ {total_saved} single-target datasets were created.")
 
 
 def info():
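ml_tools/utilities.py adopts the same logger for its I/O helpers. A hypothetical round trip; only the save_dataframe keyword arguments appear in this diff, so the calls are commented out rather than asserted as the real signatures:

```python
import pandas as pd
# from ml_tools.utilities import save_dataframe, load_dataframe   # assumed import path

df = pd.DataFrame({"feature": [1, 2, 3], "target": [0, 1, 0]})

# save_dataframe(df=df, save_dir="./data", filename="demo")
#   -> logs: "✅ Saved dataset: 'demo' with shape: (3, 2)"
# loaded_df, df_name = load_dataframe(...)   # returns the DataFrame and its name
#   -> logs: "💾 Loaded <KIND> dataset: 'demo' with shape: (3, 2)"
```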
Files without changes: WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt.