dragon-ml-toolbox 8.2.0__py3-none-any.whl → 9.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox has been flagged as possibly problematic; see the registry page for details.
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/METADATA +5 -1
- dragon_ml_toolbox-9.0.0.dist-info/RECORD +35 -0
- ml_tools/ETL_engineering.py +177 -79
- ml_tools/GUI_tools.py +5 -5
- ml_tools/MICE_imputation.py +12 -8
- ml_tools/ML_callbacks.py +6 -3
- ml_tools/ML_datasetmaster.py +37 -20
- ml_tools/ML_evaluation.py +4 -4
- ml_tools/ML_evaluation_multi.py +26 -17
- ml_tools/ML_inference.py +30 -23
- ml_tools/ML_models.py +14 -14
- ml_tools/ML_optimization.py +4 -3
- ml_tools/ML_scaler.py +7 -7
- ml_tools/ML_trainer.py +17 -15
- ml_tools/PSO_optimization.py +16 -8
- ml_tools/RNN_forecast.py +1 -1
- ml_tools/SQL.py +22 -13
- ml_tools/VIF_factor.py +7 -6
- ml_tools/_logger.py +105 -7
- ml_tools/custom_logger.py +12 -8
- ml_tools/data_exploration.py +20 -15
- ml_tools/ensemble_evaluation.py +10 -6
- ml_tools/ensemble_inference.py +18 -18
- ml_tools/ensemble_learning.py +8 -5
- ml_tools/handle_excel.py +15 -11
- ml_tools/optimization_tools.py +3 -4
- ml_tools/path_manager.py +21 -15
- ml_tools/utilities.py +35 -26
- dragon_ml_toolbox-8.2.0.dist-info/RECORD +0 -36
- ml_tools/_ML_optimization_multi.py +0 -231
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.2.0.dist-info → dragon_ml_toolbox-9.0.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_trainer.py
CHANGED
@@ -76,10 +76,10 @@ class MLTrainer:
         """Validates the selected device and returns a torch.device object."""
         device_lower = device.lower()
         if "cuda" in device_lower and not torch.cuda.is_available():
-            _LOGGER.warning("
+            _LOGGER.warning("CUDA not available, switching to CPU.")
             device = "cpu"
         elif device_lower == "mps" and not torch.backends.mps.is_available():
-            _LOGGER.warning("
+            _LOGGER.warning("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
             device = "cpu"
         return torch.device(device)

@@ -275,7 +275,8 @@ class MLTrainer:
             dataset_for_names = data
         else: # data is None, use the trainer's default test dataset
             if self.test_dataset is None:
-
+                _LOGGER.error("Cannot evaluate. No data provided and no test_dataset available in the trainer.")
+                raise ValueError()
             # Create a fresh DataLoader from the test_dataset
             eval_loader = DataLoader(self.test_dataset,
                                      batch_size=32,

@@ -285,7 +286,8 @@ class MLTrainer:
             dataset_for_names = self.test_dataset

         if eval_loader is None:
-
+            _LOGGER.error("Cannot evaluate. No valid data was provided or found.")
+            raise ValueError()

         print("\n--- Model Evaluation ---")

@@ -296,7 +298,7 @@ class MLTrainer:
             if y_true_b is not None: all_true.append(y_true_b)

         if not all_true:
-            _LOGGER.error("
+            _LOGGER.error("Evaluation failed: No data was processed.")
             return

         y_pred = np.concatenate(all_preds)

@@ -316,7 +318,7 @@ class MLTrainer:
             except AttributeError:
                 num_targets = y_true.shape[1]
                 target_names = [f"target_{i}" for i in range(num_targets)]
-                _LOGGER.warning(f"
+                _LOGGER.warning(f"Dataset has no 'target_names' attribute. Using generic names.")
             multi_target_regression_metrics(y_true, y_pred, target_names, save_dir)

         elif self.kind == "multi_label_classification":

@@ -325,10 +327,10 @@ class MLTrainer:
             except AttributeError:
                 num_targets = y_true.shape[1]
                 target_names = [f"label_{i}" for i in range(num_targets)]
-                _LOGGER.warning(f"
+                _LOGGER.warning(f"Dataset has no 'target_names' attribute. Using generic names.")

             if y_prob is None:
-                _LOGGER.error("
+                _LOGGER.error("Evaluation for multi_label_classification requires probabilities (y_prob).")
                 return
             multi_label_classification_metrics(y_true, y_prob, target_names, save_dir, classification_threshold)

@@ -390,14 +392,14 @@ class MLTrainer:
         # 1. Get background data from the trainer's train_dataset
         background_data = _get_random_sample(self.train_dataset, n_samples)
         if background_data is None:
-            _LOGGER.error("
+            _LOGGER.error("Trainer's train_dataset is empty or invalid. Skipping SHAP analysis.")
             return

         # 2. Determine target dataset and get explanation instances
         target_dataset = explain_dataset if explain_dataset is not None else self.test_dataset
         instances_to_explain = _get_random_sample(target_dataset, n_samples)
         if instances_to_explain is None:
-            _LOGGER.error("
+            _LOGGER.error("Explanation dataset is empty or invalid. Skipping SHAP analysis.")
             return

         # attempt to get feature names

@@ -410,8 +412,8 @@ class MLTrainer:
                 # Handle PyTorch Subset
                 feature_names = target_dataset.dataset.feature_names # type: ignore
             except AttributeError:
-                _LOGGER.error("
-                raise ValueError(
+                _LOGGER.error("Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a `feature_names` attribute.")
+                raise ValueError()

         # 3. Call the plotting function
         if self.kind in ["regression", "classification"]:

@@ -490,13 +492,13 @@ class MLTrainer:

         # --- Step 1: Check if the model supports this explanation ---
         if not hasattr(self.model, 'forward_attention'):
-            _LOGGER.error("
+            _LOGGER.error("Model does not have a `forward_attention` method. Skipping attention explanation.")
             return

         # --- Step 2: Set up the dataloader ---
         dataset_to_use = explain_dataset if explain_dataset is not None else self.test_dataset
         if not isinstance(dataset_to_use, Dataset):
-            _LOGGER.error("
+            _LOGGER.error("The explanation dataset is empty or invalid. Skipping attention analysis.")
             return

         explain_loader = DataLoader(

@@ -519,7 +521,7 @@ class MLTrainer:
                 save_dir=save_dir
             )
         else:
-            _LOGGER.error("
+            _LOGGER.error("No attention weights were collected from the model.")

     def callbacks_hook(self, method_name: str, *args, **kwargs):
         """Calls the specified method on all callbacks."""
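The pattern repeated throughout this release is log-then-raise: the human-readable message goes to the shared module logger first, and the exception is then raised with an empty message. A minimal sketch of the idea, using a hypothetical helper name (`_require_test_dataset` is illustrative, not part of the package):

from ml_tools._logger import _LOGGER  # assumes the package is installed

def _require_test_dataset(test_dataset):
    # Hypothetical helper mirroring the guard added to MLTrainer.evaluate():
    # the detail goes to the log, the raised exception stays message-less.
    if test_dataset is None:
        _LOGGER.error("Cannot evaluate. No data provided and no test_dataset available in the trainer.")
        raise ValueError()
    return test_dataset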
ml_tools/PSO_optimization.py
CHANGED
@@ -65,7 +65,9 @@ class ObjectiveFunction():
         np.ndarray
             1D array with length n_samples containing predicted target values.
         """
-
+        if features_array.ndim != 2:
+            _LOGGER.error(f"Expected 2D array, got shape {features_array.shape}.")
+            raise AssertionError()

         # Apply noise if enabled
         if self.use_noise:

@@ -101,7 +103,9 @@ class ObjectiveFunction():
         np.ndarray
             Noised array of same shape
         """
-
+        if features_array.ndim != 2:
+            _LOGGER.error(f"Expected 2D array for batch noise injection, got shape {features_array.shape}.")
+            raise AssertionError()

         if self.binary_features > 0:
             split_idx = -self.binary_features

@@ -118,13 +122,16 @@ class ObjectiveFunction():

     def check_model(self):
         if isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
-
+            _LOGGER.error(f"[Model Check Failed]\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
+            raise ValueError()
         if self.model is None:
-
+            _LOGGER.error("Loaded model is None")
+            raise ValueError()

     def _get_from_artifact(self, key: str):
         if self._artifact is None:
-
+            _LOGGER.error("Load model error")
+            raise TypeError()
         val = self._artifact.get(key)
         if key == EnsembleKeys.FEATURES:
             result = val if isinstance(val, list) and val else None

@@ -314,7 +321,8 @@ def run_pso(lower_boundaries: list[float],
     if target_name is None and objective_function.target_name is not None:
         target_name = objective_function.target_name
     if target_name is None:
-
+        _LOGGER.error(f"'target' name was not provided and was not found in the .joblib object.")
+        raise ValueError()

     # --- Setup: Saving Infrastructure ---
     sanitized_target_name = sanitize_filename(target_name)

@@ -355,7 +363,7 @@ def run_pso(lower_boundaries: list[float],
             objective_function, pso_arguments, names, target_name, random_state,
             save_format, csv_path, db_manager, db_table_name
         )
-        _LOGGER.info(f"
+        _LOGGER.info(f"Single optimization complete.")
        return features_dict, target_dict

    else:

@@ -365,7 +373,7 @@ def run_pso(lower_boundaries: list[float],
            objective_function, pso_arguments, names, target_name, post_hoc_analysis,
            save_format, csv_path, db_manager, db_table_name
        )
-        _LOGGER.info("
+        _LOGGER.info("Post-hoc analysis complete. Results saved.")
        return None
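The noise-injection hunk above splits a 2D batch into a continuous block and a trailing binary block via `split_idx = -self.binary_features`. A rough standalone sketch of that layout follows; the Gaussian noise scale and the function name are assumptions, not taken from the package:

import numpy as np

def add_noise_to_continuous(features_array: np.ndarray, binary_features: int, scale: float = 0.01) -> np.ndarray:
    # Perturb only the leading continuous columns; leave trailing binary columns untouched.
    if features_array.ndim != 2:
        raise AssertionError(f"Expected 2D array, got shape {features_array.shape}.")
    noised = features_array.astype(float).copy()
    if binary_features > 0:
        split_idx = -binary_features  # binary columns occupy the last `binary_features` positions
        continuous = noised[:, :split_idx]
        noised[:, :split_idx] = continuous + np.random.normal(0.0, scale, size=continuous.shape)
    else:
        noised += np.random.normal(0.0, scale, size=noised.shape)
    return noised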
ml_tools/RNN_forecast.py
CHANGED
ml_tools/SQL.py
CHANGED
@@ -62,7 +62,7 @@ class DatabaseManager:
             _LOGGER.info(f"❇️ Successfully connected to database: {self.db_path}")
             return self
         except sqlite3.Error as e:
-            _LOGGER.error(f"
+            _LOGGER.error(f"Database connection failed: {e}")
             raise # Re-raise the exception after logging

     def __exit__(self, exc_type, exc_val, exc_tb):

@@ -70,11 +70,11 @@ class DatabaseManager:
         if self.conn:
             if exc_type: # If an exception occurred, rollback
                 self.conn.rollback()
-                _LOGGER.warning("
+                _LOGGER.warning("Rolling back transaction due to an error.")
             else: # Otherwise, commit the transaction
                 self.conn.commit()
             self.conn.close()
-            _LOGGER.info(f"
+            _LOGGER.info(f"Database connection closed: {self.db_path.name}")

     def create_table(self, table_name: str, schema: Dict[str, str], if_not_exists: bool = True):
         """

@@ -92,7 +92,8 @@ class DatabaseManager:
             if the table already exists.
         """
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         columns_def = ", ".join([f'"{col_name}" {col_type}' for col_name, col_type in schema.items()])
         exists_clause = "IF NOT EXISTS" if if_not_exists else ""

@@ -115,7 +116,8 @@ class DatabaseManager:
             data to be inserted.
         """
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         columns = ', '.join(f'"{k}"' for k in data.keys())
         placeholders = ', '.join(['?'] * len(data))

@@ -143,7 +145,8 @@ class DatabaseManager:
             A DataFrame containing the query results.
         """
         if not self.conn:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         return pd.read_sql_query(query, self.conn, params=params)

@@ -159,7 +162,8 @@ class DatabaseManager:
             An optional tuple of parameters for the query.
         """
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         self.cursor.execute(query, params if params else ())

@@ -176,9 +180,10 @@ class DatabaseManager:
             All dictionaries should have the same keys.
         """
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()
         if not data:
-            _LOGGER.warning("
+            _LOGGER.warning("'insert_many' called with empty data list. No action taken.")
             return

         # Assume all dicts have the same keys as the first one

@@ -211,7 +216,8 @@ class DatabaseManager:
             - 'append': Insert new values to the existing table.
         """
         if not self.conn:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         df.to_sql(
             table_name,

@@ -224,7 +230,8 @@ class DatabaseManager:
     def list_tables(self) -> List[str]:
         """Returns a list of all table names in the database."""
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
         # The result of the fetch is a list of tuples, e.g., [('table1',), ('table2',)]

@@ -237,7 +244,8 @@ class DatabaseManager:
         Returns a DataFrame with columns: cid, name, type, notnull, dflt_value, pk
         """
         if not self.conn:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         # PRAGMA is a special SQL command in SQLite for database metadata
         return pd.read_sql_query(f'PRAGMA table_info("{table_name}");', self.conn)

@@ -257,7 +265,8 @@ class DatabaseManager:
             column are unique.
         """
         if not self.cursor:
-
+            _LOGGER.error("Database connection is not open.")
+            raise sqlite3.Error()

         index_name = f"idx_{table_name}_{column_name}"
         unique_clause = "UNIQUE" if unique else ""
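The `__exit__` changes only add logging around an existing commit-on-success / rollback-on-error pattern. A self-contained sketch of that pattern for reference; the class below is a simplified stand-in, not the package's full DatabaseManager:

import sqlite3
from pathlib import Path

class MiniDBManager:
    """Minimal sketch: commit when the with-block succeeds, roll back when it raises."""

    def __init__(self, db_path):
        self.db_path = Path(db_path)
        self.conn = None
        self.cursor = None

    def __enter__(self):
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.conn:
            if exc_type:              # an exception escaped the with-block
                self.conn.rollback()
            else:                     # clean exit: persist the transaction
                self.conn.commit()
            self.conn.close()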
ml_tools/VIF_factor.py
CHANGED
@@ -55,19 +55,19 @@ def compute_vif(
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features:
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-                _LOGGER.warning(f"
+                _LOGGER.warning(f"The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)

     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore:
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

     X = df[sanitized_columns].copy()

@@ -138,7 +138,7 @@ def compute_vif(
         filename += ".svg"
         full_save_path = save_path / filename
         plt.savefig(full_save_path, format='svg', bbox_inches='tight')
-        _LOGGER.info(f"
+        _LOGGER.info(f"📊 Saved VIF plot: '{filename}'")

     if show_plot:
         plt.show()

@@ -163,7 +163,8 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
-
+        _LOGGER.error("'vif_df' must contain 'feature' and 'VIF' columns.")
+        raise ValueError()

     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()

@@ -177,7 +178,7 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     result_df = df.drop(columns=to_drop)

     if result_df.empty:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"All columns were dropped.")

     return result_df, to_drop
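The core of `drop_vif_based` visible in these hunks is a threshold filter over a `feature`/`VIF` table. A self-contained approximation of that logic; the function name and the guard against missing columns are illustrative, not the package's exact code:

import pandas as pd

def drop_high_vif(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0):
    # Drop every feature whose variance inflation factor exceeds the threshold.
    if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
        raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")
    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
    result_df = df.drop(columns=[c for c in to_drop if c in df.columns])
    return result_df, to_drop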
ml_tools/_logger.py
CHANGED
@@ -1,6 +1,73 @@
 import logging
 import sys

+# Step 1: Conditionally import colorlog
+try:
+    import colorlog # type: ignore
+except ImportError:
+    colorlog = None
+
+
+# --- Centralized Configuration ---
+LEVEL_EMOJIS = {
+    logging.INFO: "✅",
+    logging.WARNING: "⚠️ ",
+    logging.ERROR: "🚨",
+    logging.CRITICAL: "❌"
+}
+
+# Define base format strings.
+BASE_INFO_FORMAT = '\n🐉 %(asctime)s [%(emoji)s %(levelname)s] - %(message)s'
+BASE_WARN_FORMAT = '\n🐉 %(asctime)s [%(emoji)s %(levelname)s] [%(filename)s:%(lineno)d] - %(message)s'
+
+
+# --- Unified Formatter ---
+# Determine the base class and format strings based on colorlog availability
+if colorlog:
+    # If colorlog is available, use it as the base and use colorized formats.
+    _BaseFormatter = colorlog.ColoredFormatter
+    _INFO_FORMAT = BASE_INFO_FORMAT.replace('%(levelname)s', '%(log_color)s%(levelname)s%(reset)s')
+    _WARN_FORMAT = BASE_WARN_FORMAT.replace('%(levelname)s', '%(log_color)s%(levelname)s%(reset)s')
+else:
+    # Otherwise, fall back to the standard logging.Formatter.
+    _BaseFormatter = logging.Formatter
+    _INFO_FORMAT = BASE_INFO_FORMAT
+    _WARN_FORMAT = BASE_WARN_FORMAT
+
+
+class _UnifiedFormatter(_BaseFormatter): # type: ignore
+    """
+    A unified log formatter that adds emojis, uses level-specific formats,
+    and applies colors if colorlog is available.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Initializes the formatter, creating sub-formatters for each level."""
+        # The base class __init__ is called implicitly. We prepare our custom formatters here.
+        self.datefmt = kwargs.get('datefmt')
+
+        # We need to pass the correct arguments to the correct formatter type
+        if colorlog:
+            log_colors = kwargs.get('log_colors', {})
+            self.info_formatter = colorlog.ColoredFormatter(_INFO_FORMAT, datefmt=self.datefmt, log_colors=log_colors)
+            self.warn_formatter = colorlog.ColoredFormatter(_WARN_FORMAT, datefmt=self.datefmt, log_colors=log_colors)
+        else:
+            self.info_formatter = logging.Formatter(_INFO_FORMAT, datefmt=self.datefmt)
+            self.warn_formatter = logging.Formatter(_WARN_FORMAT, datefmt=self.datefmt)
+
+    def format(self, record):
+        """Adds a custom emoji attribute to the record before formatting."""
+        # Add the new attribute to the record. Use .get() for a safe default.
+        record.emoji = LEVEL_EMOJIS.get(record.levelno, "")
+
+        # Select the appropriate formatter and let it handle the rest.
+        if record.levelno >= logging.WARNING:
+            return self.warn_formatter.format(record)
+        else:
+            return self.info_formatter.format(record)
+

 def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
     """

@@ -9,6 +76,7 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
     - `logger.info()`
     - `logger.warning()`
     - `logger.error()` the program can potentially recover.
+    - `logger.exception()` inside an except block.
     - `logger.critical()` the program is going to crash.
     """
     logger = logging.getLogger(name)

@@ -16,15 +84,26 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):

     # Prevents adding handlers multiple times if the function is called again
     if not logger.handlers:
-
-
-
-
-        date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute
+        # Prepare arguments for the unified formatter
+        formatter_kwargs = {
+            'datefmt': '%Y-%m-%d %H:%M'
+        }

-        #
-
+        # Use colorlog's handler if available, and add color arguments
+        if colorlog:
+            handler = colorlog.StreamHandler()
+            formatter_kwargs["log_colors"] = { # type: ignore
+                'DEBUG': 'cyan',
+                'INFO': 'green',
+                'WARNING': 'yellow',
+                'ERROR': 'red',
+                'CRITICAL': 'red,bg_white',
+            }
+        else:
+            handler = logging.StreamHandler(sys.stdout)

+        # Create and set the single, unified formatter
+        formatter = _UnifiedFormatter(**formatter_kwargs)
         handler.setFormatter(formatter)
         logger.addHandler(handler)

@@ -32,5 +111,24 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):

     return logger

+
 # Create a single logger instance to be imported by other modules
 _LOGGER = _get_logger()
+
+
+def _log_and_exit(message: str, exit_code: int = 1):
+    """Logs a critical message inside an exception block and terminates the program."""
+    _LOGGER.exception(message)
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    _LOGGER.info("Data loading process started.")
+    _LOGGER.warning("A non-critical configuration value is missing.")
+
+    try:
+        x = 1 / 0
+    except ZeroDivisionError:
+        _LOGGER.exception("Critical error during calculation.")
+
+    _LOGGER.critical("Total failure.")
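Since `_LOGGER` is built once at import time, the other modules in the package pick it up with a plain import. A minimal usage sketch (assuming the package is installed; colored output appears only when the optional colorlog dependency is present):

from ml_tools._logger import _LOGGER

_LOGGER.info("Pipeline started.")
try:
    1 / 0
except ZeroDivisionError:
    # logger.exception() records the message plus the traceback of the active exception.
    _LOGGER.exception("Critical error during calculation.")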
ml_tools/custom_logger.py
CHANGED
@@ -76,12 +76,13 @@ def custom_logger(
             _log_exception_to_log(data, base_path.with_suffix(".log"))

         else:
-
+            _LOGGER.error("Unsupported data type. Must be list, dict, str, or BaseException.")
+            raise ValueError()

-        _LOGGER.info(f"
+        _LOGGER.info(f"Log saved to: '{base_path}'")

-    except Exception
-        _LOGGER.
+    except Exception:
+        _LOGGER.exception(f"Log not saved.")


 def _log_list_to_txt(data: List[Any], path: Path) -> None:

@@ -102,7 +103,9 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:

     for key, value in data.items():
         if not isinstance(value, list):
-
+            _LOGGER.error(f"Dictionary value for key '{key}' must be a list.")
+            raise ValueError()
+
         sanitized_key = str(key).strip().replace('\n', '_').replace('\r', '_')
         padded_value = value + [None] * (max_length - len(value))
         sanitized_dict[sanitized_key] = padded_value

@@ -152,7 +155,7 @@ def save_list_strings(list_strings: list[str], directory: Union[str,Path], filen
             f.write(f"{string_data}\n")

     if verbose:
-        _LOGGER.info(f"
+        _LOGGER.info(f"Text file saved as '{full_path.name}'.")


 def load_list_strings(text_file: Union[str,Path], verbose: bool=True) -> list[str]:

@@ -164,10 +167,11 @@ def load_list_strings(text_file: Union[str,Path], verbose: bool=True) -> list[st
         loaded_strings = [line.strip() for line in f]

     if len(loaded_strings) == 0:
-
+        _LOGGER.error("The text file is empty.")
+        raise ValueError()

     if verbose:
-        _LOGGER.info(f"
+        _LOGGER.info(f"Text file loaded as list of strings.")

     return loaded_strings
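The `_log_dict_to_csv` hunk pads every value list to a common length so the dictionary becomes rectangular CSV columns. A small standalone illustration of that padding step (the helper name below is assumed, not the package's function):

import pandas as pd

def dict_to_padded_frame(data: dict) -> pd.DataFrame:
    # Pad shorter lists with None so every column ends up with the same number of rows.
    max_length = max(len(v) for v in data.values())
    padded = {
        str(key).strip().replace('\n', '_').replace('\r', '_'): value + [None] * (max_length - len(value))
        for key, value in data.items()
    }
    return pd.DataFrame(padded)

# Example: {"a": [1, 2, 3], "b": [4]} -> column "b" becomes [4, None, None].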
ml_tools/data_exploration.py
CHANGED
@@ -83,7 +83,8 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
         A new DataFrame with the constant columns removed.
     """
     if not isinstance(df, pd.DataFrame):
-
+        _LOGGER.error("Input must be a pandas DataFrame.")
+        raise TypeError()

     original_columns = set(df.columns)
     cols_to_keep = []

@@ -136,7 +137,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
             _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
             df_clean = df_clean[~target_na]
         else:
-            _LOGGER.info("
+            _LOGGER.info("No rows found where all targets are missing.")
     else:
         valid_targets = []

@@ -149,9 +150,9 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
             _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
             df_clean = df_clean.drop(index=rows_to_drop)
         else:
-            _LOGGER.info(f"
+            _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
     else:
-        _LOGGER.warning("
+        _LOGGER.warning("No feature columns available to evaluate.")

     return df_clean

@@ -211,7 +212,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index

     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))

     result_df = df.drop(columns=cols_to_drop)

@@ -339,7 +340,8 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
         TypeError: If any column is not numeric.
     """
     if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
-
+        _LOGGER.error("All columns must be numeric (int or float).")
+        raise TypeError()

     binary_cols = []
     continuous_cols = []

@@ -390,7 +392,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
     """
     numeric_df = df.select_dtypes(include='number')
     if numeric_df.empty:
-        _LOGGER.warning("
+        _LOGGER.warning("No numeric columns found. Heatmap not generated.")
         return

     corr = numeric_df.corr(method=method)

@@ -558,11 +560,11 @@ def clip_outliers_single(
         None: if a problem with the dataframe column occurred.
     """
     if column not in df.columns:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
         return None

     if not pd.api.types.is_numeric_dtype(df[column]):
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"Column '{column}' must be numeric.")
         return None

     new_df = df.copy(deep=True)

@@ -600,13 +602,16 @@ def clip_outliers_multi(
     for col, bounds in clip_dict.items():
         try:
             if col not in df.columns:
-
+                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
+                raise ValueError()

             if not pd.api.types.is_numeric_dtype(df[col]):
-
+                _LOGGER.error(f"Column '{col}' is not numeric.")
+                raise TypeError()

             if not (isinstance(bounds, tuple) and len(bounds) == 2):
-
+                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
+                raise ValueError()

             min_val, max_val = bounds
             new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)

@@ -621,7 +626,7 @@ def clip_outliers_multi(
     _LOGGER.info(f"Clipped {clipped_columns} columns.")

     if skipped_columns:
-        _LOGGER.warning("
+        _LOGGER.warning("Skipped columns:")
         for col, msg in skipped_columns:
             print(f"  - {col}: {msg}")

@@ -707,11 +712,11 @@ def standardize_percentages(
     for col in columns:
         # --- Robustness Checks ---
         if col not in df_copy.columns:
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"Column '{col}' not found. Skipping.")
             continue

         if not is_numeric_dtype(df_copy[col]):
-            _LOGGER.warning(f"
+            _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
             continue

         # --- Applying the Logic ---
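For reference, `clip_outliers_multi` expects a mapping of column name to a (min, max) tuple, which is what the validation added above enforces. A hedged usage sketch of that bounds dictionary; the DataFrame and the clipping loop below are illustrative, not the package's implementation:

import pandas as pd

df = pd.DataFrame({"age": [5, 40, 120], "score": [-3.0, 0.5, 9.9]})

# One (min, max) tuple per column, matching the structure validated in the diff.
clip_dict = {"age": (0, 100), "score": (0.0, 1.0)}

clipped = df.copy()
for col, (min_val, max_val) in clip_dict.items():
    clipped[col] = clipped[col].clip(lower=min_val, upper=max_val)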
|