dragon-ml-toolbox 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.5.1
+ Version: 3.7.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -0,0 +1,25 @@
+ dragon_ml_toolbox-3.7.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-3.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ ml_tools/ETL_engineering.py,sha256=yeZsW_7zRvEcuMZbM4E2GV1dxwBoWIeJAcFFk2AK0fY,39502
+ ml_tools/GUI_tools.py,sha256=3kRxok-QCN5S0q1i7yK137Bsr6c2N4M4nIvgPVAuZU0,20371
+ ml_tools/MICE_imputation.py,sha256=rYqvwQDVtoAJJ0agXWoGzoZEHedWiA6QzcEKEIkiZ08,11388
+ ml_tools/ML_callbacks.py,sha256=OT2zwORLcn49megBEgXsSUxDHoW0Ft0_v7hLEVF3jHM,13063
+ ml_tools/ML_evaluation.py,sha256=oiDV6HItQloUUKCUpltV-2pogubWLBieGpc-VUwosAQ,10106
+ ml_tools/ML_trainer.py,sha256=Pw4tLtlexoZJs_3o5I6ElQMTLjijzydXXQE834949Dw,14470
+ ml_tools/ML_tutorial.py,sha256=-9tJO9ISPxEjRINVaF_Bu7tiiJ2W3zznQ4gNlZeP1HQ,12238
+ ml_tools/PSO_optimization.py,sha256=c23Fd-ttqoO8IBPK5-TXZLqPi9UPHUC4HNoF02Q8wLo,24774
+ ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
+ ml_tools/VIF_factor.py,sha256=BeP4ig3l7b1Igwgte9z8rEwHdSZvVT7W_9mcBHGoNJw,10299
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
+ ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/data_exploration.py,sha256=M7bn2q5XN9zJZJGAmMMFSFFZh8LGzC2arFelrXw3N6Q,25241
+ ml_tools/datasetmaster.py,sha256=S3PKHNQZ9cyAOck8xQltVLZhaD1gFLfgHFL-aRjz4JU,30077
+ ml_tools/ensemble_learning.py,sha256=CDSIygnHaNe92aJ46Fofevd7q6lowTnE98yWuIV3Y6w,37462
+ ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
+ ml_tools/logger.py,sha256=UkbiU9ihBhw9VKyn3rZzisdClWV94EBV6B09_D0iUU0,6026
+ ml_tools/utilities.py,sha256=0w0vka0Aj9IYOHJ6crWIb6gwpQIJnPyj3v2_dnVxHrs,23138
+ dragon_ml_toolbox-3.7.0.dist-info/METADATA,sha256=kvgFjd_BRwob7xycC5rbROCkq4C6FVq3J5-VdCXEPrI,3273
+ dragon_ml_toolbox-3.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-3.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-3.7.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -294,7 +294,7 @@ class DataProcessor:
  raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")

  if not processed_columns:
- _LOGGER.warning("The transformation resulted in an empty DataFrame.")
+ _LOGGER.warning("⚠️ The transformation resulted in an empty DataFrame.")
  return pl.DataFrame()

  return pl.DataFrame(processed_columns)
@@ -588,7 +588,7 @@ class NumberExtractor:
  if not isinstance(round_digits, int):
  raise TypeError("round_digits must be an integer.")
  if dtype == "int":
- _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
+ _LOGGER.warning(f"⚠️ 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")

  self.regex_pattern = regex_pattern
  self.dtype = dtype
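
For reference, the warning above fires when a caller asks for rounding on an integer output. A minimal sketch of that situation, assuming the constructor accepts the regex_pattern, dtype, and round_digits arguments visible in this hunk (hypothetical usage, not taken from the package docs):

    from ml_tools.ETL_engineering import NumberExtractor

    # dtype="int" makes round_digits meaningless, so the ⚠️ warning is logged once
    extractor = NumberExtractor(
        regex_pattern=r"(\d+\.?\d*)",  # capture the first number found in a string
        dtype="int",
        round_digits=2,
    )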
ml_tools/GUI_tools.py CHANGED
@@ -148,7 +148,7 @@ class ConfigManager:
  """
  path = Path(file_path)
  if path.exists() and not force_overwrite:
- _LOGGER.warning(f"Configuration file already exists at {path}. Aborting.")
+ _LOGGER.warning(f"⚠️ Configuration file already exists at {path}. Aborting.")
  return

  config = configparser.ConfigParser()
@@ -206,7 +206,7 @@ class ConfigManager:

  with open(path, 'w') as configfile:
  config.write(configfile)
- _LOGGER.info(f"Successfully generated config template at: '{path}'")
+ _LOGGER.info(f"📝 Successfully generated config template at: '{path}'")


  # --- GUI Factory ---
@@ -482,13 +482,12 @@ def update_target_fields(window: sg.Window, results_dict: Dict[str, Any]):

  Args:
  window (sg.Window): The application's window object.
- results_dict (dict): A dictionary where keys are target names (without the
- 'TARGET_' prefix) and values are the predicted results.
+ results_dict (dict): A dictionary where keys are target key names (including 'TARGET_' prefix if necessary) and values are the predicted results.
  """
  for target_name, result in results_dict.items():
  # Format numbers to 2 decimal places, leave other types as-is
  display_value = f"{result:.2f}" if isinstance(result, (int, float)) else result
- window[f'TARGET_{target_name}'].update(display_value)
+ window[target_name].update(display_value)


  def info():
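
The last hunk above shifts responsibility for the element key to the caller: the dictionary key is now used verbatim when looking up the window element, instead of being prefixed with 'TARGET_' inside the function. A hedged sketch of the new calling convention (assuming sg is PySimpleGUI or a compatible fork; layout and key names are illustrative):

    import PySimpleGUI as sg
    from ml_tools.GUI_tools import update_target_fields

    window = sg.Window("Demo", [[sg.Text("", key="TARGET_strength")]], finalize=True)

    # The caller now supplies the full element key; numeric results are shown as "42.12"
    update_target_fields(window, {"TARGET_strength": 42.1234})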
ml_tools/MICE_imputation.py CHANGED
@@ -128,7 +128,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  plt.savefig(save_path, bbox_inches='tight', format="svg")
  plt.close()

- _LOGGER.info(f"{dataset_file_dir} completed.")
+ _LOGGER.info(f"{dataset_file_dir} process completed.")


  # Imputed distributions
@@ -213,7 +213,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  fig = kernel.plot_imputed_distributions(variables=[feature])
  _process_figure(fig, feature)

- _LOGGER.info(f"{local_dir_name} completed.")
+ _LOGGER.info(f"{local_dir_name} completed.")


  def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
ml_tools/ML_callbacks.py CHANGED
@@ -178,7 +178,6 @@ class EarlyStopping(Callback):
  self.stopped_epoch = epoch
  self.trainer.stop_training = True # type: ignore
  if self.verbose > 0:
- print("")
  _LOGGER.info(f"Epoch {epoch+1}: early stopping after {self.wait} epochs with no improvement.")


@@ -256,7 +255,6 @@ class ModelCheckpoint(Callback):
  new_filepath = self.save_dir / filename

  if self.verbose > 0:
- print("")
  _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")

  # Save the new best model
@@ -276,7 +274,6 @@ class ModelCheckpoint(Callback):
  filepath = self.save_dir / filename

  if self.verbose > 0:
- print("")
  _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
  torch.save(self.trainer.model.state_dict(), filepath) # type: ignore

@@ -325,7 +322,6 @@ class LRScheduler(Callback):
  if metric_val is not None:
  self.scheduler.step(metric_val)
  else:
- print("")
  _LOGGER.warning(f"LRScheduler could not find metric '{self.monitor}' in logs.")

  # For all other schedulers
@@ -335,7 +331,6 @@ class LRScheduler(Callback):
  # Log the change if the LR was updated
  current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
  if current_lr != self.previous_lr:
- print("")
  _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
  self.previous_lr = current_lr

ml_tools/ML_evaluation.py CHANGED
@@ -65,7 +65,7 @@ def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
  save_dir_path = make_fullpath(save_dir, make=True)
  save_path = save_dir_path / "loss_plot.svg"
  plt.savefig(save_path)
- _LOGGER.info(f"Loss plot saved as '{save_path.name}'")
+ _LOGGER.info(f"📉 Loss plot saved as '{save_path.name}'")
  else:
  plt.show()
  plt.close(fig)
@@ -92,7 +92,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  # Save text report
  report_path = save_dir_path / "classification_report.txt"
  report_path.write_text(report, encoding="utf-8")
- _LOGGER.info(f"Classification report saved as '{report_path.name}'")
+ _LOGGER.info(f"📝 Classification report saved as '{report_path.name}'")

  # Save Confusion Matrix
  fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
@@ -100,7 +100,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  ax_cm.set_title("Confusion Matrix")
  cm_path = save_dir_path / "confusion_matrix.svg"
  plt.savefig(cm_path)
- _LOGGER.info(f"Confusion matrix saved as '{cm_path.name}'")
+ _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
  plt.close(fig_cm)

  # Save ROC Curve
@@ -117,7 +117,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  ax_roc.grid(True)
  roc_path = save_dir_path / "roc_curve.svg"
  plt.savefig(roc_path)
- _LOGGER.info(f"ROC curve saved as '{roc_path.name}'")
+ _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
  plt.close(fig_roc)
  else:
  # Show plots if not saving
@@ -162,7 +162,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  # Save text report
  report_path = save_dir_path / "regression_report.txt"
  report_path.write_text(report_string)
- _LOGGER.info(f"Regression report saved as '{report_path.name}'")
+ _LOGGER.info(f"📝 Regression report saved as '{report_path.name}'")

  # Save residual plot
  residuals = y_true - y_pred
@@ -176,7 +176,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.tight_layout()
  res_path = save_dir_path / "residual_plot.svg"
  plt.savefig(res_path)
- _LOGGER.info(f"Residual plot saved as '{res_path.name}'")
+ _LOGGER.info(f"📈 Residual plot saved as '{res_path.name}'")
  plt.close(fig_res)

  # Save true vs predicted plot
@@ -190,7 +190,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.tight_layout()
  tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
  plt.savefig(tvp_path)
- _LOGGER.info(f"True vs. Predicted plot saved as '{tvp_path.name}'")
+ _LOGGER.info(f"📉 True vs. Predicted plot saved as '{tvp_path.name}'")
  plt.close(fig_tvp)


@@ -227,7 +227,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(bar_path)
- _LOGGER.info(f"SHAP bar plot saved as '{bar_path.name}'")
+ _LOGGER.info(f"📊 SHAP bar plot saved as '{bar_path.name}'")
  plt.close()

  # Save Dot Plot
@@ -236,7 +236,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(dot_path)
- _LOGGER.info(f"SHAP dot plot saved as '{dot_path.name}'")
+ _LOGGER.info(f"📊 SHAP dot plot saved as '{dot_path.name}'")
  plt.close()

  # Save Summary Data to CSV
@@ -249,7 +249,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  'mean_abs_shap_value': mean_abs_shap
  }).sort_values('mean_abs_shap_value', ascending=False)
  summary_df.to_csv(summary_path, index=False)
- _LOGGER.info(f"SHAP summary data saved as '{summary_path.name}'")
+ _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
  else:
  _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
  shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
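
These hunks only retouch log messages, but they also document which artifacts regression_metrics writes when a save directory is given (regression_report.txt, residual_plot.svg, true_vs_predicted_plot.svg). A minimal sketch of a call, assuming the signature shown in the hunk header and a path-like save_dir (values are illustrative):

    import numpy as np
    from ml_tools.ML_evaluation import regression_metrics

    y_true = np.array([3.0, 2.5, 4.1, 5.0])
    y_pred = np.array([2.8, 2.7, 3.9, 5.2])

    # Writes the text report and both SVG plots into ./eval_output
    regression_metrics(y_true, y_pred, save_dir="eval_output")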
ml_tools/ML_trainer.py CHANGED
@@ -72,10 +72,10 @@ class MyTrainer:
  """Validates the selected device and returns a torch.device object."""
  device_lower = device.lower()
  if "cuda" in device_lower and not torch.cuda.is_available():
- _LOGGER.warning("CUDA not available, switching to CPU.")
+ _LOGGER.warning("⚠️ CUDA not available, switching to CPU.")
  device = "cpu"
  elif device_lower == "mps" and not torch.backends.mps.is_available():
- _LOGGER.warning("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
+ _LOGGER.warning("⚠️ Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
  device = "cpu"
  return torch.device(device)

ml_tools/PSO_optimization.py CHANGED
@@ -22,7 +22,6 @@ import torch
  from tqdm import trange
  import matplotlib.pyplot as plt
  import seaborn as sns
- from collections import defaultdict
  from .logger import _LOGGER


@@ -307,7 +306,7 @@ def run_pso(lower_boundaries: list[float],
  else:
  device = torch.device("cpu")

- _LOGGER.info(f"Using device: '{device}'")
+ _LOGGER.info(f"👾 Using device: '{device}'")

  # set local deep copies to prevent in place list modification
  local_lower_boundaries = deepcopy(lower_boundaries)
@@ -511,13 +510,13 @@ def _pso(func: ObjectiveFunction,
  return best_position, best_score


- def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
  """
  Analyzes optimization results and plots the distribution of optimal values for each feature.

- This function can operate in two modes based on the `color_by_target` parameter:
- 1. Aggregates all values for a feature into a single group and plots one overall distribution (histogram + KDE).
- 2. Color-coded: Plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
+ For features with more than two unique values, this function generates a color-coded
+ Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+ showing relative frequency.

  Parameters
  ----------
@@ -525,76 +524,69 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
  The path to the directory containing the optimization result CSV files.
  save_dir : str or Path
  The directory where the output plots will be saved.
- color_by_target : bool, optional
- If True, generates comparative plots with distributions colored by their source target.
  """
- mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
- _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
-
- # Check results_dir
+ # Check results_dir and create output path
  results_path = make_fullpath(results_dir)
- # make output path
  output_path = make_fullpath(save_dir, make=True)

  all_csvs = list_csv_paths(results_path)
-
  if not all_csvs:
- _LOGGER.warning("No data found. No plots will be generated.")
+ _LOGGER.warning("⚠️ No data found. No plots will be generated.")
  return

- # --- MODE 1: Color-coded plots by target ---
- if color_by_target:
- data_to_plot = []
- for df, df_name in yield_dataframes_from_dir(results_path):
- # Assumes last col is target, rest are features
- melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
- # Sanitize target name for cleaner legend labels
- melted_df['target'] = df_name.replace("Optimization_", "")
- data_to_plot.append(melted_df)
-
- long_df = pd.concat(data_to_plot, ignore_index=True)
- features = long_df['feature'].unique()
- _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
-
- for feature_name in features:
- plt.figure(figsize=(12, 7))
- feature_df = long_df[long_df['feature'] == feature_name]
+ # --- Data Loading and Preparation ---
+ _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+ data_to_plot = []
+ for df, df_name in yield_dataframes_from_dir(results_path):
+ melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+ melted_df['target'] = df_name.replace("Optimization_", "")
+ data_to_plot.append(melted_df)
+
+ long_df = pd.concat(data_to_plot, ignore_index=True)
+ features = long_df['feature'].unique()
+ _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+ # --- Plotting Loop ---
+ for feature_name in features:
+ plt.figure(figsize=(12, 7))
+ feature_df = long_df[long_df['feature'] == feature_name]
+
+ # Check if the feature is binary or constant
+ if feature_df['value'].nunique() <= 2:
+ # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+ # This ensures the X-axis is clean (e.g., just 0 and 1).
+ norm_df = (feature_df.groupby('target')['value']
+ .value_counts(normalize=True)
+ .mul(100)
+ .rename('percent')
+ .reset_index())

- sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
+ ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')

- plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
- plt.xlabel("Feature Value", fontsize=12)
- plt.ylabel("Density", fontsize=12)
- plt.grid(axis='y', alpha=0.5, linestyle='--')
- plt.legend(title='Target')
-
- sanitized_feature_name = sanitize_filename(feature_name)
- plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
- plt.savefig(plot_filename, bbox_inches='tight')
- plt.close()
-
- # --- MODE 2: Aggregate plot ---
- else:
- feature_distributions = defaultdict(list)
- for df, _ in yield_dataframes_from_dir(results_path):
- feature_columns = df.iloc[:, :-1]
- for feature_name in feature_columns:
- feature_distributions[feature_name].extend(df[feature_name].tolist())
+ plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+ plt.ylabel("Frequency (%)", fontsize=12)
+ ax.set_ylim(0, 100) # Set Y-axis from 0 to 100
+
+ else:
+ # PLOT 2: KDE plot for continuous values.
+ ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+ fill=True, alpha=0.1, warn_singular=False)
+
+ plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+ plt.ylabel("Density", fontsize=12) # Y-axis is "Density" for KDE plots
+
+ # --- Common settings for both plot types ---
+ plt.xlabel("Feature Value", fontsize=12)
+ plt.grid(axis='y', alpha=0.5, linestyle='--')

- _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
- for feature_name, values in feature_distributions.items():
- plt.figure(figsize=(12, 7))
- sns.histplot(x=values, kde=True, bins='auto', stat="density")
-
- plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
- plt.xlabel("Feature Value", fontsize=12)
- plt.ylabel("Density", fontsize=12)
- plt.grid(axis='y', alpha=0.5, linestyle='--')
-
- sanitized_feature_name = sanitize_filename(feature_name)
- plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
- plt.savefig(plot_filename, bbox_inches='tight')
- plt.close()
+ legend = ax.get_legend()
+ if legend:
+ legend.set_title('Target')
+
+ sanitized_feature_name = sanitize_filename(feature_name)
+ plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+ plt.savefig(plot_filename, bbox_inches='tight')
+ plt.close()

  _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")

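The rewrite above drops the color_by_target flag entirely: the function now decides per feature between a KDE plot (more than two unique values) and a percentage bar plot (binary or constant features). A sketch of a call against the new signature (directory names are illustrative):

    from ml_tools.PSO_optimization import plot_optimal_feature_distributions

    # Reads the optimization result CSVs in results_dir and writes one
    # "Distribution_<feature>.svg" per feature into save_dir.
    plot_optimal_feature_distributions(results_dir="pso_results", save_dir="pso_plots")
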
ml_tools/VIF_factor.py CHANGED
@@ -168,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10

  # Identify features to drop
  to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
- _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+ _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")

  result_df = df.drop(columns=to_drop)

  if result_df.empty:
- _LOGGER.warning(f"\t⚠️ All columns were dropped.")
+ _LOGGER.warning(f"⚠️ All columns were dropped.")

  return result_df, to_drop

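The hunk shows the expected shape of vif_df (a "feature" column and a "VIF" column) and that the function returns both the reduced frame and the dropped column names. A self-contained sketch with a hand-built vif_df (how the toolbox computes the VIF values is not part of this diff):

    import pandas as pd
    from ml_tools.VIF_factor import drop_vif_based

    df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6], "c": [5, 1, 7]})
    vif_df = pd.DataFrame({"feature": ["a", "b", "c"], "VIF": [25.0, 30.0, 2.0]})

    # Drops every feature whose VIF exceeds the threshold
    result_df, dropped = drop_vif_based(df, vif_df, threshold=10)
    # dropped == ["a", "b"]; result_df keeps only column "c"
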
ml_tools/data_exploration.py CHANGED
@@ -15,7 +15,7 @@ import re
  # Keep track of all available tools, show using `info()`
  __all__ = [
  "summarize_dataframe",
- "drop_zero_only_columns",
+ "drop_constant_columns",
  "drop_rows_with_missing_data",
  "split_features_targets",
  "show_null_columns",
@@ -62,44 +62,50 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  return summary


- def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
+ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
  """
- Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.
+ Removes columns from a pandas DataFrame that contain only a single unique
+ value or are entirely null/NaN.

- This utility is useful for cleaning data after dummification steps that may result in empty columns.
+ This utility is useful for cleaning data by removing constant features that
+ have no predictive value.

  Args:
  df (pd.DataFrame):
  The pandas DataFrame to clean.
+ verbose (bool):
+ If True, prints the names of the columns that were dropped.
+ Defaults to True.

  Returns:
  pd.DataFrame:
- A new DataFrame with the empty columns removed.
+ A new DataFrame with the constant columns removed.
  """
  if not isinstance(df, pd.DataFrame):
  raise TypeError("Input must be a pandas DataFrame.")
-
+
  original_columns = set(df.columns)
-
  cols_to_keep = []
+
  for col_name in df.columns:
  column = df[col_name]

- # Keep any column that is not numeric by default
- if not is_numeric_dtype(column):
+ # We can apply this logic to all columns or only focus on numeric ones.
+ # if not is_numeric_dtype(column):
+ # cols_to_keep.append(col_name)
+ # continue
+
+ # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
+ if column.nunique(dropna=True) > 1:
  cols_to_keep.append(col_name)
- continue

- # For numeric columns, check if there's at least one non-zero value.
- if (column.fillna(0) != 0).any():
- cols_to_keep.append(col_name)
-
- dropped_columns = original_columns - set(cols_to_keep)
- if dropped_columns and verbose:
- print(f"Dropped {len(dropped_columns)} columns:")
- for dropped_column in dropped_columns:
- print(f" {dropped_column}")
-
+ dropped_columns = original_columns - set(cols_to_keep)
+ if verbose:
+ print(f"🧹 Dropped {len(dropped_columns)} constant columns.")
+ if dropped_columns:
+ for dropped_column in dropped_columns:
+ print(f" {dropped_column}")
+
  return df[cols_to_keep]

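Because the new implementation keys on nunique(dropna=True), a constant column and an all-NaN column are both dropped regardless of dtype, while any varying column (numeric or not) is kept. A quick, self-contained illustration:

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_constant_columns

    df = pd.DataFrame({
        "constant": [1, 1, 1],        # single unique value -> dropped
        "all_nan": [np.nan] * 3,      # entirely NaN -> dropped
        "label": ["a", "a", "b"],     # non-numeric but varying -> kept
        "feature": [0.1, 0.2, 0.3],   # varying numeric -> kept
    })

    cleaned = drop_constant_columns(df)  # prints "🧹 Dropped 2 constant columns." plus the names
    # list(cleaned.columns) == ["label", "feature"]
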
ml_tools/datasetmaster.py CHANGED
@@ -13,7 +13,7 @@ from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
  from pathlib import Path
- from .utilities import _script_info
+ from .utilities import _script_info, make_fullpath
  from .logger import _LOGGER


@@ -204,7 +204,7 @@ class DatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
  if self._is_normalized:
- _LOGGER.warning("Data has already been normalized.")
+ _LOGGER.warning("⚠️ Data has already been normalized.")
  return self

  # Use continuous features columns
@@ -232,7 +232,7 @@ class DatasetMaker(_BaseMaker):
  def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Splits the data into training and testing sets."""
  if self._is_split:
- _LOGGER.warning("Data has already been split.")
+ _LOGGER.warning("⚠️ Data has already been split.")
  return self

  if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':
@@ -260,9 +260,9 @@ class DatasetMaker(_BaseMaker):
  Defaults to `SMOTETomek`.
  """
  if not self._is_split:
- raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
+ raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
  if self._is_balanced:
- _LOGGER.warning("Training data has already been balanced.")
+ _LOGGER.warning("⚠️ Training data has already been balanced.")
  return self

  if resampler is None:
@@ -278,13 +278,13 @@ class DatasetMaker(_BaseMaker):
  def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
  balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Runs a standard, fully automated preprocessing pipeline."""
- _LOGGER.info("--- Running Automated Processing Pipeline ---")
+ _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
  self.process_categoricals(method=cat_method)
  self.split_data(test_size=test_size, stratify=True, random_state=random_state)
  self.normalize_continuous(method=normalize_method)
  if balance:
  self.balance_data()
- _LOGGER.info("--- Automated Processing Complete ---")
+ _LOGGER.info("--- 🤖 Automated Processing Complete ---")
  return self

  def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
@@ -400,10 +400,7 @@ class VisionDatasetMaker(_BaseMaker):
  Logs a report of the types, sizes, and channels of image files
  found in the directory and its subdirectories.
  """
- path_obj = Path(path)
- if not path_obj.is_dir():
- _LOGGER.error(f"Path is not a valid directory: {path_obj}")
- return
+ path_obj = make_fullpath(path)

  non_image_files = set()
  img_types = set()
@@ -505,7 +502,7 @@ class VisionDatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Data has not been split. Call .split_data() first.")
  if not self._are_transforms_configured:
- _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
+ _LOGGER.warning("⚠️ Transforms have not been configured. Using default ToTensor only.")

  if self._test_dataset:
  return self._train_dataset, self._val_dataset, self._test_dataset
@@ -555,7 +552,7 @@ class SequenceMaker(_BaseMaker):
  raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")

  if self.scaler:
- _LOGGER.warning("Data has already been normalized.")
+ _LOGGER.warning("⚠️ Data has already been normalized.")
  return self

  if method == "standard":
@@ -579,7 +576,7 @@ class SequenceMaker(_BaseMaker):
  def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
  """Splits the sequence into training and testing portions."""
  if self._is_split:
- _LOGGER.warning("Data has already been split.")
+ _LOGGER.warning("⚠️ Data has already been split.")
  return self

  split_idx = int(len(self.sequence) * (1 - test_size))
ml_tools/ensemble_learning.py CHANGED
@@ -915,7 +915,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
  datasets_path = make_fullpath(datasets_dir)
  save_path = make_fullpath(save_dir, make=True)

- _LOGGER.info("Training starting...")
+ _LOGGER.info("🏁 Training starting...")
  #Yield imputed dataset
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
  #Yield features dataframe and target dataframe
@@ -933,7 +933,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
  test_features=X_test, test_target=y_test,
  feature_names=feature_names,target_name=target_name,
  debug=debug, save_dir=save_path, save_model=save_model)
- print("")
+
  _LOGGER.info("✅ Training and evaluation complete.")


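The hunk header only shows the first parameters of run_ensemble_pipeline (the rest are truncated in this diff), but the body makes clear it walks every dataset CSV in datasets_dir and writes trained models and evaluation output under save_dir. A hedged sketch using only the parameters visible here (other keyword arguments such as debug and save_model appear in the body but are elided above):

    from ml_tools.ensemble_learning import run_ensemble_pipeline

    run_ensemble_pipeline(datasets_dir="datasets", save_dir="ensemble_output")
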
ml_tools/logger.py CHANGED
@@ -10,7 +10,6 @@ import logging
  import sys


-
  __all__ = [
  "custom_logger"
  ]
@@ -85,10 +84,10 @@ def custom_logger(
  else:
  raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

- _LOGGER.info(f"Log saved to: '{base_path}'")
+ _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")

  except Exception as e:
- _LOGGER.error(f"Log not saved: {e}")
+ _LOGGER.error(f"Log not saved: {e}")


  def _log_list_to_txt(data: List[Any], path: Path) -> None:
@@ -176,7 +175,7 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
  handler = logging.StreamHandler(sys.stdout)

  # Define the format string and the date format separately
- log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ log_format = '\n🐉%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute

  # Pass both the format and the date format to the Formatter
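
To see what the new format string produces, here is a self-contained standard-library sketch; it only reproduces the format shown above and is not the toolbox's own _get_logger implementation:

    import logging
    import sys

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(
        '\n🐉%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M',
    ))

    logger = logging.getLogger("ml_tools")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.info("Hello")  # prints a blank line, then "🐉2025-01-01 12:00 - ml_tools - INFO - Hello"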
ml_tools/utilities.py CHANGED
@@ -640,7 +640,7 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
  print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
  continue

- print(f"{total_saved} single-target datasets were created.")
+ print(f"\n✅ {total_saved} single-target datasets were created.")


  class LogKeys:
@@ -1,25 +0,0 @@
- dragon_ml_toolbox-3.5.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-3.5.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
- ml_tools/ETL_engineering.py,sha256=URol7s45fVIdLqnhyOU1Etbi-D7MksFg-qtNwsKiunY,39488
- ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
- ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
- ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
- ml_tools/ML_evaluation.py,sha256=3xOqVXLJDhbioKZ922yxFnSuO4VDQ-HFzZyZZ1MskVM,10054
- ml_tools/ML_trainer.py,sha256=zRs3crz_z4B285iJhmY7m4AFwnvvq4urOyl4zDuCLtA,14456
- ml_tools/ML_tutorial.py,sha256=-9tJO9ISPxEjRINVaF_Bu7tiiJ2W3zznQ4gNlZeP1HQ,12238
- ml_tools/PSO_optimization.py,sha256=RCvIFGyf28voo2mpbRKC6LfDzKslzY-aYoPwgv9F4Bg,25458
- ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
- ml_tools/VIF_factor.py,sha256=4b3HmrrolN7ZIAo16TWwLlExqj_xaa8MxbkXD1xPCys,10295
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
- ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/data_exploration.py,sha256=41q0ux4rsf6ktQDzX1haYOk0iRZzmNucrHRi_rqlNLs,25013
- ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
- ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
- ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
- ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
- ml_tools/utilities.py,sha256=7cVWXjdxgSoIbZunuxJEOnJDSYp29liYsZexbrVDabs,23132
- dragon_ml_toolbox-3.5.1.dist-info/METADATA,sha256=F1RicIFxIpnKadElu8EU_k6P0FYKwGPRjHF2YXe9F6E,3273
- dragon_ml_toolbox-3.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-3.5.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-3.5.1.dist-info/RECORD,,