dragon-ml-toolbox 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.5.1
+ Version: 3.7.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -0,0 +1,25 @@
+ dragon_ml_toolbox-3.7.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-3.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ ml_tools/ETL_engineering.py,sha256=yeZsW_7zRvEcuMZbM4E2GV1dxwBoWIeJAcFFk2AK0fY,39502
+ ml_tools/GUI_tools.py,sha256=3kRxok-QCN5S0q1i7yK137Bsr6c2N4M4nIvgPVAuZU0,20371
+ ml_tools/MICE_imputation.py,sha256=rYqvwQDVtoAJJ0agXWoGzoZEHedWiA6QzcEKEIkiZ08,11388
+ ml_tools/ML_callbacks.py,sha256=OT2zwORLcn49megBEgXsSUxDHoW0Ft0_v7hLEVF3jHM,13063
+ ml_tools/ML_evaluation.py,sha256=oiDV6HItQloUUKCUpltV-2pogubWLBieGpc-VUwosAQ,10106
+ ml_tools/ML_trainer.py,sha256=Pw4tLtlexoZJs_3o5I6ElQMTLjijzydXXQE834949Dw,14470
+ ml_tools/ML_tutorial.py,sha256=-9tJO9ISPxEjRINVaF_Bu7tiiJ2W3zznQ4gNlZeP1HQ,12238
+ ml_tools/PSO_optimization.py,sha256=c23Fd-ttqoO8IBPK5-TXZLqPi9UPHUC4HNoF02Q8wLo,24774
+ ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
+ ml_tools/VIF_factor.py,sha256=BeP4ig3l7b1Igwgte9z8rEwHdSZvVT7W_9mcBHGoNJw,10299
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
+ ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/data_exploration.py,sha256=M7bn2q5XN9zJZJGAmMMFSFFZh8LGzC2arFelrXw3N6Q,25241
+ ml_tools/datasetmaster.py,sha256=S3PKHNQZ9cyAOck8xQltVLZhaD1gFLfgHFL-aRjz4JU,30077
+ ml_tools/ensemble_learning.py,sha256=CDSIygnHaNe92aJ46Fofevd7q6lowTnE98yWuIV3Y6w,37462
+ ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
+ ml_tools/logger.py,sha256=UkbiU9ihBhw9VKyn3rZzisdClWV94EBV6B09_D0iUU0,6026
+ ml_tools/utilities.py,sha256=0w0vka0Aj9IYOHJ6crWIb6gwpQIJnPyj3v2_dnVxHrs,23138
+ dragon_ml_toolbox-3.7.0.dist-info/METADATA,sha256=kvgFjd_BRwob7xycC5rbROCkq4C6FVq3J5-VdCXEPrI,3273
+ dragon_ml_toolbox-3.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-3.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-3.7.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -294,7 +294,7 @@ class DataProcessor:
  raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")

  if not processed_columns:
- _LOGGER.warning("The transformation resulted in an empty DataFrame.")
+ _LOGGER.warning("⚠️ The transformation resulted in an empty DataFrame.")
  return pl.DataFrame()

  return pl.DataFrame(processed_columns)
@@ -588,7 +588,7 @@ class NumberExtractor:
  if not isinstance(round_digits, int):
  raise TypeError("round_digits must be an integer.")
  if dtype == "int":
- _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
+ _LOGGER.warning(f"⚠️ 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")

  self.regex_pattern = regex_pattern
  self.dtype = dtype
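
For reference, the warning above fires when a caller asks for rounding on an integer output. A minimal sketch of that situation, assuming the constructor accepts the regex_pattern, dtype, and round_digits arguments visible in this hunk (hypothetical usage, not taken from the package docs):

    from ml_tools.ETL_engineering import NumberExtractor

    # dtype="int" makes round_digits meaningless, so the ⚠️ warning is logged once
    extractor = NumberExtractor(
        regex_pattern=r"(\d+\.?\d*)",  # capture the first number found in a string
        dtype="int",
        round_digits=2,
    )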
ml_tools/GUI_tools.py CHANGED
@@ -148,7 +148,7 @@ class ConfigManager:
  """
  path = Path(file_path)
  if path.exists() and not force_overwrite:
- _LOGGER.warning(f"Configuration file already exists at {path}. Aborting.")
+ _LOGGER.warning(f"⚠️ Configuration file already exists at {path}. Aborting.")
  return

  config = configparser.ConfigParser()
@@ -206,7 +206,7 @@ class ConfigManager:

  with open(path, 'w') as configfile:
  config.write(configfile)
- _LOGGER.info(f"Successfully generated config template at: '{path}'")
+ _LOGGER.info(f"📝 Successfully generated config template at: '{path}'")


  # --- GUI Factory ---
@@ -482,13 +482,12 @@ def update_target_fields(window: sg.Window, results_dict: Dict[str, Any]):

  Args:
  window (sg.Window): The application's window object.
- results_dict (dict): A dictionary where keys are target names (without the
- 'TARGET_' prefix) and values are the predicted results.
+ results_dict (dict): A dictionary where keys are target key names (including 'TARGET_' prefix if necessary) and values are the predicted results.
  """
  for target_name, result in results_dict.items():
  # Format numbers to 2 decimal places, leave other types as-is
  display_value = f"{result:.2f}" if isinstance(result, (int, float)) else result
- window[f'TARGET_{target_name}'].update(display_value)
+ window[target_name].update(display_value)


  def info():
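
The last hunk above shifts responsibility for the element key to the caller: the dictionary key is now used verbatim when looking up the window element, instead of being prefixed with 'TARGET_' inside the function. A hedged sketch of the new calling convention (assuming sg is PySimpleGUI or a compatible fork; layout and key names are illustrative):

    import PySimpleGUI as sg
    from ml_tools.GUI_tools import update_target_fields

    window = sg.Window("Demo", [[sg.Text("", key="TARGET_strength")]], finalize=True)

    # The caller now supplies the full element key; numeric results are shown as "42.12"
    update_target_fields(window, {"TARGET_strength": 42.1234})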
ml_tools/MICE_imputation.py CHANGED
@@ -128,7 +128,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  plt.savefig(save_path, bbox_inches='tight', format="svg")
  plt.close()

- _LOGGER.info(f"{dataset_file_dir} completed.")
+ _LOGGER.info(f"{dataset_file_dir} process completed.")


  # Imputed distributions
@@ -213,7 +213,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  fig = kernel.plot_imputed_distributions(variables=[feature])
  _process_figure(fig, feature)

- _LOGGER.info(f"{local_dir_name} completed.")
+ _LOGGER.info(f"{local_dir_name} completed.")


  def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
ml_tools/ML_callbacks.py CHANGED
@@ -178,7 +178,6 @@ class EarlyStopping(Callback):
  self.stopped_epoch = epoch
  self.trainer.stop_training = True # type: ignore
  if self.verbose > 0:
- print("")
  _LOGGER.info(f"Epoch {epoch+1}: early stopping after {self.wait} epochs with no improvement.")


@@ -256,7 +255,6 @@ class ModelCheckpoint(Callback):
  new_filepath = self.save_dir / filename

  if self.verbose > 0:
- print("")
  _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")

  # Save the new best model
@@ -276,7 +274,6 @@ class ModelCheckpoint(Callback):
  filepath = self.save_dir / filename

  if self.verbose > 0:
- print("")
  _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
  torch.save(self.trainer.model.state_dict(), filepath) # type: ignore

@@ -325,7 +322,6 @@ class LRScheduler(Callback):
  if metric_val is not None:
  self.scheduler.step(metric_val)
  else:
- print("")
  _LOGGER.warning(f"LRScheduler could not find metric '{self.monitor}' in logs.")

  # For all other schedulers
@@ -335,7 +331,6 @@ class LRScheduler(Callback):
  # Log the change if the LR was updated
  current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
  if current_lr != self.previous_lr:
- print("")
  _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
  self.previous_lr = current_lr

ml_tools/ML_evaluation.py CHANGED
@@ -65,7 +65,7 @@ def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
  save_dir_path = make_fullpath(save_dir, make=True)
  save_path = save_dir_path / "loss_plot.svg"
  plt.savefig(save_path)
- _LOGGER.info(f"Loss plot saved as '{save_path.name}'")
+ _LOGGER.info(f"📉 Loss plot saved as '{save_path.name}'")
  else:
  plt.show()
  plt.close(fig)
@@ -92,7 +92,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  # Save text report
  report_path = save_dir_path / "classification_report.txt"
  report_path.write_text(report, encoding="utf-8")
- _LOGGER.info(f"Classification report saved as '{report_path.name}'")
+ _LOGGER.info(f"📝 Classification report saved as '{report_path.name}'")

  # Save Confusion Matrix
  fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
@@ -100,7 +100,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  ax_cm.set_title("Confusion Matrix")
  cm_path = save_dir_path / "confusion_matrix.svg"
  plt.savefig(cm_path)
- _LOGGER.info(f"Confusion matrix saved as '{cm_path.name}'")
+ _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
  plt.close(fig_cm)

  # Save ROC Curve
@@ -117,7 +117,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
  ax_roc.grid(True)
  roc_path = save_dir_path / "roc_curve.svg"
  plt.savefig(roc_path)
- _LOGGER.info(f"ROC curve saved as '{roc_path.name}'")
+ _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
  plt.close(fig_roc)
  else:
  # Show plots if not saving
@@ -162,7 +162,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  # Save text report
  report_path = save_dir_path / "regression_report.txt"
  report_path.write_text(report_string)
- _LOGGER.info(f"Regression report saved as '{report_path.name}'")
+ _LOGGER.info(f"📝 Regression report saved as '{report_path.name}'")

  # Save residual plot
  residuals = y_true - y_pred
@@ -176,7 +176,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.tight_layout()
  res_path = save_dir_path / "residual_plot.svg"
  plt.savefig(res_path)
- _LOGGER.info(f"Residual plot saved as '{res_path.name}'")
+ _LOGGER.info(f"📈 Residual plot saved as '{res_path.name}'")
  plt.close(fig_res)

  # Save true vs predicted plot
@@ -190,7 +190,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.tight_layout()
  tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
  plt.savefig(tvp_path)
- _LOGGER.info(f"True vs. Predicted plot saved as '{tvp_path.name}'")
+ _LOGGER.info(f"📉 True vs. Predicted plot saved as '{tvp_path.name}'")
  plt.close(fig_tvp)


@@ -227,7 +227,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(bar_path)
- _LOGGER.info(f"SHAP bar plot saved as '{bar_path.name}'")
+ _LOGGER.info(f"📊 SHAP bar plot saved as '{bar_path.name}'")
  plt.close()

  # Save Dot Plot
@@ -236,7 +236,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(dot_path)
- _LOGGER.info(f"SHAP dot plot saved as '{dot_path.name}'")
+ _LOGGER.info(f"📊 SHAP dot plot saved as '{dot_path.name}'")
  plt.close()

  # Save Summary Data to CSV
@@ -249,7 +249,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  'mean_abs_shap_value': mean_abs_shap
  }).sort_values('mean_abs_shap_value', ascending=False)
  summary_df.to_csv(summary_path, index=False)
- _LOGGER.info(f"SHAP summary data saved as '{summary_path.name}'")
+ _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
  else:
  _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
  shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
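
These hunks only retouch log messages, but they also document which artifacts regression_metrics writes when a save directory is given (regression_report.txt, residual_plot.svg, true_vs_predicted_plot.svg). A minimal sketch of a call, assuming the signature shown in the hunk header and a path-like save_dir (values are illustrative):

    import numpy as np
    from ml_tools.ML_evaluation import regression_metrics

    y_true = np.array([3.0, 2.5, 4.1, 5.0])
    y_pred = np.array([2.8, 2.7, 3.9, 5.2])

    # Writes the text report and both SVG plots into ./eval_output
    regression_metrics(y_true, y_pred, save_dir="eval_output")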
ml_tools/ML_trainer.py CHANGED
@@ -72,10 +72,10 @@ class MyTrainer:
  """Validates the selected device and returns a torch.device object."""
  device_lower = device.lower()
  if "cuda" in device_lower and not torch.cuda.is_available():
- _LOGGER.warning("CUDA not available, switching to CPU.")
+ _LOGGER.warning("⚠️ CUDA not available, switching to CPU.")
  device = "cpu"
  elif device_lower == "mps" and not torch.backends.mps.is_available():
- _LOGGER.warning("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
+ _LOGGER.warning("⚠️ Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
  device = "cpu"
  return torch.device(device)

ml_tools/PSO_optimization.py CHANGED
@@ -22,7 +22,6 @@ import torch
  from tqdm import trange
  import matplotlib.pyplot as plt
  import seaborn as sns
- from collections import defaultdict
  from .logger import _LOGGER


@@ -307,7 +306,7 @@ def run_pso(lower_boundaries: list[float],
  else:
  device = torch.device("cpu")

- _LOGGER.info(f"Using device: '{device}'")
+ _LOGGER.info(f"👾 Using device: '{device}'")

  # set local deep copies to prevent in place list modification
  local_lower_boundaries = deepcopy(lower_boundaries)
@@ -511,13 +510,13 @@ def _pso(func: ObjectiveFunction,
  return best_position, best_score


- def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
  """
  Analyzes optimization results and plots the distribution of optimal values for each feature.

- This function can operate in two modes based on the `color_by_target` parameter:
- 1. Aggregates all values for a feature into a single group and plots one overall distribution (histogram + KDE).
- 2. Color-coded: Plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
+ For features with more than two unique values, this function generates a color-coded
+ Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+ showing relative frequency.

  Parameters
  ----------
@@ -525,76 +524,69 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
  The path to the directory containing the optimization result CSV files.
  save_dir : str or Path
  The directory where the output plots will be saved.
- color_by_target : bool, optional
- If True, generates comparative plots with distributions colored by their source target.
  """
- mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
- _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
-
- # Check results_dir
+ # Check results_dir and create output path
  results_path = make_fullpath(results_dir)
- # make output path
  output_path = make_fullpath(save_dir, make=True)

  all_csvs = list_csv_paths(results_path)
-
  if not all_csvs:
- _LOGGER.warning("No data found. No plots will be generated.")
+ _LOGGER.warning("⚠️ No data found. No plots will be generated.")
  return

- # --- MODE 1: Color-coded plots by target ---
- if color_by_target:
- data_to_plot = []
- for df, df_name in yield_dataframes_from_dir(results_path):
- # Assumes last col is target, rest are features
- melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
- # Sanitize target name for cleaner legend labels
- melted_df['target'] = df_name.replace("Optimization_", "")
- data_to_plot.append(melted_df)
-
- long_df = pd.concat(data_to_plot, ignore_index=True)
- features = long_df['feature'].unique()
- _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
-
- for feature_name in features:
- plt.figure(figsize=(12, 7))
- feature_df = long_df[long_df['feature'] == feature_name]
+ # --- Data Loading and Preparation ---
+ _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+ data_to_plot = []
+ for df, df_name in yield_dataframes_from_dir(results_path):
+ melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+ melted_df['target'] = df_name.replace("Optimization_", "")
+ data_to_plot.append(melted_df)
+
+ long_df = pd.concat(data_to_plot, ignore_index=True)
+ features = long_df['feature'].unique()
+ _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+ # --- Plotting Loop ---
+ for feature_name in features:
+ plt.figure(figsize=(12, 7))
+ feature_df = long_df[long_df['feature'] == feature_name]
+
+ # Check if the feature is binary or constant
+ if feature_df['value'].nunique() <= 2:
+ # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+ # This ensures the X-axis is clean (e.g., just 0 and 1).
+ norm_df = (feature_df.groupby('target')['value']
+ .value_counts(normalize=True)
+ .mul(100)
+ .rename('percent')
+ .reset_index())

- sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
+ ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')

- plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
- plt.xlabel("Feature Value", fontsize=12)
- plt.ylabel("Density", fontsize=12)
- plt.grid(axis='y', alpha=0.5, linestyle='--')
- plt.legend(title='Target')
-
- sanitized_feature_name = sanitize_filename(feature_name)
- plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
- plt.savefig(plot_filename, bbox_inches='tight')
- plt.close()
-
- # --- MODE 2: Aggregate plot ---
- else:
- feature_distributions = defaultdict(list)
- for df, _ in yield_dataframes_from_dir(results_path):
- feature_columns = df.iloc[:, :-1]
- for feature_name in feature_columns:
- feature_distributions[feature_name].extend(df[feature_name].tolist())
+ plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+ plt.ylabel("Frequency (%)", fontsize=12)
+ ax.set_ylim(0, 100) # Set Y-axis from 0 to 100
+
+ else:
+ # PLOT 2: KDE plot for continuous values.
+ ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+ fill=True, alpha=0.1, warn_singular=False)
+
+ plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+ plt.ylabel("Density", fontsize=12) # Y-axis is "Density" for KDE plots
+
+ # --- Common settings for both plot types ---
+ plt.xlabel("Feature Value", fontsize=12)
+ plt.grid(axis='y', alpha=0.5, linestyle='--')

- _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
- for feature_name, values in feature_distributions.items():
- plt.figure(figsize=(12, 7))
- sns.histplot(x=values, kde=True, bins='auto', stat="density")
-
- plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
- plt.xlabel("Feature Value", fontsize=12)
- plt.ylabel("Density", fontsize=12)
- plt.grid(axis='y', alpha=0.5, linestyle='--')
-
- sanitized_feature_name = sanitize_filename(feature_name)
- plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
- plt.savefig(plot_filename, bbox_inches='tight')
- plt.close()
+ legend = ax.get_legend()
+ if legend:
+ legend.set_title('Target')
+
+ sanitized_feature_name = sanitize_filename(feature_name)
+ plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+ plt.savefig(plot_filename, bbox_inches='tight')
+ plt.close()

  _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")

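The rewrite above drops the color_by_target flag entirely: the function now decides per feature between a KDE plot (more than two unique values) and a percentage bar plot (binary or constant features). A sketch of a call against the new signature (directory names are illustrative):

    from ml_tools.PSO_optimization import plot_optimal_feature_distributions

    # Reads the optimization result CSVs in results_dir and writes one
    # "Distribution_<feature>.svg" per feature into save_dir.
    plot_optimal_feature_distributions(results_dir="pso_results", save_dir="pso_plots")
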
ml_tools/VIF_factor.py CHANGED
@@ -168,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10

  # Identify features to drop
  to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
- _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+ _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")

  result_df = df.drop(columns=to_drop)

  if result_df.empty:
- _LOGGER.warning(f"\t⚠️ All columns were dropped.")
+ _LOGGER.warning(f"⚠️ All columns were dropped.")

  return result_df, to_drop

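The hunk shows the expected shape of vif_df (a "feature" column and a "VIF" column) and that the function returns both the reduced frame and the dropped column names. A self-contained sketch with a hand-built vif_df (how the toolbox computes the VIF values is not part of this diff):

    import pandas as pd
    from ml_tools.VIF_factor import drop_vif_based

    df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6], "c": [5, 1, 7]})
    vif_df = pd.DataFrame({"feature": ["a", "b", "c"], "VIF": [25.0, 30.0, 2.0]})

    # Drops every feature whose VIF exceeds the threshold
    result_df, dropped = drop_vif_based(df, vif_df, threshold=10)
    # dropped == ["a", "b"]; result_df keeps only column "c"
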
ml_tools/data_exploration.py CHANGED
@@ -15,7 +15,7 @@ import re
  # Keep track of all available tools, show using `info()`
  __all__ = [
  "summarize_dataframe",
- "drop_zero_only_columns",
+ "drop_constant_columns",
  "drop_rows_with_missing_data",
  "split_features_targets",
  "show_null_columns",
@@ -62,44 +62,50 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  return summary


- def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
+ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
  """
- Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.
+ Removes columns from a pandas DataFrame that contain only a single unique
+ value or are entirely null/NaN.

- This utility is useful for cleaning data after dummification steps that may result in empty columns.
+ This utility is useful for cleaning data by removing constant features that
+ have no predictive value.

  Args:
  df (pd.DataFrame):
  The pandas DataFrame to clean.
+ verbose (bool):
+ If True, prints the names of the columns that were dropped.
+ Defaults to True.

  Returns:
  pd.DataFrame:
- A new DataFrame with the empty columns removed.
+ A new DataFrame with the constant columns removed.
  """
  if not isinstance(df, pd.DataFrame):
  raise TypeError("Input must be a pandas DataFrame.")
-
+
  original_columns = set(df.columns)
-
  cols_to_keep = []
+
  for col_name in df.columns:
  column = df[col_name]

- # Keep any column that is not numeric by default
- if not is_numeric_dtype(column):
+ # We can apply this logic to all columns or only focus on numeric ones.
+ # if not is_numeric_dtype(column):
+ # cols_to_keep.append(col_name)
+ # continue
+
+ # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
+ if column.nunique(dropna=True) > 1:
  cols_to_keep.append(col_name)
- continue

- # For numeric columns, check if there's at least one non-zero value.
- if (column.fillna(0) != 0).any():
- cols_to_keep.append(col_name)
-
- dropped_columns = original_columns - set(cols_to_keep)
- if dropped_columns and verbose:
- print(f"Dropped {len(dropped_columns)} columns:")
- for dropped_column in dropped_columns:
- print(f" {dropped_column}")
-
+ dropped_columns = original_columns - set(cols_to_keep)
+ if verbose:
+ print(f"🧹 Dropped {len(dropped_columns)} constant columns.")
+ if dropped_columns:
+ for dropped_column in dropped_columns:
+ print(f" {dropped_column}")
+
  return df[cols_to_keep]

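Because the new implementation keys on nunique(dropna=True), a constant column and an all-NaN column are both dropped regardless of dtype, while any varying column (numeric or not) is kept. A quick, self-contained illustration:

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_constant_columns

    df = pd.DataFrame({
        "constant": [1, 1, 1],        # single unique value -> dropped
        "all_nan": [np.nan] * 3,      # entirely NaN -> dropped
        "label": ["a", "a", "b"],     # non-numeric but varying -> kept
        "feature": [0.1, 0.2, 0.3],   # varying numeric -> kept
    })

    cleaned = drop_constant_columns(df)  # prints "🧹 Dropped 2 constant columns." plus the names
    # list(cleaned.columns) == ["label", "feature"]
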
ml_tools/datasetmaster.py CHANGED
@@ -13,7 +13,7 @@ from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
  from pathlib import Path
- from .utilities import _script_info
+ from .utilities import _script_info, make_fullpath
  from .logger import _LOGGER


@@ -204,7 +204,7 @@ class DatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
  if self._is_normalized:
- _LOGGER.warning("Data has already been normalized.")
+ _LOGGER.warning("⚠️ Data has already been normalized.")
  return self

  # Use continuous features columns
@@ -232,7 +232,7 @@ class DatasetMaker(_BaseMaker):
  def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Splits the data into training and testing sets."""
  if self._is_split:
- _LOGGER.warning("Data has already been split.")
+ _LOGGER.warning("⚠️ Data has already been split.")
  return self

  if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':
@@ -260,9 +260,9 @@ class DatasetMaker(_BaseMaker):
  Defaults to `SMOTETomek`.
  """
  if not self._is_split:
- raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
+ raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
  if self._is_balanced:
- _LOGGER.warning("Training data has already been balanced.")
+ _LOGGER.warning("⚠️ Training data has already been balanced.")
  return self

  if resampler is None:
@@ -278,13 +278,13 @@ class DatasetMaker(_BaseMaker):
  def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
  balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Runs a standard, fully automated preprocessing pipeline."""
- _LOGGER.info("--- Running Automated Processing Pipeline ---")
+ _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
  self.process_categoricals(method=cat_method)
  self.split_data(test_size=test_size, stratify=True, random_state=random_state)
  self.normalize_continuous(method=normalize_method)
  if balance:
  self.balance_data()
- _LOGGER.info("--- Automated Processing Complete ---")
+ _LOGGER.info("--- 🤖 Automated Processing Complete ---")
  return self

  def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
@@ -400,10 +400,7 @@ class VisionDatasetMaker(_BaseMaker):
  Logs a report of the types, sizes, and channels of image files
  found in the directory and its subdirectories.
  """
- path_obj = Path(path)
- if not path_obj.is_dir():
- _LOGGER.error(f"Path is not a valid directory: {path_obj}")
- return
+ path_obj = make_fullpath(path)

  non_image_files = set()
  img_types = set()
@@ -505,7 +502,7 @@ class VisionDatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Data has not been split. Call .split_data() first.")
  if not self._are_transforms_configured:
- _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
+ _LOGGER.warning("⚠️ Transforms have not been configured. Using default ToTensor only.")

  if self._test_dataset:
  return self._train_dataset, self._val_dataset, self._test_dataset
@@ -555,7 +552,7 @@ class SequenceMaker(_BaseMaker):
  raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")

  if self.scaler:
- _LOGGER.warning("Data has already been normalized.")
+ _LOGGER.warning("⚠️ Data has already been normalized.")
  return self

  if method == "standard":
@@ -579,7 +576,7 @@ class SequenceMaker(_BaseMaker):
  def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
  """Splits the sequence into training and testing portions."""
  if self._is_split:
- _LOGGER.warning("Data has already been split.")
+ _LOGGER.warning("⚠️ Data has already been split.")
  return self

  split_idx = int(len(self.sequence) * (1 - test_size))
ml_tools/ensemble_learning.py CHANGED
@@ -915,7 +915,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
  datasets_path = make_fullpath(datasets_dir)
  save_path = make_fullpath(save_dir, make=True)

- _LOGGER.info("Training starting...")
+ _LOGGER.info("🏁 Training starting...")
  #Yield imputed dataset
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
  #Yield features dataframe and target dataframe
@@ -933,7 +933,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
  test_features=X_test, test_target=y_test,
  feature_names=feature_names,target_name=target_name,
  debug=debug, save_dir=save_path, save_model=save_model)
- print("")
+
  _LOGGER.info("✅ Training and evaluation complete.")


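The hunk header only shows the first parameters of run_ensemble_pipeline (the rest are truncated in this diff), but the body makes clear it walks every dataset CSV in datasets_dir and writes trained models and evaluation output under save_dir. A hedged sketch using only the parameters visible here (other keyword arguments such as debug and save_model appear in the body but are elided above):

    from ml_tools.ensemble_learning import run_ensemble_pipeline

    run_ensemble_pipeline(datasets_dir="datasets", save_dir="ensemble_output")
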
ml_tools/logger.py CHANGED
@@ -10,7 +10,6 @@ import logging
  import sys


-
  __all__ = [
  "custom_logger"
  ]
@@ -85,10 +84,10 @@ def custom_logger(
  else:
  raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

- _LOGGER.info(f"Log saved to: '{base_path}'")
+ _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")

  except Exception as e:
- _LOGGER.error(f"Log not saved: {e}")
+ _LOGGER.error(f"Log not saved: {e}")


  def _log_list_to_txt(data: List[Any], path: Path) -> None:
@@ -176,7 +175,7 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
  handler = logging.StreamHandler(sys.stdout)

  # Define the format string and the date format separately
- log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ log_format = '\n🐉%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute

  # Pass both the format and the date format to the Formatter
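
To see what the new format string produces, here is a self-contained standard-library sketch; it only reproduces the format shown above and is not the toolbox's own _get_logger implementation:

    import logging
    import sys

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(
        '\n🐉%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M',
    ))

    logger = logging.getLogger("ml_tools")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.info("Hello")  # prints a blank line, then "🐉2025-01-01 12:00 - ml_tools - INFO - Hello"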
ml_tools/utilities.py CHANGED
@@ -640,7 +640,7 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
  print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
  continue

- print(f"{total_saved} single-target datasets were created.")
+ print(f"\n✅ {total_saved} single-target datasets were created.")


  class LogKeys:
@@ -1,25 +0,0 @@
- dragon_ml_toolbox-3.5.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-3.5.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
- ml_tools/ETL_engineering.py,sha256=URol7s45fVIdLqnhyOU1Etbi-D7MksFg-qtNwsKiunY,39488
- ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
- ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
- ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
- ml_tools/ML_evaluation.py,sha256=3xOqVXLJDhbioKZ922yxFnSuO4VDQ-HFzZyZZ1MskVM,10054
- ml_tools/ML_trainer.py,sha256=zRs3crz_z4B285iJhmY7m4AFwnvvq4urOyl4zDuCLtA,14456
- ml_tools/ML_tutorial.py,sha256=-9tJO9ISPxEjRINVaF_Bu7tiiJ2W3zznQ4gNlZeP1HQ,12238
- ml_tools/PSO_optimization.py,sha256=RCvIFGyf28voo2mpbRKC6LfDzKslzY-aYoPwgv9F4Bg,25458
- ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
- ml_tools/VIF_factor.py,sha256=4b3HmrrolN7ZIAo16TWwLlExqj_xaa8MxbkXD1xPCys,10295
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
- ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/data_exploration.py,sha256=41q0ux4rsf6ktQDzX1haYOk0iRZzmNucrHRi_rqlNLs,25013
- ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
- ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
- ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
- ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
- ml_tools/utilities.py,sha256=7cVWXjdxgSoIbZunuxJEOnJDSYp29liYsZexbrVDabs,23132
- dragon_ml_toolbox-3.5.1.dist-info/METADATA,sha256=F1RicIFxIpnKadElu8EU_k6P0FYKwGPRjHF2YXe9F6E,3273
- dragon_ml_toolbox-3.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-3.5.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-3.5.1.dist-info/RECORD,,