dragon-ml-toolbox 3.6.0__tar.gz → 3.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-3.6.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.8.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ETL_engineering.py +2 -2
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/GUI_tools.py +22 -84
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/MICE_imputation.py +2 -2
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ML_callbacks.py +0 -5
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ML_evaluation.py +10 -10
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ML_trainer.py +2 -2
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/PSO_optimization.py +57 -65
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/VIF_factor.py +2 -2
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/data_exploration.py +5 -4
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/datasetmaster.py +11 -14
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ensemble_learning.py +2 -2
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/logger.py +3 -4
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/utilities.py +208 -4
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/LICENSE +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/README.md +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/_particle_swarm_optimization.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-3.6.0 → dragon_ml_toolbox-3.8.0}/setup.cfg +0 -0
ml_tools/ETL_engineering.py

@@ -294,7 +294,7 @@ class DataProcessor:
             raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
 
         if not processed_columns:
-            _LOGGER.warning("The transformation resulted in an empty DataFrame.")
+            _LOGGER.warning("⚠️ The transformation resulted in an empty DataFrame.")
             return pl.DataFrame()
 
         return pl.DataFrame(processed_columns)

@@ -588,7 +588,7 @@ class NumberExtractor:
         if not isinstance(round_digits, int):
             raise TypeError("round_digits must be an integer.")
         if dtype == "int":
-            _LOGGER.warning(f"'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
+            _LOGGER.warning(f"⚠️ 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
 
         self.regex_pattern = regex_pattern
         self.dtype = dtype
ml_tools/GUI_tools.py

@@ -4,14 +4,13 @@ from typing import Optional, Callable, Any
 import traceback
 import FreeSimpleGUI as sg
 from functools import wraps
-from typing import Any, Dict, Tuple, List
+from typing import Any, Dict, Tuple, List, Literal
 from .utilities import _script_info
 import numpy as np
 from .logger import _LOGGER
 
 
 __all__ = [
-    "PathManager",
     "ConfigManager",
     "GUIFactory",
     "catch_exceptions",

@@ -19,68 +18,6 @@ __all__ = [
     "update_target_fields"
 ]
 
-
-# --- Path Management ---
-class PathManager:
-    """
-    Manages paths for a Python application, supporting both development mode and bundled mode via Briefcase.
-    """
-    def __init__(self, anchor_file: str):
-        """
-        Initializes the PathManager. The package name is automatically inferred
-        from the parent directory of the anchor file.
-
-        Args:
-            anchor_file (str): The absolute path to a file within the project's
-                               package, typically `__file__` from a module inside
-                               that package (paths.py).
-
-        Note:
-            This inference assumes that the anchor file's parent directory
-            has the same name as the package (e.g., `.../src/my_app/paths.py`).
-            This is a standard and recommended project structure.
-        """
-        resolved_anchor_path = Path(anchor_file).resolve()
-        self.package_name = resolved_anchor_path.parent.name
-        self._is_bundled, self._resource_path_func = self._check_bundle_status()
-
-        if self._is_bundled:
-            # In a Briefcase bundle, resource_path gives an absolute path
-            # to the resource directory.
-            self.package_root = self._resource_path_func(self.package_name, "") # type: ignore
-        else:
-            # In development mode, the package root is the directory
-            # containing the anchor file.
-            self.package_root = resolved_anchor_path.parent
-
-    def _check_bundle_status(self) -> tuple[bool, Optional[Callable]]:
-        """Checks if the app is running in a bundled environment."""
-        try:
-            # This is the function Briefcase provides in a bundled app
-            from briefcase.platforms.base import resource_path # type: ignore
-            return True, resource_path
-        except ImportError:
-            return False, None
-
-    def get_path(self, relative_path: str | Path) -> Path:
-        """
-        Gets the absolute path for a given resource file or directory
-        relative to the package root.
-
-        Args:
-            relative_path (str | Path): The path relative to the package root (e.g., 'helpers/icon.png').
-
-        Returns:
-            Path: The absolute path to the resource.
-        """
-        if self._is_bundled:
-            # Briefcase's resource_path handles resolving the path within the app bundle
-            return self._resource_path_func(self.package_name, str(relative_path)) # type: ignore
-        else:
-            # In dev mode, join package root with the relative path.
-            return self.package_root / relative_path
-
-
 # --- Configuration Management ---
 class _SectionProxy:
     """A helper class to represent a section of the .ini file as an object."""

@@ -148,7 +85,7 @@ class ConfigManager:
         """
         path = Path(file_path)
         if path.exists() and not force_overwrite:
-            _LOGGER.warning(f"Configuration file already exists at {path}. Aborting.")
+            _LOGGER.warning(f"⚠️ Configuration file already exists at {path}. Aborting.")
             return
 
         config = configparser.ConfigParser()

@@ -206,7 +143,7 @@ class ConfigManager:
 
         with open(path, 'w') as configfile:
             config.write(configfile)
-        _LOGGER.info(f"Successfully generated config template at: '{path}'")
+        _LOGGER.info(f"📝 Successfully generated config template at: '{path}'")
 
 
 # --- GUI Factory ---

@@ -273,8 +210,8 @@ class GUIFactory:
         self,
         data_dict: Dict[str, Tuple[float, float]],
         is_target: bool = False,
-        layout_mode:
-
+        layout_mode: Literal["grid", "row"] = 'grid',
+        features_per_column: int = 4
     ) -> List[List[sg.Column]]:
         """
         Generates a layout for continuous features or targets.

@@ -283,7 +220,7 @@ class GUIFactory:
             data_dict (dict): Keys are feature names, values are (min, max) tuples.
             is_target (bool): If True, creates disabled inputs for displaying results.
             layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
-
+            features_per_column (int): Number of features per column when `layout_mode` is 'grid'.
 
         Returns:
             A list of lists of sg.Column elements, ready to be used in a window layout.

@@ -294,7 +231,7 @@ class GUIFactory:
 
         columns = []
         for name, (val_min, val_max) in data_dict.items():
-            key =
+            key = name
             default_text = "" if is_target else str(val_max)
 
             label = sg.Text(name, font=label_font, background_color=bg_color, key=f"_text_{name}")

@@ -313,6 +250,7 @@ class GUIFactory:
             range_text = sg.Text(f"Range: {int(val_min)}-{int(val_max)}", font=range_font, background_color=bg_color)
             layout = [[label], [element], [range_text]]
 
+            # each feature is wrapped as a column element
             layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)]) # type: ignore
             columns.append(sg.Column(layout, background_color=bg_color))
 

@@ -320,13 +258,13 @@ class GUIFactory:
             return [columns] # A single row containing all columns
 
         # Default to 'grid' layout
-        return [columns[i:i +
+        return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
 
     def generate_combo_layout(
         self,
         data_dict: Dict[str, List[Any]],
-        layout_mode:
-
+        layout_mode: Literal["grid", "row"] = 'grid',
+        features_per_column: int = 4
     ) -> List[List[sg.Column]]:
         """
         Generates a layout for categorical or binary features using Combo boxes.

@@ -334,7 +272,7 @@ class GUIFactory:
         Args:
             data_dict (dict): Keys are feature names, values are lists of options.
             layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
-
+            features_per_column (int): Number of features per column when `layout_mode` is 'grid'.
 
         Returns:
             A list of lists of sg.Column elements, ready to be used in a window layout.

@@ -352,13 +290,14 @@ class GUIFactory:
             )
             layout = [[label], [element]]
             layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)]) # type: ignore
+            # each feature is wrapped in a Column element
             columns.append(sg.Column(layout, background_color=bg_color))
 
         if layout_mode == 'row':
             return [columns] # A single row containing all columns
 
         # Default to 'grid' layout
-        return [columns[i:i +
+        return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
 
     # --- Window Creation ---
     def create_window(self, title: str, layout: List[List[sg.Element]], **kwargs) -> sg.Window:

@@ -421,8 +360,8 @@ def _default_categorical_processor(feature_name: str, chosen_value: Any) -> List
     return [1.0] if str(chosen_value) == 'True' else [0.0]
 
 def prepare_feature_vector(
-
-
+    window_values: Dict[str, Any],
+    gui_feature_order: List[str],
     continuous_features: List[str],
     categorical_features: List[str],
     categorical_processor: Optional[Callable[[str, Any], List[float]]] = None

@@ -432,8 +371,8 @@ def prepare_feature_vector(
     This function supports label encoding and one-hot encoding via the processor.
 
     Args:
-
-
+        window_values (dict): The values dictionary from a `window.read()` call.
+        gui_feature_order (list): A list of all feature names that have a GUI element.
                                   For one-hot encoding, this should be the name of the
                                   single GUI element (e.g., 'material_type'), not the
                                   expanded feature names (e.g., 'material_is_steel').

@@ -456,8 +395,8 @@ def prepare_feature_vector(
     cont_set = set(continuous_features)
     cat_set = set(categorical_features)
 
-    for name in
-        chosen_value =
+    for name in gui_feature_order:
+        chosen_value = window_values.get(name)
 
         if chosen_value is None or chosen_value == '':
             raise ValueError(f"Feature '{name}' is missing a value.")

@@ -482,13 +421,12 @@ def update_target_fields(window: sg.Window, results_dict: Dict[str, Any]):
 
     Args:
         window (sg.Window): The application's window object.
-        results_dict (dict): A dictionary where keys are target
-                             'TARGET_' prefix) and values are the predicted results.
+        results_dict (dict): A dictionary where keys are target element-keys and values are the predicted results to update.
     """
     for target_name, result in results_dict.items():
         # Format numbers to 2 decimal places, leave other types as-is
         display_value = f"{result:.2f}" if isinstance(result, (int, float)) else result
-        window[
+        window[target_name].update(display_value) # type: ignore
 
 
 def info():
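Both layout generators now chunk their per-feature `sg.Column` elements into grid rows of `features_per_column`. A minimal sketch of that chunking expression, using plain strings in place of the column elements the factory actually builds (the feature names are made up for illustration):

# Sketch of the 'grid' chunking used by generate_continuous_layout / generate_combo_layout.
columns = [f"feature_col_{i}" for i in range(10)]   # stand-ins for sg.Column elements
features_per_column = 4

grid = [columns[i:i + features_per_column]
        for i in range(0, len(columns), features_per_column)]

print(grid)
# [['feature_col_0', 'feature_col_1', 'feature_col_2', 'feature_col_3'],
#  ['feature_col_4', 'feature_col_5', 'feature_col_6', 'feature_col_7'],
#  ['feature_col_8', 'feature_col_9']]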
ml_tools/MICE_imputation.py

@@ -128,7 +128,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         plt.savefig(save_path, bbox_inches='tight', format="svg")
         plt.close()
 
-    _LOGGER.info(f"{dataset_file_dir} completed.")
+    _LOGGER.info(f"✅ {dataset_file_dir} process completed.")
 
 
 # Imputed distributions

@@ -213,7 +213,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)
 
-    _LOGGER.info(f"{local_dir_name} completed.")
+    _LOGGER.info(f"✅ {local_dir_name} completed.")
 
 
 def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
ml_tools/ML_callbacks.py

@@ -178,7 +178,6 @@ class EarlyStopping(Callback):
             self.stopped_epoch = epoch
             self.trainer.stop_training = True # type: ignore
             if self.verbose > 0:
-                print("")
                 _LOGGER.info(f"Epoch {epoch+1}: early stopping after {self.wait} epochs with no improvement.")
 
 

@@ -256,7 +255,6 @@ class ModelCheckpoint(Callback):
         new_filepath = self.save_dir / filename
 
         if self.verbose > 0:
-            print("")
             _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
 
         # Save the new best model

@@ -276,7 +274,6 @@ class ModelCheckpoint(Callback):
         filepath = self.save_dir / filename
 
         if self.verbose > 0:
-            print("")
             _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
         torch.save(self.trainer.model.state_dict(), filepath) # type: ignore
 

@@ -325,7 +322,6 @@ class LRScheduler(Callback):
             if metric_val is not None:
                 self.scheduler.step(metric_val)
             else:
-                print("")
                 _LOGGER.warning(f"LRScheduler could not find metric '{self.monitor}' in logs.")
 
         # For all other schedulers

@@ -335,7 +331,6 @@ class LRScheduler(Callback):
         # Log the change if the LR was updated
         current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
         if current_lr != self.previous_lr:
-            print("")
             _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
             self.previous_lr = current_lr
 
ml_tools/ML_evaluation.py

@@ -65,7 +65,7 @@ def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
         save_dir_path = make_fullpath(save_dir, make=True)
         save_path = save_dir_path / "loss_plot.svg"
         plt.savefig(save_path)
-        _LOGGER.info(f"Loss plot saved as '{save_path.name}'")
+        _LOGGER.info(f"📉 Loss plot saved as '{save_path.name}'")
     else:
         plt.show()
     plt.close(fig)

@@ -92,7 +92,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
         # Save text report
         report_path = save_dir_path / "classification_report.txt"
         report_path.write_text(report, encoding="utf-8")
-        _LOGGER.info(f"Classification report saved as '{report_path.name}'")
+        _LOGGER.info(f"📝 Classification report saved as '{report_path.name}'")
 
         # Save Confusion Matrix
         fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)

@@ -100,7 +100,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
         ax_cm.set_title("Confusion Matrix")
         cm_path = save_dir_path / "confusion_matrix.svg"
         plt.savefig(cm_path)
-        _LOGGER.info(f"Confusion matrix saved as '{cm_path.name}'")
+        _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
         plt.close(fig_cm)
 
         # Save ROC Curve

@@ -117,7 +117,7 @@ def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optio
         ax_roc.grid(True)
         roc_path = save_dir_path / "roc_curve.svg"
         plt.savefig(roc_path)
-        _LOGGER.info(f"ROC curve saved as '{roc_path.name}'")
+        _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
         plt.close(fig_roc)
     else:
         # Show plots if not saving

@@ -162,7 +162,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
         # Save text report
         report_path = save_dir_path / "regression_report.txt"
         report_path.write_text(report_string)
-        _LOGGER.info(f"Regression report saved as '{report_path.name}'")
+        _LOGGER.info(f"📝 Regression report saved as '{report_path.name}'")
 
         # Save residual plot
         residuals = y_true - y_pred

@@ -176,7 +176,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
         plt.tight_layout()
         res_path = save_dir_path / "residual_plot.svg"
         plt.savefig(res_path)
-        _LOGGER.info(f"Residual plot saved as '{res_path.name}'")
+        _LOGGER.info(f"📈 Residual plot saved as '{res_path.name}'")
         plt.close(fig_res)
 
         # Save true vs predicted plot

@@ -190,7 +190,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
         plt.tight_layout()
         tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
         plt.savefig(tvp_path)
-        _LOGGER.info(f"True vs. Predicted plot saved as '{tvp_path.name}'")
+        _LOGGER.info(f"📉 True vs. Predicted plot saved as '{tvp_path.name}'")
         plt.close(fig_tvp)
 
 

@@ -227,7 +227,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
         plt.title("SHAP Feature Importance")
         plt.tight_layout()
         plt.savefig(bar_path)
-        _LOGGER.info(f"SHAP bar plot saved as '{bar_path.name}'")
+        _LOGGER.info(f"📊 SHAP bar plot saved as '{bar_path.name}'")
         plt.close()
 
         # Save Dot Plot

@@ -236,7 +236,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
         plt.title("SHAP Feature Importance")
         plt.tight_layout()
         plt.savefig(dot_path)
-        _LOGGER.info(f"SHAP dot plot saved as '{dot_path.name}'")
+        _LOGGER.info(f"📊 SHAP dot plot saved as '{dot_path.name}'")
         plt.close()
 
         # Save Summary Data to CSV

@@ -249,7 +249,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
             'mean_abs_shap_value': mean_abs_shap
         }).sort_values('mean_abs_shap_value', ascending=False)
         summary_df.to_csv(summary_path, index=False)
-        _LOGGER.info(f"SHAP summary data saved as '{summary_path.name}'")
+        _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
     else:
         _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
         shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
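As the hunks above show, these evaluation helpers write their reports and `.svg` plots into `save_dir` when one is given, and otherwise display the figures. A minimal sketch for the regression case; the import path and output directory are assumptions for illustration:

import numpy as np
from ml_tools.ML_evaluation import regression_metrics  # assumed import path

# Hypothetical predictions; with save_dir set, regression_report.txt,
# residual_plot.svg and true_vs_predicted_plot.svg are written there.
y_true = np.array([3.1, 2.4, 5.0, 4.2])
y_pred = np.array([2.9, 2.6, 4.7, 4.5])
regression_metrics(y_true, y_pred, save_dir="evaluation_output")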
ml_tools/ML_trainer.py

@@ -72,10 +72,10 @@ class MyTrainer:
         """Validates the selected device and returns a torch.device object."""
         device_lower = device.lower()
         if "cuda" in device_lower and not torch.cuda.is_available():
-            _LOGGER.warning("CUDA not available, switching to CPU.")
+            _LOGGER.warning("⚠️ CUDA not available, switching to CPU.")
             device = "cpu"
         elif device_lower == "mps" and not torch.backends.mps.is_available():
-            _LOGGER.warning("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
+            _LOGGER.warning("⚠️ Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
             device = "cpu"
         return torch.device(device)
 
ml_tools/PSO_optimization.py

@@ -22,7 +22,6 @@ import torch
 from tqdm import trange
 import matplotlib.pyplot as plt
 import seaborn as sns
-from collections import defaultdict
 from .logger import _LOGGER
 
 

@@ -307,7 +306,7 @@ def run_pso(lower_boundaries: list[float],
     else:
         device = torch.device("cpu")
 
-    _LOGGER.info(f"Using device: '{device}'")
+    _LOGGER.info(f"👾 Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)

@@ -511,13 +510,13 @@ def _pso(func: ObjectiveFunction,
     return best_position, best_score
 
 
-def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]
+def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
     """
     Analyzes optimization results and plots the distribution of optimal values for each feature.
 
-
-
-
+    For features with more than two unique values, this function generates a color-coded
+    Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+    showing relative frequency.
 
     Parameters
     ----------

@@ -525,76 +524,69 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         The path to the directory containing the optimization result CSV files.
     save_dir : str or Path
         The directory where the output plots will be saved.
-    color_by_target : bool, optional
-        If True, generates comparative plots with distributions colored by their source target.
     """
-
-    _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
-
-    # Check results_dir
+    # Check results_dir and create output path
     results_path = make_fullpath(results_dir)
-    # make output path
     output_path = make_fullpath(save_dir, make=True)
 
     all_csvs = list_csv_paths(results_path)
-
     if not all_csvs:
-        _LOGGER.warning("No data found. No plots will be generated.")
+        _LOGGER.warning("⚠️ No data found. No plots will be generated.")
         return
 
-    # ---
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # --- Data Loading and Preparation ---
+    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+    data_to_plot = []
+    for df, df_name in yield_dataframes_from_dir(results_path):
+        melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+        melted_df['target'] = df_name.replace("Optimization_", "")
+        data_to_plot.append(melted_df)
+
+    long_df = pd.concat(data_to_plot, ignore_index=True)
+    features = long_df['feature'].unique()
+    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+    # --- Plotting Loop ---
+    for feature_name in features:
+        plt.figure(figsize=(12, 7))
+        feature_df = long_df[long_df['feature'] == feature_name]
+
+        # Check if the feature is binary or constant
+        if feature_df['value'].nunique() <= 2:
+            # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+            # This ensures the X-axis is clean (e.g., just 0 and 1).
+            norm_df = (feature_df.groupby('target')['value']
+                       .value_counts(normalize=True)
+                       .mul(100)
+                       .rename('percent')
+                       .reset_index())
 
-            sns.
+            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
 
-            plt.title(f"
-            plt.
-
-
-
-
-
-
-
-            plt.
-
-
-
-
-
-        feature_columns = df.iloc[:, :-1]
-        for feature_name in feature_columns:
-            feature_distributions[feature_name].extend(df[feature_name].tolist())
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Frequency (%)", fontsize=12)
+            ax.set_ylim(0, 100) # Set Y-axis from 0 to 100
+
+        else:
+            # PLOT 2: KDE plot for continuous values.
+            ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+                             fill=True, alpha=0.1, warn_singular=False)
+
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Density", fontsize=12) # Y-axis is "Density" for KDE plots
+
+        # --- Common settings for both plot types ---
+        plt.xlabel("Feature Value", fontsize=12)
+        plt.grid(axis='y', alpha=0.5, linestyle='--')
 
-
-
-
-
-
-
-
-
-        plt.grid(axis='y', alpha=0.5, linestyle='--')
-
-        sanitized_feature_name = sanitize_filename(feature_name)
-        plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
-        plt.savefig(plot_filename, bbox_inches='tight')
-        plt.close()
+        legend = ax.get_legend()
+        if legend:
+            legend.set_title('Target')
+
+        sanitized_feature_name = sanitize_filename(feature_name)
+        plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+        plt.savefig(plot_filename, bbox_inches='tight')
+        plt.close()
 
     _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
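A minimal usage sketch of the rewritten plotting helper; the directory names are hypothetical, and the function expects the `Optimization_<target>.csv` files produced by the PSO run in `results_dir`:

from ml_tools.PSO_optimization import plot_optimal_feature_distributions  # assumed import path

# One Distribution_<feature>.svg per feature is written to save_dir:
# a KDE plot for continuous features, a percentage bar plot for binary/constant ones.
plot_optimal_feature_distributions(
    results_dir="pso_results",
    save_dir="pso_results/distribution_plots",
)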
ml_tools/VIF_factor.py

@@ -168,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
 
     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-    _LOGGER.info(f"
+    _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
 
     result_df = df.drop(columns=to_drop)
 
     if result_df.empty:
-        _LOGGER.warning(f"
+        _LOGGER.warning(f"⚠️ All columns were dropped.")
 
     return result_df, to_drop
 
ml_tools/data_exploration.py

@@ -100,10 +100,11 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
-    if
-        print(f"Dropped {len(dropped_columns)} constant columns
-
-
+    if verbose:
+        print(f"🧹 Dropped {len(dropped_columns)} constant columns.")
+        if dropped_columns:
+            for dropped_column in dropped_columns:
+                print(f"  {dropped_column}")
 
     return df[cols_to_keep]
 
ml_tools/datasetmaster.py

@@ -13,7 +13,7 @@ from torchvision.datasets import ImageFolder
 from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
-from .utilities import _script_info
+from .utilities import _script_info, make_fullpath
 from .logger import _LOGGER
 
 

@@ -204,7 +204,7 @@ class DatasetMaker(_BaseMaker):
         if not self._is_split:
             raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
         if self._is_normalized:
-            _LOGGER.warning("Data has already been normalized.")
+            _LOGGER.warning("⚠️ Data has already been normalized.")
             return self
 
         # Use continuous features columns

@@ -232,7 +232,7 @@ class DatasetMaker(_BaseMaker):
     def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
         """Splits the data into training and testing sets."""
         if self._is_split:
-            _LOGGER.warning("Data has already been split.")
+            _LOGGER.warning("⚠️ Data has already been split.")
             return self
 
         if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':

@@ -260,9 +260,9 @@ class DatasetMaker(_BaseMaker):
             Defaults to `SMOTETomek`.
         """
         if not self._is_split:
-            raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
+            raise RuntimeError("❌ Cannot balance data before it has been split. Call .split_data() first.")
         if self._is_balanced:
-            _LOGGER.warning("Training data has already been balanced.")
+            _LOGGER.warning("⚠️ Training data has already been balanced.")
             return self
 
         if resampler is None:

@@ -278,13 +278,13 @@ class DatasetMaker(_BaseMaker):
     def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
                 balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
         """Runs a standard, fully automated preprocessing pipeline."""
-        _LOGGER.info("--- Running Automated Processing Pipeline ---")
+        _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
         self.process_categoricals(method=cat_method)
         self.split_data(test_size=test_size, stratify=True, random_state=random_state)
         self.normalize_continuous(method=normalize_method)
         if balance:
             self.balance_data()
-        _LOGGER.info("--- Automated Processing Complete ---")
+        _LOGGER.info("--- 🤖 Automated Processing Complete ---")
         return self
 
     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:

@@ -400,10 +400,7 @@ class VisionDatasetMaker(_BaseMaker):
         Logs a report of the types, sizes, and channels of image files
         found in the directory and its subdirectories.
         """
-        path_obj =
-        if not path_obj.is_dir():
-            _LOGGER.error(f"Path is not a valid directory: {path_obj}")
-            return
+        path_obj = make_fullpath(path)
 
         non_image_files = set()
         img_types = set()

@@ -505,7 +502,7 @@ class VisionDatasetMaker(_BaseMaker):
         if not self._is_split:
             raise RuntimeError("Data has not been split. Call .split_data() first.")
         if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
+            _LOGGER.warning("⚠️ Transforms have not been configured. Using default ToTensor only.")
 
         if self._test_dataset:
             return self._train_dataset, self._val_dataset, self._test_dataset

@@ -555,7 +552,7 @@ class SequenceMaker(_BaseMaker):
             raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")
 
         if self.scaler:
-            _LOGGER.warning("Data has already been normalized.")
+            _LOGGER.warning("⚠️ Data has already been normalized.")
             return self
 
         if method == "standard":

@@ -579,7 +576,7 @@ class SequenceMaker(_BaseMaker):
     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
         """Splits the sequence into training and testing portions."""
         if self._is_split:
-            _LOGGER.warning("Data has already been split.")
+            _LOGGER.warning("⚠️ Data has already been split.")
             return self
 
         split_idx = int(len(self.sequence) * (1 - test_size))
ml_tools/ensemble_learning.py

@@ -915,7 +915,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
     datasets_path = make_fullpath(datasets_dir)
     save_path = make_fullpath(save_dir, make=True)
 
-    _LOGGER.info("Training starting...")
+    _LOGGER.info("🏁 Training starting...")
     #Yield imputed dataset
     for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe

@@ -933,7 +933,7 @@ def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Pat
                       test_features=X_test, test_target=y_test,
                       feature_names=feature_names,target_name=target_name,
                       debug=debug, save_dir=save_path, save_model=save_model)
-
+
     _LOGGER.info("✅ Training and evaluation complete.")
 
 
ml_tools/logger.py

@@ -10,7 +10,6 @@ import logging
 import sys
 
 
-
 __all__ = [
     "custom_logger"
 ]

@@ -85,10 +84,10 @@ def custom_logger(
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
 
-        _LOGGER.info(f"Log saved to: '{base_path}'")
+        _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")
 
     except Exception as e:
-        _LOGGER.error(f"Log not saved: {e}")
+        _LOGGER.error(f"❌ Log not saved: {e}")
 
 
 def _log_list_to_txt(data: List[Any], path: Path) -> None:

@@ -176,7 +175,7 @@ def _get_logger(name: str = "ml_tools", level: int = logging.INFO):
     handler = logging.StreamHandler(sys.stdout)
 
     # Define the format string and the date format separately
-    log_format = '
+    log_format = '\n🐉%(asctime)s - %(name)s - %(levelname)s - %(message)s'
     date_format = '%Y-%m-%d %H:%M' # Format: Year-Month-Day Hour:Minute
 
     # Pass both the format and the date format to the Formatter
ml_tools/utilities.py

@@ -4,9 +4,10 @@ import pandas as pd
 import polars as pl
 from pathlib import Path
 import re
-from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, Callable, List, Dict
 import joblib
 from joblib.externals.loky.process_executor import TerminatedWorkerError
+from pprint import pprint
 
 
 # Keep track of available tools

@@ -25,7 +26,8 @@ __all__ = [
     "serialize_object",
     "deserialize_object",
     "distribute_datasets_by_target",
-    "train_dataset_orchestrator"
+    "train_dataset_orchestrator",
+    "PathManager"
 ]
 
 

@@ -640,12 +642,214 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
             print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
             continue
 
-    print(f"{total_saved} single-target datasets were created.")
+    print(f"\n✅ {total_saved} single-target datasets were created.")
+
+
+### Path Manager
+class PathManager:
+    """
+    Manages and stores a project's file paths, acting as a centralized
+    "path database". It supports both development mode and applications
+    bundled with Briefcase.
+
+    Supports python dictionary syntax.
+    """
+    def __init__(
+        self,
+        anchor_file: str,
+        base_directories: Optional[List[str]] = None
+    ):
+        """
+        The initializer determines the project's root directory and can pre-register
+        a list of base directories relative to that root.
+
+        Args:
+            anchor_file (str): The absolute path to a file whose parent directory will be considered the package root and name. Typically, `__file__`.
+            base_directories (Optional[List[str]]): A list of directory names
+                                                    located at the same level as the anchor file's
+                                                    parent directory to register immediately.
+        """
+        resolved_anchor_path = Path(anchor_file).resolve()
+        self._package_name = resolved_anchor_path.parent.name
+        self._is_bundled, self._resource_path_func = self._check_bundle_status()
+        self._paths: Dict[str, Path] = {}
+
+        if self._is_bundled:
+            # In a bundle, resource_path gives the absolute path to the 'app_packages' dir
+            # when given the package name.
+            package_root = self._resource_path_func(self._package_name) # type: ignore
+        else:
+            # In dev mode, the package root is the directory containing the anchor file.
+            package_root = resolved_anchor_path.parent
+
+        # Register the root of the package itself
+        self._paths["ROOT"] = package_root
+
+        # Register all the base directories
+        if base_directories:
+            for dir_name in base_directories:
+                # In dev mode, this is simple. In a bundle, we must resolve
+                # each path from the package root.
+                if self._is_bundled:
+                    self._paths[dir_name] = self._resource_path_func(self._package_name, dir_name) # type: ignore
+                else:
+                    self._paths[dir_name] = package_root / dir_name
+
+    # A helper function to find the briefcase-injected resource function
+    def _check_bundle_status(self) -> tuple[bool, Optional[Callable]]:
+        """Checks if the app is running in a Briefcase bundle."""
+        try:
+            # This function is injected by Briefcase into the global scope
+            from briefcase.platforms.base import resource_path # type: ignore
+            return True, resource_path
+        except (ImportError, NameError):
+            return False, None
+
+    def get(self, key: str) -> Path:
+        """
+        Retrieves a stored path by its key.
+
+        Args:
+            key (str): The key of the path to retrieve.
+
+        Returns:
+            Path: The resolved, absolute Path object.
+
+        Raises:
+            KeyError: If the key is not found in the manager.
+        """
+        try:
+            return self._paths[key]
+        except KeyError:
+            print(f"❌ Path key '{key}' not found.")
+            # Consider suggesting close matches if you want to get fancy
+            raise
+
+    def update(self, new_paths: Dict[str, Union[str, Path]], overwrite: bool = False) -> None:
+        """
+        Adds new paths or overwrites existing ones in the manager.
+
+        Args:
+            new_paths (Dict[str, Union[str, Path]]): A dictionary where keys are
+                                                     the identifiers and values are the
+                                                     Path objects or strings to store.
+            overwrite (bool): If False (default), raises a KeyError if any
+                              key in new_paths already exists. If True,
+                              allows overwriting existing keys.
+        """
+        if not overwrite:
+            for key in new_paths:
+                if key in self._paths:
+                    raise KeyError(
+                        f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+                    )
+
+        # Resolve any string paths to Path objects before storing
+        resolved_new_paths = {k: Path(v) for k, v in new_paths.items()}
+        self._paths.update(resolved_new_paths)
+
+    def make_dirs(self, keys: Optional[List[str]] = None, verbose: bool = False) -> None:
+        """
+        Creates directory structures for registered paths in writable locations.
+
+        This method identifies paths that are directories (no file suffix) and creates them on the filesystem.
+
+        In a bundled application, this method will NOT attempt to create directories inside the read-only app package, preventing crashes. It
+        will only operate on paths outside of the package (e.g., user data dirs).
+
+        Args:
+            keys (Optional[List[str]]): If provided, only the directories
+                                        corresponding to these keys will be
+                                        created. If None (default), all
+                                        registered directory paths are used.
+            verbose (bool): If True, prints a message for each action.
+        """
+        path_items = []
+        if keys:
+            for key in keys:
+                if key in self._paths:
+                    path_items.append((key, self._paths[key]))
+                elif verbose:
+                    print(f"⚠️ Key '{key}' not found in PathManager, skipping.")
+        else:
+            path_items = self._paths.items()
+
+        # Get the package root to check against.
+        package_root = self._paths.get("ROOT")
+
+        for key, path in path_items:
+            if path.suffix: # It's a file, not a directory
+                continue
+
+            # --- THE CRITICAL CHECK ---
+            # Determine if the path is inside the main application package.
+            is_internal_path = package_root and path.is_relative_to(package_root)
+
+            if self._is_bundled and is_internal_path:
+                if verbose:
+                    print(f"ℹ️ Skipping internal directory '{key}' in bundled app (read-only).")
+                continue
+            # -------------------------
+
+            if verbose:
+                print(f"📁 Ensuring directory exists for key '{key}': {path}")
+
+            path.mkdir(parents=True, exist_ok=True)
+
+    def status(self) -> None:
+        """
+        Checks the status of all registered paths on the filesystem and prints a formatted report.
+        """
+        report = {}
+        for key, path in self.items():
+            if path.is_dir():
+                report[key] = "📁 Directory"
+            elif path.is_file():
+                report[key] = "📄 File"
+            else:
+                report[key] = "❌ Not Found"
+
+        print("\n--- Path Status Report ---")
+        pprint(report)
+
+    def __repr__(self) -> str:
+        """Provides a string representation of the stored paths."""
+        path_list = "\n".join(f"  '{k}': '{v}'" for k, v in self._paths.items())
+        return f"PathManager(\n{path_list}\n)"
+
+    # --- Dictionary-Style Methods ---
+    def __getitem__(self, key: str) -> Path:
+        """Allows dictionary-style getting, e.g., PM['my_key']"""
+        return self.get(key)
+
+    def __setitem__(self, key: str, value: Union[str, Path]):
+        """Allows dictionary-style setting, e.g., PM['my_key'] = path"""
+        self.update({key: value}, overwrite=True)
+
+    def __contains__(self, key: str) -> bool:
+        """Allows checking for a key's existence, e.g., if 'my_key' in PM"""
+        return key in self._paths
+
+    def __len__(self) -> int:
+        """Allows getting the number of paths, e.g., len(PM)"""
+        return len(self._paths)
+
+    def keys(self):
+        """Returns all registered path keys."""
+        return self._paths.keys()
+
+    def values(self):
+        """Returns all registered Path objects."""
+        return self._paths.values()
+
+    def items(self):
+        """Returns all registered (key, Path) pairs."""
+        return self._paths.items()
 
 
 class LogKeys:
     """
-    Used for ML scripts
+    Used internally for ML scripts.
 
     Centralized keys for logging and history.
     """
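A minimal usage sketch of the relocated `PathManager`, e.g. from a hypothetical `my_app/paths.py`; the directory names are made up, and `make_dirs` skips package-internal paths when the app runs from a read-only Briefcase bundle:

from ml_tools.utilities import PathManager  # assumed import path

PM = PathManager(anchor_file=__file__, base_directories=["helpers", "images"])
PM.update({"output": PM["ROOT"] / "output"})   # dict-style access via __getitem__
PM.make_dirs(keys=["output"], verbose=True)
PM.status()                                    # prints a 📁 / 📄 / ❌ report per key
icon_path = PM["images"] / "icon.png"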