dragon-ml-toolbox 3.11.0__tar.gz → 3.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-3.11.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.12.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/GUI_tools.py +147 -17
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/MICE_imputation.py +6 -6
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/PSO_optimization.py +2 -4
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/VIF_factor.py +15 -13
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/data_exploration.py +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/handle_excel.py +9 -8
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/path_manager.py +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/utilities.py +73 -41
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/LICENSE +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/README.md +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-3.11.0 → dragon_ml_toolbox-3.12.1}/setup.cfg +0 -0

ml_tools/GUI_tools.py (+147 -17)

@@ -3,7 +3,7 @@ from pathlib import Path
 import traceback
 import FreeSimpleGUI as sg
 from functools import wraps
-from typing import Any, Dict, Tuple, List, Literal, Union,
+from typing import Any, Dict, Tuple, List, Literal, Union, Optional, Callable
 from .utilities import _script_info
 import numpy as np
 from .logger import _LOGGER
@@ -104,11 +104,13 @@ class ConfigManager:
             'max_size': ''
         }
         config['Layout'] = {
-            '; Default size for continuous input boxes (width,height in characters).': '',
+            '; Default size for continuous input boxes (width,height in characters/rows).': '',
             'input_size_cont': '16,1',
-            '; Default size for combo/binary boxes (width,height in characters).': '',
+            '; Default size for combo/binary boxes (width,height in characters/rows).': '',
             'input_size_binary': '14,1',
-            ';
+            '; Size for multiselect listboxes (width,height in characters/rows).': '',
+            'input_size_multi': '14,4',
+            '; Default size for buttons (width,height in characters/rows).': '',
             'button_size': '15,2'
         }
         config['Fonts'] = {
@@ -303,6 +305,57 @@ class GUIFactory:
 
         # Default to 'grid' layout
         return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
+
+    def generate_multiselect_layout(
+            self,
+            data_dict: Dict[str, Union[List[Any], Tuple[Any, ...]]],
+            layout_mode: Literal["grid", "row"] = 'grid',
+            features_per_column: int = 4
+    ) -> List[List[sg.Column]]:
+        """
+        Generates a layout for features using Listbox elements for multiple selections.
+
+        This allows the user to select zero or more options from a list without
+        being able to input custom text.
+
+        Args:
+            data_dict (dict): Keys are feature names, values are lists of options.
+            layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
+            features_per_column (int): Number of features per column when `layout_mode` is 'grid'.
+
+        Returns:
+            A list of lists of sg.Column elements, ready to be used in a window layout.
+        """
+        cfg = self.config
+        bg_color = sg.theme_background_color()
+        label_font = (cfg.fonts.font_family, cfg.fonts.label_size, cfg.fonts.label_style) # type: ignore
+
+        columns = []
+        for name, values in data_dict.items():
+            label = sg.Text(name, font=label_font, background_color=bg_color, key=f"_text_{name}")
+
+            # Use sg.Listbox for multiple selections.
+            element = sg.Listbox(
+                values,
+                key=name,
+                select_mode=sg.LISTBOX_SELECT_MODE_MULTIPLE,
+                size=cfg.layout.input_size_multi, # type: ignore
+                no_scrollbar=False
+            )
+            # -------------------
+
+            layout = [[label], [element]]
+            # Add a small spacer for consistent vertical alignment.
+            layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)]) # type: ignore
+
+            # Each feature is wrapped in a Column element for proper alignment.
+            columns.append(sg.Column(layout, background_color=bg_color))
+
+        if layout_mode == 'row':
+            return [columns] # A single row containing all columns
+
+        # Default to 'grid' layout
+        return [columns[i:i + features_per_column] for i in range(0, len(columns), features_per_column)]
 
     # --- Window Creation ---
     def create_window(self, title: str, layout: List[List[sg.Element]], **kwargs) -> sg.Window:
@@ -384,6 +437,7 @@ class FeatureMaster:
                  targets: Dict[str, str],
                  continuous_features: Optional[Dict[str, Tuple[str, float, float]]] = None,
                  binary_features: Optional[Dict[str, str]] = None,
+                 multi_binary_features: Optional[Dict[str, Dict[str, str]]] = None,
                  one_hot_features: Optional[Dict[str, Dict[str, str]]] = None,
                  categorical_features: Optional[List[Tuple[str, str, Dict[str, int]]]] = None) -> None:
         """
@@ -410,6 +464,14 @@ class FeatureMaster:
             A dictionary for binary (True/False) features.
             - **key** (str): The name to be displayed in the GUI (e.g., for a checkbox).
             - **value** (str): The model's internal feature name.
+
+        multi_binary_features (Dict[str, Dict[str, str]]):
+            A dictionary for features where multiple binary-like options can be
+            selected at once (e.g., from a multi-select listbox).
+            - **key** (str): The name for the group to be displayed in the GUI.
+            - **value** (Dict[str, str]): A nested dictionary where:
+                - key (str): The user-selectable option.
+                - value (str): The corresponding model's internal feature name.
 
         one_hot_features (Dict[str, Dict[str, str]]):
             A dictionary for features that will be one-hot encoded from a single
@@ -418,8 +480,7 @@ class FeatureMaster:
             for a dropdown menu).
             - **value** (Dict[str, str]): A nested dictionary where:
                 - key (str): The user-selectable option (e.g., 'Category A').
-                - value (str): The corresponding model column name
-                  set to 1.
+                - value (str): The corresponding model column name.
 
         categorical_features (List[Tuple[str, str, Dict[str, int]]]):
             A list for ordinal or label-encoded categorical features.
@@ -431,7 +492,7 @@ class FeatureMaster:
             options to their corresponding integer values.
         """
         # Validation
-        if continuous_features is None and binary_features is None and one_hot_features is None and categorical_features is None:
+        if continuous_features is None and binary_features is None and one_hot_features is None and categorical_features is None and multi_binary_features is None:
             raise ValueError("No features provided.")
 
         # Targets
@@ -454,6 +515,15 @@ class FeatureMaster:
         else:
             self._binary_values, self._binary_mapping = None, None
             self.has_binary = False
+
+        # multi-binary features
+        if multi_binary_features is not None:
+            self._multi_binary_values = self._handle_multi_binary_features(multi_binary_features)
+            self._multi_binary_mapping = multi_binary_features
+            self.has_multi_binary = True
+        else:
+            self._multi_binary_values, self._multi_binary_mapping = None, None
+            self.has_multi_binary = False
 
         # one-hot features
         if one_hot_features is not None:
@@ -493,6 +563,14 @@ class FeatureMaster:
         gui_values: dict[str, tuple[Literal["False"],Literal["True"]]] = {gui_key: ("False", "True") for gui_key in binary_features.keys()}
         # Map GUI name to Model name (same as input)
         return gui_values
+
+    def _handle_multi_binary_features(self, multi_binary_features: Dict[str, Dict[str, str]]):
+        # Make dictionary GUI name: range values
+        gui_values: dict[str, tuple[str,...]] = {
+            gui_key: tuple(nested_dict.keys())
+            for gui_key, nested_dict in multi_binary_features.items()}
+        # Map GUI name to Model name and preserve internal mapping (same as input)
+        return gui_values
 
     def _handle_one_hot_features(self, one_hot_features: Dict[str, Dict[str,str]]):
         # Make dictionary GUI name: range values
@@ -514,6 +592,8 @@ class FeatureMaster:
             all_dict.update(self._continuous_mapping)
         if self._binary_mapping is not None:
             all_dict.update(self._binary_mapping)
+        if self._multi_binary_mapping is not None:
+            all_dict.update(self._multi_binary_mapping)
         if self._one_hot_mapping is not None:
             all_dict.update(self._one_hot_mapping)
         if self._categorical_mapping is not None:
@@ -595,6 +675,28 @@ class FeatureMaster:
         """
         if self._binary_values is not None:
             return self._binary_values
+
+    @property
+    def multi_binary(self):
+        """
+        The mapping for multi-binary features.
+
+        Structure:
+        {"GUI NAME": {"GUI OPTION 1": "model_column"}}
+        """
+        if self._multi_binary_mapping is not None:
+            return self._multi_binary_mapping
+
+    @property
+    def multi_binary_gui(self):
+        """
+        The GUI options for multi-binary feature groups.
+
+        Structure:
+        Dict[str, Tuple[str, ...]]
+        """
+        if self._multi_binary_values is not None:
+            return self._multi_binary_values
 
     @property
     def one_hot(self):
@@ -697,7 +799,7 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to float.
         """
         try:
-            model_name = self.master.continuous[gui_feature]
+            model_name = self.master.continuous[gui_feature] # type: ignore
            float_value = float(chosen_value)
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as continuous.")
@@ -713,8 +815,8 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to binary (0,1).
         """
         try:
-            model_name = self.master.binary[gui_feature]
-            binary_mapping_keys = self.master.binary_gui[gui_feature]
+            model_name = self.master.binary[gui_feature] # type: ignore
+            binary_mapping_keys = self.master.binary_gui[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as binary.")
             raise e
@@ -725,13 +827,36 @@ class GUIHandler:
             }
             result = mapping_dict[chosen_value]
             return model_name, result
+
+    def _process_multi_binary(self, gui_feature: str, chosen_values: list[str]) -> dict[str, int]:
+        """
+        Maps GUI names to model expected names and casts values to multi-binary encoding.
+
+        For a given feature group, this sets all selected options to 1 and all
+        unselected options to 0.
+        """
+        try:
+            # Get the mapping for the group
+            multi_binary_mapping = self.master.multi_binary[gui_feature] # type: ignore
+        except KeyError as e:
+            _LOGGER.error(f"No matching name for '{gui_feature}' defined as multi-binary.")
+            raise e
+        else:
+            # Start with all possible features for this group set to 0 (unselected)
+            results = {model_key: 0 for model_key in multi_binary_mapping.values()}
+            # Set the features for the chosen options to 1
+            for chosen_option in chosen_values:
+                model_name = multi_binary_mapping[chosen_option]
+                results[model_name] = 1
+
+            return results
 
     def _process_one_hot(self, gui_feature: str, chosen_value: str) -> Dict[str,int]:
         """
         Maps GUI names to model expected names and casts values to one-hot encoding.
         """
         try:
-            one_hot_mapping = self.master.one_hot[gui_feature]
+            one_hot_mapping = self.master.one_hot[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as one-hot.")
             raise e
@@ -748,7 +873,7 @@ class GUIHandler:
         Maps GUI name to model expected name and casts the value to a categorical number.
         """
         try:
-            categorical_tuple = self.master.categorical[gui_feature]
+            categorical_tuple = self.master.categorical[gui_feature] # type: ignore
         except KeyError as e:
             _LOGGER.error(f"No matching name for '{gui_feature}' defined as categorical.")
             raise e
@@ -804,25 +929,31 @@ class GUIHandler:
 
         if self.master.has_continuous:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.continuous,
+                                                     master_feature=self.master.continuous, # type: ignore
                                                      processor=self._process_continuous)
             processed_features.update(processed_subset)
 
         if self.master.has_binary:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.binary,
+                                                     master_feature=self.master.binary, # type: ignore
                                                      processor=self._process_binary)
             processed_features.update(processed_subset)
+
+        if self.master.has_multi_binary:
+            processed_subset = self._call_subprocess(window_values=window_values,
+                                                     master_feature=self.master.multi_binary, # type: ignore
+                                                     processor=self._process_multi_binary)
+            processed_features.update(processed_subset)
 
         if self.master.has_one_hot:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.one_hot,
+                                                     master_feature=self.master.one_hot, # type: ignore
                                                      processor=self._process_one_hot)
             processed_features.update(processed_subset)
 
         if self.master.has_categorical:
             processed_subset = self._call_subprocess(window_values=window_values,
-                                                     master_feature=self.master.categorical,
+                                                     master_feature=self.master.categorical, # type: ignore
                                                      processor=self._process_categorical)
             processed_features.update(processed_subset)
 
@@ -836,7 +967,6 @@ class GUIHandler:
             raise RuntimeError(f"Configuration Error: Implemented methods failed to generate the required model feature: '{e}'")
 
         return np.array(final_vector, dtype=np.float32)
-
 
 def info():
     _script_info(__all__)
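
The core of the new multi-binary support is the encoding rule in _process_multi_binary: every model column in a group starts at 0, and each option the user selects in the Listbox flips its column to 1. A minimal self-contained sketch of that rule (the option and column names here are hypothetical, not from the package):

    # GUI option -> model column, as passed via multi_binary_features.
    multi_binary_mapping = {
        "Additive A": "additive_a",
        "Additive B": "additive_b",
        "Additive C": "additive_c",
    }
    chosen_values = ["Additive A", "Additive C"]  # what a Listbox selection returns

    # Start with every column unselected (0), then set the chosen ones to 1.
    results = {model_key: 0 for model_key in multi_binary_mapping.values()}
    for chosen_option in chosen_values:
        results[multi_binary_mapping[chosen_option]] = 1

    print(results)  # {'additive_a': 1, 'additive_b': 0, 'additive_c': 1}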

ml_tools/MICE_imputation.py (+6 -6)

@@ -35,7 +35,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
 
     if imputed_datasets is None or len(imputed_datasets) == 0:
-        raise ValueError("No imputed datasets were generated. Check the MICE process.")
+        raise ValueError("❌ No imputed datasets were generated. Check the MICE process.")
 
     # threshold binary columns
     if binary_columns is not None:
@@ -56,8 +56,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
 
     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
-        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
+        assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}" # type: ignore
+        assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
     # print("✅ All imputed datasets match the original DataFrame indexes.")
 
     return kernel, imputed_datasets, imputed_dataset_names
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     dataset_count = kernel.num_datasets
 
     if dataset_count != len(imputed_dataset_names):
-        raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+        raise ValueError(f"❌ Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
 
     # Check path
     root_path = make_fullpath(root_dir, make=True)
@@ -152,7 +152,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     """Helper function to add labels and legends to a figure"""
 
     if not isinstance(fig, ggplot):
-        raise TypeError("Expected a plotnine.ggplot object")
+        raise TypeError("❌ Expected a plotnine.ggplot object")
 
     # Edit labels and title
     fig = fig + theme(
@@ -166,7 +166,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     fig = fig.draw()
 
     if not hasattr(fig, 'axes') or len(fig.axes) == 0:
-        raise RuntimeError("Rendered figure has no axes to modify")
+        raise RuntimeError("❌ Rendered figure has no axes to modify")
 
     if filename == "Combined_Distributions":
         custom_xlabel = "Feature Values"

ml_tools/PSO_optimization.py (+2 -4)

@@ -530,10 +530,8 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     results_path = make_fullpath(results_dir)
     output_path = make_fullpath(save_dir, make=True)
 
-
-
-    _LOGGER.warning("⚠️ No data found. No plots will be generated.")
-    return
+    # Check that the directory contains csv files
+    list_csv_paths(results_path, verbose=False)
 
     # --- Data Loading and Preparation ---
     _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
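
The removed code warned and returned unconditionally, which made the function a no-op; the fix instead delegates the check to list_csv_paths, which raises as soon as the directory holds no CSV files. A sketch of why that single call works as a guard, mirroring the list_csv_paths behavior shown later in this diff:

    from pathlib import Path

    # Equivalent stand-alone guard, assuming list_csv_paths raises IOError
    # for a directory that contains no CSV files.
    def assert_contains_csv(directory: Path) -> None:
        if not list(directory.glob("*.csv")):
            raise IOError(f"❌ No CSV files found in directory: {directory.name}")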

ml_tools/VIF_factor.py (+15 -13)

@@ -26,8 +26,7 @@ def compute_vif(
     save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
-    show_plot: bool = True
-    verbose: bool = True
+    show_plot: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -54,21 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features
+        if missing_features:
             _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
-                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore
-            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+        if missing_ignore:
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -139,7 +137,7 @@ def compute_vif(
             filename += ".svg"
         full_save_path = save_path / filename
         plt.savefig(full_save_path, format='svg', bbox_inches='tight')
-
+        _LOGGER.info(f"✅ Saved VIF plot: '{filename}'")
 
     if show_plot:
         plt.show()
@@ -164,11 +162,16 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
-        raise ValueError("
+        raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")
 
     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    if len(to_drop) > 0:
+        _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}:")
+        for dropped_column in to_drop:
+            print(f"\t{dropped_column}")
+    else:
+        _LOGGER.info(f"No columns exceed the VIF threshold of '{threshold}'.")
 
     result_df = df.drop(columns=to_drop)
 
@@ -186,7 +189,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
     max_features_to_plot: int = 20,
     fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
 
     Args:
@@ -216,8 +219,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
                 fontsize=fontsize,
                 save_dir=output_plot_directory,
                 filename=df_name,
-                show_plot=False
-                verbose=False)
+                show_plot=False)
 
     if output_dataset_path is not None:
         new_filename = df_name + '_VIF'
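
The new logging in drop_vif_based reports exactly the columns selected by the existing rule: VIF strictly greater than the threshold. A worked example with made-up data:

    import pandas as pd

    # Hypothetical vif_df with the 'feature'/'VIF' columns drop_vif_based expects.
    vif_df = pd.DataFrame({"feature": ["x1", "x2", "x3"], "VIF": [3.2, 25.7, 10.4]})
    df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "x3": [5, 6]})

    threshold = 10.0
    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()  # ['x2', 'x3']
    result_df = df.drop(columns=to_drop)  # only 'x1' remains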

ml_tools/data_exploration.py (+1 -1)

@@ -143,7 +143,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
     rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
     if len(rows_to_drop) > 0:
-        print(f"
+        print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
         df_clean = df_clean.drop(index=rows_to_drop)
     else:
         print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
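
The completed print statement documents the rule applied here: a row is dropped when its fraction of missing feature values strictly exceeds the threshold. A small worked example:

    import pandas as pd

    df_clean = pd.DataFrame({"f1": [1.0, None, None], "f2": [4.0, None, 6.0]})
    feature_cols = ["f1", "f2"]
    threshold = 0.5

    feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)  # 0.0, 1.0, 0.5
    rows_to_drop = feature_na_frac[feature_na_frac > threshold].index  # only row 1
    df_clean = df_clean.drop(index=rows_to_drop)  # row 2 (exactly 50% missing) is kept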

ml_tools/handle_excel.py (+9 -8)

@@ -36,7 +36,7 @@ def find_excel_files(
     input_path = make_fullpath(directory)
 
     if not input_path.is_dir():
-        raise NotADirectoryError(f"Directory not found: {input_path}")
+        raise NotADirectoryError(f"❌ Directory not found: {input_path}")
 
     excel_files = [
         f for f in input_path.iterdir()
@@ -46,7 +46,7 @@ def find_excel_files(
     ]
 
     if not excel_files:
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+        raise FileNotFoundError(f"❌ No valid Excel files found in directory: {input_path}")
 
     return excel_files
 
@@ -198,7 +198,7 @@ def validate_excel_schema(
                 invalid_files.append(file)
 
         except Exception as e:
-            _LOGGER.error(f"Error processing '{file}': {e}")
+            _LOGGER.error(f"❌ Error processing '{file}': {e}")
             invalid_files.append(file)
 
     valid_excel_number = len(excel_paths) - len(invalid_files)
@@ -251,7 +251,7 @@ def vertical_merge_transform_excel(
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"Invalid columns in {file.name}: {missing}")
+                raise ValueError(f"❌ Invalid columns in {file.name}: {missing}")
             df = df[target_columns]
 
         dataframes.append(df)
@@ -261,7 +261,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of 'rename_columns' must match the selected columns")
+            raise ValueError("❌ Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -324,6 +324,9 @@ def horizontal_merge_transform_excel(
     merged_df = pd.concat(padded_dataframes, axis=1)
 
     duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
+
+    if duplicate_columns:
+        _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
 
     if skip_duplicates:
         merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -344,9 +347,7 @@ def horizontal_merge_transform_excel(
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
 
     _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
-
-    _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
-
+
 
 def info():
     _script_info(__all__)
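
The duplicate-column warning now fires only when duplicates actually exist, and it fires right after the merge instead of unconditionally at the end. A short sketch of the detection it relies on:

    import pandas as pd

    merged_df = pd.concat(
        [pd.DataFrame({"a": [1], "b": [2]}), pd.DataFrame({"b": [3], "c": [4]})],
        axis=1,
    )
    duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()  # ['b']
    if duplicate_columns:
        print(f"⚠️ Duplicate columns: {duplicate_columns}")
    # With skip_duplicates, only the first occurrence of each column is kept:
    deduped = merged_df.loc[:, ~merged_df.columns.duplicated()]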

ml_tools/path_manager.py (+1 -1)

@@ -102,7 +102,7 @@ class PathManager:
         for key in new_paths:
             if key in self._paths:
                 raise KeyError(
-                    f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+                    f"❌ Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
                 )
 
         # Resolve any string paths to Path objects before storing

ml_tools/utilities.py (+73 -41)

@@ -32,28 +32,42 @@ __all__ = [
 def make_fullpath(
     input_path: Union[str, Path],
     make: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
+    enforce: Optional[Literal["directory", "file"]] = None
 ) -> Path:
     """
-    Resolves a string or Path into an absolute Path.
+    Resolves a string or Path into an absolute Path, optionally creating it.
 
     - If the path exists, it is returned.
     - If the path does not exist and `make=True`, it will:
-        - Create the file if the path has a suffix
+        - Create the file if the path has a suffix
         - Create the directory if it has no suffix
     - If `make=False` and the path does not exist, an error is raised.
+    - If `enforce`, raises an error if the resolved path is not what was enforced.
     - Optionally prints whether the resolved path is a file or directory.
 
     Parameters:
-        input_path (str | Path):
-
-
+        input_path (str | Path):
+            Path to resolve.
+        make (bool):
+            If True, attempt to create file or directory.
+        verbose (bool):
+            Print classification after resolution.
+        enforce ("directory" | "file" | None):
+            Raises an error if the resolved path is not what was enforced.
 
     Returns:
         Path: Resolved absolute path.
 
     Raises:
         ValueError: If the path doesn't exist and can't be created.
+        TypeError: If the final path does not match the `enforce` parameter.
+
+    ## 🗒️ Note:
+
+    Directories with dots will be treated as files.
+
+    Files without extension will be treated as directories.
     """
     path = Path(input_path).expanduser()
 
@@ -75,6 +89,12 @@ def make_fullpath(
             resolved = path.resolve(strict=True)
         except Exception as e:
             raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if enforce == "file" and not resolved.is_file():
+        raise TypeError(f"❌ Path was enforced as a file, but it is not: '{resolved}'")
+
+    if enforce == "directory" and not resolved.is_dir():
+        raise TypeError(f"❌ Path was enforced as a directory, but it is not: '{resolved}'")
 
     if verbose:
         if resolved.is_file():
@@ -87,7 +107,7 @@ def make_fullpath(
     return resolved
 
 
-def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
     """
     Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
 
@@ -101,19 +121,20 @@ def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
 
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path.name}")
+        raise IOError(f"❌ No CSV files found in directory: {dir_path.name}")
 
     # make a dictionary of paths and names
     name_path_dict = {p.stem: p for p in csv_paths}
 
-
-
-
+    if verbose:
+        print("\n🗂️ CSV files found:")
+        for name in name_path_dict.keys():
+            print(f"\t{name}")
 
     return name_path_dict
 
 
-def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
+def list_files_by_extension(directory: Union[str,Path], extension: str, verbose: bool=True) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.
@@ -133,13 +154,14 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
 
     matched_paths = list(dir_path.glob(pattern))
     if not matched_paths:
-        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+        raise IOError(f"❌ No '.{normalized_ext}' files found in directory: {dir_path}")
 
     name_path_dict = {p.stem: p for p in matched_paths}
 
-
-
-
+    if verbose:
+        print(f"\n📂 '{normalized_ext.upper()}' files found:")
+        for name in name_path_dict:
+            print(f"\t{name}")
 
     return name_path_dict
 
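
Before the remaining utilities.py hunks, a quick usage sketch of the new enforce parameter added above: resolution works as before, but a TypeError is raised when the resolved path is not of the enforced kind. The import path is assumed from the package layout, and the paths are illustrative:

    from ml_tools.utilities import make_fullpath  # import path assumed

    data_dir = make_fullpath("~/datasets", make=True, enforce="directory")
    table_file = make_fullpath("~/datasets/table.csv", enforce="file")  # must already exist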

@@ -147,7 +169,8 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
 def load_dataframe(
     df_path: Union[str, Path],
     kind: Literal["pandas", "polars"] = "pandas",
-    all_strings: bool = False
+    all_strings: bool = False,
+    verbose: bool = True
 ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
     """
     Load a CSV file into a DataFrame and extract its base name.
@@ -191,20 +214,21 @@
         df = pl.read_csv(path, infer_schema_length=1000)
 
     else:
-        raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+        raise ValueError(f"❌ Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
 
     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
-        raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+        raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")
 
-
+    if verbose:
+        print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
 
     return df, df_name
 
 
-def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
-    Iterates over all CSV files in a given directory, loading each into a
+    Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
 
     Parameters:
         datasets_dir (str | Path):
@@ -221,9 +245,10 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
     datasets_path = make_fullpath(datasets_dir)
-
+    files_dict = list_csv_paths(datasets_path, verbose=verbose)
+    for df_name, df_path in files_dict.items():
         df: pd.DataFrame
-        df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
+        df, _ = load_dataframe(df_path, kind="pandas", verbose=verbose) # type: ignore
         yield df, df_name
 
 
@@ -253,35 +278,35 @@ def merge_dataframes(
         - If column names or order differ for vertical merge.
     """
     if len(dfs) < 2:
-        raise ValueError("At least 2 DataFrames must be provided.")
+        raise ValueError("❌ At least 2 DataFrames must be provided.")
 
     if verbose:
         for i, df in enumerate(dfs, start=1):
-            print(f"DataFrame {i} shape: {df.shape}")
+            print(f"➡️ DataFrame {i} shape: {df.shape}")
 
 
     if direction == "horizontal":
         reference_index = dfs[0].index
         for i, df in enumerate(dfs, start=1):
             if not df.index.equals(reference_index):
-                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Indexes do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=1)
 
     elif direction == "vertical":
         reference_columns = dfs[0].columns
         for i, df in enumerate(dfs, start=1):
             if not df.columns.equals(reference_columns):
-                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Column names/order do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=0)
 
     else:
-        raise ValueError(f"Invalid merge direction: {direction}")
+        raise ValueError(f"❌ Invalid merge direction: {direction}")
 
     if reset_index:
         merged_df = merged_df.reset_index(drop=True)
 
     if verbose:
-        print(f"Merged DataFrame shape: {merged_df.shape}")
+        print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")
 
     return merged_df
 
@@ -320,9 +345,9 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         df.write_csv(output_path) # Polars defaults to utf8 and no index
     else:
         # This error handles cases where an unsupported type is passed
-        raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+        raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
 
-    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
+    print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")
 
 
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -356,7 +381,7 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
 
     # Raise for negative values
     if any(x < 0 for x in float_list):
-        raise ValueError("Negative values are not allowed in the input list.")
+        raise ValueError("❌ Negative values are not allowed in the input list.")
 
     # Step 2: Compute log10 of non-zero values
     nonzero = [x for x in float_list if x > 0]
@@ -395,7 +420,7 @@ def sanitize_filename(filename: str) -> str:
     - Removing or replacing characters invalid in filenames.
 
     Args:
-
+        filename (str): Base filename.
 
     Returns:
         str: A sanitized string suitable to use as a filename.
@@ -408,6 +433,10 @@ def sanitize_filename(filename: str) -> str:
 
     # Conservative filter to keep filenames safe across platforms
     sanitized = re.sub(r'[^\w\-.]', '', sanitized)
+
+    # Check for empty string after sanitization
+    if not sanitized:
+        raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")
 
     return sanitized
 
@@ -418,6 +447,8 @@ def threshold_binary_values(
 ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
     Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+    Binary elements are converted to 0 or 1 using a 0.5 threshold.
 
     Parameters:
         input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
@@ -426,7 +457,8 @@ def threshold_binary_values(
         - If `int`, only this many last `binary_values` are thresholded.
 
     Returns:
-
+        Any:
+            Same type as input
     """
     original_type = type(input_array)
 
@@ -437,14 +469,14 @@ def threshold_binary_values(
     elif isinstance(input_array, (list, tuple)):
         array = np.array(input_array)
     else:
-        raise TypeError("Unsupported input type")
+        raise TypeError("❌ Unsupported input type")
 
     array = array.flatten()
     total = array.shape[0]
 
     bin_count = total if binary_values is None else binary_values
     if not (0 <= bin_count <= total):
-        raise ValueError("binary_values must be between 0 and the total number of elements")
+        raise ValueError("❌ binary_values must be between 0 and the total number of elements")
 
     if bin_count == 0:
         result = array
@@ -484,9 +516,9 @@ def threshold_binary_values_batch(
     np.ndarray
         Thresholded array, same shape as input.
     """
-    assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+    assert input_array.ndim == 2, f"❌ Expected 2D array, got {input_array.ndim}D"
     batch_size, total_features = input_array.shape
-    assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+    assert 0 <= binary_values <= total_features, "❌ binary_values out of valid range"
 
     if binary_values == 0:
         return input_array.copy()
@@ -523,7 +555,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         return None
     else:
        if verbose:
-            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+            print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
        return None
 
 
@@ -550,7 +582,7 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
         return None
     else:
         if verbose:
-            print(f"✅ Loaded object of type '{type(obj)}'")
+            print(f"\n✅ Loaded object of type '{type(obj)}'")
         return obj
 
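
The clarified threshold_binary_values docstring pins down the semantics: only the last binary_values elements are touched, and each is snapped to 0 or 1 at a 0.5 cutoff. A sketch of that rule (exact tie handling at 0.5 is an assumption):

    import numpy as np

    vector = np.array([3.7, 0.92, 0.12, 0.55])
    binary_values = 3  # binary features are counted from the end

    thresholded = vector.copy()
    thresholded[-binary_values:] = (thresholded[-binary_values:] > 0.5).astype(float)
    print(thresholded)  # [3.7  1.   0.   1.  ]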