dragon-ml-toolbox 10.8.0__py3-none-any.whl → 10.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dragon-ml-toolbox has been flagged as potentially problematic.

dragon_ml_toolbox-10.10.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.8.0
+Version: 10.10.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-10.10.0.dist-info/RECORD CHANGED
@@ -1,36 +1,36 @@
-dragon_ml_toolbox-10.8.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-10.8.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+dragon_ml_toolbox-10.10.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-10.10.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
 ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
 ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
 ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
-ml_tools/ML_datasetmaster.py,sha256=BMmdCVAZ-HSnnSPLzKla2TdZKvHkHj4t9A0V1Ba3i-I,30821
-ml_tools/ML_evaluation.py,sha256=28JJ2M71p4pxniwav2Hv3b1a5dsvaoIYNLm-UJQuXvY,16002
+ml_tools/ML_datasetmaster.py,sha256=vqKZhCXsvN5yeRJdOKqMPh5OhY1xe6xlNjM3WoH5lys,30821
+ml_tools/ML_evaluation.py,sha256=6FB6S-aDDpFzQdrp3flBVECzEsHhMbQknYVGhHooEFs,16207
 ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
 ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=FliuqGhxP7AWHCweTLlfssXFOjwvFhIYJsgj_w_-EI4,27901
+ml_tools/ML_models.py,sha256=8UOMg9Qn8qtecUGfgnLRedX-lCWYwEs-C5RJ2m8mZM4,27544
 ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
-ml_tools/ML_scaler.py,sha256=IrZsAr1xjvuLi8s5IKR-qbk2mS_awl3mn_xoXg5TJyA,7535
-ml_tools/ML_trainer.py,sha256=xw1zMgYpdqwsTt604xe3GTQNvpg6z6Ze-avmitGBFeU,23539
+ml_tools/ML_scaler.py,sha256=h2ymq5u953Lx60Qb38Y0mAWj85x9PbnP0xYNQ3pd8-w,7535
+ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
 ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
 ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
-ml_tools/SQL.py,sha256=WDgdZUYuLBUpv-4Am9XjVY_Aq_jxBWdLrbcgAIEwefI,10704
+ml_tools/SQL.py,sha256=givoz6CGWRUdqnBem3VGZxzGdo3ZbX00kyHNjzI8kWE,10803
 ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
 ml_tools/data_exploration.py,sha256=4McT2BR9muK4JVVTKUfvRyThe0m_o2vpy9RJ1f_1FeY,28692
-ml_tools/ensemble_evaluation.py,sha256=xMEMfXJ5MjTkTfr1LkFOeD7iUtnVDCW3S9lm3zT-6tY,24778
+ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
 ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
 ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
-ml_tools/keys.py,sha256=sZANLHvp_93pPigviMOz7AhampGlpokcop_llzsjWBw,1689
+ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
 ml_tools/path_manager.py,sha256=wLJlz3Y9_1-LB9em4B2VYDCVuTOX2eOc7D6hbbebjgM,14990
-ml_tools/utilities.py,sha256=xddY0uASKQWSuUsYJEcfDUkeC-ccbYlkycqHKdkPnhk,25105
-dragon_ml_toolbox-10.8.0.dist-info/METADATA,sha256=Ly11G7vOgCFbYwEYXQXa8RBgvWof9thiBxVjlk9DZu4,6968
-dragon_ml_toolbox-10.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-10.8.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-10.8.0.dist-info/RECORD,,
+ml_tools/utilities.py,sha256=30z0x1aDLyBGzF98_tgSaxwFafYwQS-GTFzXHopBSGc,29105
+dragon_ml_toolbox-10.10.0.dist-info/METADATA,sha256=hSrcYAuoE1H0uF77-8TClwrcdlQwg0f1BGixlh_Q0Wo,6969
+dragon_ml_toolbox-10.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.10.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.10.0.dist-info/RECORD,,
ml_tools/ML_datasetmaster.py CHANGED
@@ -200,7 +200,7 @@ class _BaseDatasetMaker(ABC):
         filepath = save_path / filename
         self.scaler.save(filepath, verbose=False)
         if verbose:
-            _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
+            _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")


 # Single target dataset
ml_tools/ML_evaluation.py CHANGED
@@ -22,6 +22,7 @@ from .path_manager import make_fullpath
 from ._logger import _LOGGER
 from typing import Union, Optional, List
 from ._script_info import _script_info
+from .keys import SHAPKeys


 __all__ = [
@@ -333,7 +334,8 @@ def shap_summary_plot(model,
     plt.close()

     # Save Summary Data to CSV
-    summary_path = save_dir_path / "shap_summary.csv"
+    shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+    summary_path = save_dir_path / shap_summary_filename
     # Ensure the array is 1D before creating the DataFrame
     mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()

@@ -341,9 +343,9 @@ def shap_summary_plot(model,
         feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]

     summary_df = pd.DataFrame({
-        'feature': feature_names,
-        'mean_abs_shap_value': mean_abs_shap
-    }).sort_values('mean_abs_shap_value', ascending=False)
+        SHAPKeys.FEATURE_COLUMN: feature_names,
+        SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+    }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)

     summary_df.to_csv(summary_path, index=False)

@@ -351,7 +353,7 @@ def shap_summary_plot(model,
     plt.ion()


-def plot_attention_importance(weights: List[torch.Tensor], feature_names: Optional[List[str]], save_dir: Union[str, Path]):
+def plot_attention_importance(weights: List[torch.Tensor], feature_names: Optional[List[str]], save_dir: Union[str, Path], top_n: int = 10):
     """
     Aggregates attention weights and plots global feature importance.

@@ -362,6 +364,7 @@ def plot_attention_importance(weights: List[torch.Tensor], feature_names: Option
         weights (List[torch.Tensor]): A list of attention weight tensors from each batch.
         feature_names (List[str] | None): Names of the features for plot labeling.
         save_dir (str | Path): Directory to save the plot and summary CSV.
+        top_n (int): The number of top features to display in the plot.
     """
     if not weights:
         _LOGGER.error("Attention weights list is empty. Skipping importance plot.")
@@ -390,11 +393,10 @@ def plot_attention_importance(weights: List[torch.Tensor], feature_names: Option
     summary_df.to_csv(summary_path, index=False)
     _LOGGER.info(f"📝 Attention summary data saved as '{summary_path.name}'")

-    # --- Step 3: Create and save the plot ---
-    plt.figure(figsize=(10, 8), dpi=100)
+    # --- Step 3: Create and save the plot for top N features ---
+    plot_df = summary_df.head(top_n).sort_values('mean_attention', ascending=True)

-    # Sort for plotting
-    plot_df = summary_df.sort_values('mean_attention', ascending=True)
+    plt.figure(figsize=(10, 8), dpi=100)

     # Create horizontal bar plot with error bars
     plt.barh(
@@ -408,7 +410,7 @@ def plot_attention_importance(weights: List[torch.Tensor], feature_names: Option
         color='cornflowerblue'
     )

-    plt.title('Global Feature Importance')
+    plt.title('Top Features by Attention')
     plt.xlabel('Average Attention Weight')
     plt.ylabel('Feature')
     plt.grid(axis='x', linestyle='--', alpha=0.6)
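
The new top_n parameter caps how many features appear in the attention plot; the summary CSV still contains every feature. A minimal sketch of the updated call, assuming each weight tensor is shaped (batch_size, n_features) and using random tensors as stand-ins for real per-batch attention weights:

    import torch
    from ml_tools.ML_evaluation import plot_attention_importance

    # Five fake batches of attention weights over 20 features.
    weights = [torch.rand(32, 20) for _ in range(5)]
    names = [f"sensor_{i}" for i in range(20)]

    # Writes the full summary CSV, but the bar plot shows only the top 8 features.
    plot_attention_importance(weights=weights, feature_names=names,
                              save_dir="reports/attention", top_n=8)
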
ml_tools/ML_models.py CHANGED
@@ -43,7 +43,7 @@ class _ArchitectureHandlerMixin:
             json.dump(config, f, indent=4)

         if verbose:
-            _LOGGER.info(f"Architecture for '{self.__class__.__name__}' saved to '{path_dir.name}'")
+            _LOGGER.info(f"Architecture for '{self.__class__.__name__}' saved as '{full_path.name}'")

     @classmethod
     def load(cls: type, file_or_dir: Union[str, Path], verbose: bool = True) -> nn.Module:
@@ -147,6 +147,30 @@ class _BaseMLP(nn.Module, _ArchitectureHandlerMixin):
         return f"{name}(arch: {arch_str})"


+class _BaseAttention(_BaseMLP):
+    """
+    Abstract base class for MLP models that incorporate an attention mechanism
+    before the main MLP layers.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # By default, models inheriting this do not have the flag.
+        self.has_interpretable_attention = False
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Defines the standard forward pass."""
+        logits, _attention_weights = self.forward_attention(x)
+        return logits
+
+    def forward_attention(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Returns logits and attention weights."""
+        # This logic is now shared and defined in one place
+        x, attention_weights = self.attention(x)
+        x = self.mlp(x)
+        logits = self.output_layer(x)
+        return logits, attention_weights
+
+
 class MultilayerPerceptron(_BaseMLP):
     """
     Creates a versatile Multilayer Perceptron (MLP) for regression or classification tasks.
@@ -184,7 +208,7 @@ class MultilayerPerceptron(_BaseMLP):
         return self._repr_helper(name="MultilayerPerceptron", mlp_layers=layer_sizes)


-class AttentionMLP(_BaseMLP):
+class AttentionMLP(_BaseAttention):
     """
     A Multilayer Perceptron (MLP) that incorporates an Attention layer to dynamically weigh input features.

@@ -205,25 +229,7 @@ class AttentionMLP(_BaseMLP):
         super().__init__(in_features, out_targets, hidden_layers, drop_out)
         # Attention
         self.attention = _AttentionLayer(in_features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Defines the standard forward pass.
-        """
-        logits, _attention_weights = self.forward_attention(x)
-        return logits
-
-    def forward_attention(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Returns logits and attention weights
-        """
-        # The attention layer returns the processed x and the weights
-        x, attention_weights = self.attention(x)
-
-        # Pass the attention-modified tensor through the MLP
-        logits = self.mlp(x)
-
-        return logits, attention_weights
+        self.has_interpretable_attention = True

     def __repr__(self) -> str:
         """Returns the developer-friendly string representation of the model."""
@@ -238,7 +244,7 @@ class AttentionMLP(_BaseMLP):
         return self._repr_helper(name="AttentionMLP", mlp_layers=arch)


-class MultiHeadAttentionMLP(_BaseMLP):
+class MultiHeadAttentionMLP(_BaseAttention):
     """
     An MLP that incorporates a standard `nn.MultiheadAttention` layer to process
     the input features.
@@ -267,24 +273,6 @@ class MultiHeadAttentionMLP(_BaseMLP):
             dropout=attention_dropout
         )

-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Defines the standard forward pass of the model."""
-        logits, _attention_weights = self.forward_attention(x)
-        return logits
-
-    def forward_attention(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Returns logits and attention weights.
-        """
-        # The attention layer returns the processed x and the weights
-        x, attention_weights = self.attention(x)
-
-        # Pass the attention-modified tensor through the MLP and prediction head
-        x = self.mlp(x)
-        logits = self.output_layer(x)
-
-        return logits, attention_weights
-
     def get_architecture_config(self) -> Dict[str, Any]:
         """Returns the full configuration of the model."""
         config = super().get_architecture_config()
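
The duplicated forward/forward_attention pair from AttentionMLP and MultiHeadAttentionMLP now lives once in _BaseAttention, and has_interpretable_attention marks which subclasses expose weights that map cleanly onto input features. A sketch of the shared API; the constructor values are illustrative and hidden_layers is assumed to take a list of layer widths:

    import torch
    from ml_tools.ML_models import AttentionMLP

    model = AttentionMLP(in_features=20, out_targets=1,
                         hidden_layers=[64, 32], drop_out=0.2)
    x = torch.rand(8, 20)

    logits = model(x)                          # standard forward pass
    logits, attn = model.forward_attention(x)  # both inherited from _BaseAttention

    # True for AttentionMLP; MultiHeadAttentionMLP leaves the flag False.
    print(model.has_interpretable_attention)
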
ml_tools/ML_scaler.py CHANGED
@@ -164,7 +164,7 @@ class PytorchScaler:
         }
         torch.save(state, path_obj)
         if verbose:
-            _LOGGER.info(f"PytorchScaler state saved to '{path_obj.name}'.")
+            _LOGGER.info(f"PytorchScaler state saved as '{path_obj.name}'.")

     @staticmethod
     def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
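
Only the log wording changes here, but the save/load round trip it belongs to is worth seeing. A sketch, assuming `scaler` is an already-fitted PytorchScaler instance:

    from ml_tools.ML_scaler import PytorchScaler

    scaler.save("artifacts/scaler.pth", verbose=True)   # logs "... saved as 'scaler.pth'."
    restored = PytorchScaler.load("artifacts/scaler.pth")
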
ml_tools/ML_trainer.py CHANGED
@@ -472,23 +472,30 @@ class MLTrainer:

            yield attention_weights

-    def explain_attention(self, save_dir: Union[str, Path], feature_names: Optional[List[str]], explain_dataset: Optional[Dataset] = None):
+    def explain_attention(self, save_dir: Union[str, Path],
+                          feature_names: Optional[List[str]],
+                          explain_dataset: Optional[Dataset] = None,
+                          plot_n_features: int = 10):
         """
         Generates and saves a feature importance plot based on attention weights.

-        This method only works for models with a `forward_attention` method.
+        This method only works for models flagged with 'has_interpretable_attention'.

         Args:
             save_dir (str | Path): Directory to save the plot and summary data.
-            feature_names (List[str] | None): Names for the features for plot labeling.
+            feature_names (List[str] | None): Names for the features for plot labeling. If not given, generic names will be used.
             explain_dataset (Dataset, optional): A specific dataset to explain. If None, the trainer's test dataset is used.
+            plot_n_features (int): Number of top features to plot.
         """

         print("\n--- Attention Analysis ---")

         # --- Step 1: Check if the model supports this explanation ---
-        if not hasattr(self.model, 'forward_attention'):
-            _LOGGER.error("Model does not have a `forward_attention` method. Skipping attention explanation.")
+        if not getattr(self.model, 'has_interpretable_attention', False):
+            _LOGGER.warning(
+                "Model is not flagged for interpretable attention analysis. "
+                "Skipping. This is the correct behavior for models like MultiHeadAttentionMLP."
+            )
             return

         # --- Step 2: Set up the dataloader ---
@@ -514,7 +521,8 @@ class MLTrainer:
             plot_attention_importance(
                 weights=all_weights,
                 feature_names=feature_names,
-                save_dir=save_dir
+                save_dir=save_dir,
+                top_n=plot_n_features
             )
         else:
             _LOGGER.error("No attention weights were collected from the model.")
ml_tools/SQL.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Union, Dict, Any, Optional, List, Literal
 from ._logger import _LOGGER
 from ._script_info import _script_info
-from .path_manager import make_fullpath
+from .path_manager import make_fullpath, sanitize_filename


 __all__ = [
@@ -94,11 +94,13 @@ class DatabaseManager:
         if not self.cursor:
             _LOGGER.error("Database connection is not open.")
             raise sqlite3.Error()
+
+        sanitized_table_name = sanitize_filename(table_name)

         columns_def = ", ".join([f'"{col_name}" {col_type}' for col_name, col_type in schema.items()])
         exists_clause = "IF NOT EXISTS" if if_not_exists else ""

-        query = f"CREATE TABLE {exists_clause} {table_name} ({columns_def})"
+        query = f"CREATE TABLE {exists_clause} {sanitized_table_name} ({columns_def})"

         _LOGGER.info(f"➡️ Executing: {query}")
         self.cursor.execute(query)
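
Table names are now passed through sanitize_filename before being interpolated into the CREATE TABLE statement, so stray characters cannot corrupt the query. A hedged sketch; the enclosing method's name is not shown in this diff (create_table is assumed here), and `db` is assumed to be an open DatabaseManager:

    schema = {"id": "INTEGER PRIMARY KEY", "name": "TEXT", "score": "REAL"}

    # "my results!" is sanitized before it reaches the SQL string.
    db.create_table("my results!", schema, if_not_exists=True)
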
ml_tools/ensemble_evaluation.py CHANGED
@@ -25,6 +25,7 @@ from typing import Union, Optional, Literal
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import SHAPKeys


 __all__ = [
@@ -472,7 +473,7 @@ def get_shap_values(
         save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
-    global_save_path = make_fullpath(save_dir, make=True)
+    global_save_path = make_fullpath(save_dir, make=True, enforce="directory")

     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -539,6 +540,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name} (Class {class_name})"
             )
+
+            # Save the summary data for the current class
+            summary_save_path = global_save_path / f"SHAP_{sanitized_target_name}_{class_name}.csv"
+            _save_summary_csv(
+                shap_values_for_summary=class_shap,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )
+
     else:
         values = shap_values[1] if isinstance(shap_values, list) else shap_values
         for plot_type in ["bar", "dot"]:
@@ -549,6 +559,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data for the positive class
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )

     def _plot_for_regression(shap_values):
         for plot_type in ["bar", "dot"]:
@@ -559,6 +578,34 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data to a CSV file
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=shap_values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
+
+    def _save_summary_csv(shap_values_for_summary: np.ndarray, feature_names: list[str], save_path: Path):
+        """Calculates and saves the SHAP summary data to a CSV file."""
+        mean_abs_shap = np.abs(shap_values_for_summary).mean(axis=0)
+
+        # Create default feature names if none are provided
+        current_feature_names = feature_names
+        if current_feature_names is None:
+            current_feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: current_feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+
+        summary_df.to_csv(save_path, index=False)
+        # print(f"📝 SHAP summary data saved as '{save_path.name}'")
+
+
     #START_O

     explainer = shap.TreeExplainer(model)
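
With these additions, get_shap_values persists the numbers behind its plots: multiclass targets get one CSV per class, named SHAP_<target>_<class>.csv, while binary and regression targets get the shared shap_summary.csv. An illustrative read-back, assuming a target called "species" with classes 0 through 2:

    import pandas as pd
    from ml_tools.keys import SHAPKeys

    for class_name in [0, 1, 2]:
        df = pd.read_csv(f"shap_reports/SHAP_species_{class_name}.csv")
        print(class_name, df[SHAPKeys.FEATURE_COLUMN].head(3).tolist())
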
ml_tools/keys.py CHANGED
@@ -61,6 +61,13 @@ class DatasetKeys:
     SCALER_PREFIX = "scaler_"


+class SHAPKeys:
+    """Keys for SHAP functions"""
+    FEATURE_COLUMN = "feature"
+    SHAP_VALUE_COLUMN = "mean_abs_shap_value"
+    SAVENAME = "shap_summary"
+
+
 class _OneHotOtherPlaceholder:
     """Used internally by GUI_tools."""
     OTHER_GUI = "OTHER"
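
These constants pin down the CSV schema shared by the writers (shap_summary_plot, get_shap_values) and the reader (select_features_by_shap below), so a renamed column can only break things in one place:

    from ml_tools.keys import SHAPKeys

    print(SHAPKeys.SAVENAME + ".csv")      # shap_summary.csv
    print(SHAPKeys.FEATURE_COLUMN)         # feature
    print(SHAPKeys.SHAP_VALUE_COLUMN)      # mean_abs_shap_value
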
ml_tools/utilities.py CHANGED
@@ -9,7 +9,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths, list_files_by_extension, list_subdirectories
 from ._script_info import _script_info
 from ._logger import _LOGGER
-from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys
+from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys


 # Keep track of available tools
@@ -26,7 +26,8 @@ __all__ = [
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder",
-    "find_model_artifacts"
+    "find_model_artifacts",
+    "select_features_by_shap"
 ]


@@ -34,6 +35,7 @@ __all__ = [
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -44,7 +46,8 @@
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
-    kind: Literal["polars"],
+    use_columns: Optional[list[str]] = None,
+    kind: Literal["polars"] = "polars",
     all_strings: bool = False,
     verbose: bool = True
 ) -> Tuple[pl.DataFrame, str]:
@@ -52,6 +55,7 @@

 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -60,11 +64,13 @@
     Load a CSV file into a DataFrame and extract its base name.

     Can load data as either a pandas or a polars DataFrame. Allows for loading all
-    columns as string types to prevent type inference errors.
+    columns or a subset of columns as string types to prevent type inference errors.

     Args:
         df_path (str, Path):
             The path to the CSV file.
+        use_columns (list[str] | None):
+            If provided, only these columns will be loaded from the CSV.
         kind ("pandas", "polars"):
             The type of DataFrame to load. Defaults to "pandas".
         all_strings (bool):
@@ -78,28 +84,43 @@

     Raises:
         FileNotFoundError: If the file does not exist at the given path.
-        ValueError: If the DataFrame is empty or an invalid 'kind' is provided.
+        ValueError: If the DataFrame is empty, an invalid 'kind' is provided, or a column in 'use_columns' is not found in the file.
     """
     path = make_fullpath(df_path)

     df_name = path.stem

-    if kind == "pandas":
-        if all_strings:
-            df = pd.read_csv(path, encoding='utf-8', dtype=str)
-        else:
-            df = pd.read_csv(path, encoding='utf-8')
-
-    elif kind == "polars":
-        if all_strings:
-            df = pl.read_csv(path, infer_schema=False)
+    try:
+        if kind == "pandas":
+            pd_kwargs: dict[str,Any]
+            pd_kwargs = {'encoding': 'utf-8'}
+            if use_columns:
+                pd_kwargs['usecols'] = use_columns
+            if all_strings:
+                pd_kwargs['dtype'] = str
+
+            df = pd.read_csv(path, **pd_kwargs)
+
+        elif kind == "polars":
+            pl_kwargs: dict[str,Any]
+            pl_kwargs = {}
+            if use_columns:
+                pl_kwargs['columns'] = use_columns
+
+            if all_strings:
+                pl_kwargs['infer_schema'] = False
+            else:
+                pl_kwargs['infer_schema_length'] = 1000
+
+            df = pl.read_csv(path, **pl_kwargs)
+
         else:
-            # Default behavior: infer the schema.
-            df = pl.read_csv(path, infer_schema_length=1000)
+            _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+            raise ValueError()

-    else:
-        _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
-        raise ValueError()
+    except (ValueError, pl.exceptions.ColumnNotFoundError) as e:
+        _LOGGER.error(f"Failed to load '{df_name}'. A specified column may not exist in the file.")
+        raise e

     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
@@ -111,7 +132,6 @@

     return df, df_name # type: ignore

-
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
@@ -683,5 +703,84 @@ def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, v
     return all_artifacts


+def select_features_by_shap(
+    root_directory: Union[str, Path],
+    shap_threshold: float = 1.0,
+    verbose: bool = True) -> list[str]:
+    """
+    Scans subdirectories to find SHAP summary CSVs, then extracts feature
+    names whose mean absolute SHAP value meets a specified threshold.
+
+    This function is useful for automated feature selection based on feature
+    importance scores aggregated from multiple models.
+
+    Args:
+        root_directory (Union[str, Path]):
+            The path to the root directory that contains model subdirectories.
+        shap_threshold (float):
+            The minimum mean absolute SHAP value for a feature to be included
+            in the final list.
+
+    Returns:
+        list[str]:
+            A single, sorted list of unique feature names that meet the
+            threshold criteria across all found files.
+    """
+    if verbose:
+        _LOGGER.info(f"Starting feature selection with SHAP threshold >= {shap_threshold}")
+    root_path = make_fullpath(root_directory, enforce="directory")
+
+    # --- Step 2: Directory and File Discovery ---
+    subdirectories = list_subdirectories(root_dir=root_path, verbose=False)
+
+    shap_filename = SHAPKeys.SAVENAME + ".csv"
+
+    valid_csv_paths = []
+    for dir_name, dir_path in subdirectories.items():
+        expected_path = dir_path / shap_filename
+        if expected_path.is_file():
+            valid_csv_paths.append(expected_path)
+        else:
+            _LOGGER.warning(f"No '{shap_filename}' found in subdirectory '{dir_name}'.")
+
+    if not valid_csv_paths:
+        _LOGGER.error(f"Process halted: No '{shap_filename}' files were found in any subdirectory.")
+        return []
+
+    if verbose:
+        _LOGGER.info(f"Found {len(valid_csv_paths)} SHAP summary files to process.")
+
+    # --- Step 3: Data Processing and Feature Extraction ---
+    master_feature_set = set()
+    for csv_path in valid_csv_paths:
+        try:
+            df, _ = load_dataframe(csv_path, kind="pandas", verbose=False)
+
+            # Validate required columns
+            required_cols = {SHAPKeys.FEATURE_COLUMN, SHAPKeys.SHAP_VALUE_COLUMN}
+            if not required_cols.issubset(df.columns):
+                _LOGGER.warning(f"Skipping '{csv_path}': missing required columns.")
+                continue
+
+            # Filter by threshold and extract features
+            filtered_df = df[df[SHAPKeys.SHAP_VALUE_COLUMN] >= shap_threshold]
+            features = filtered_df[SHAPKeys.FEATURE_COLUMN].tolist()
+            master_feature_set.update(features)
+
+        except (ValueError, pd.errors.EmptyDataError):
+            _LOGGER.warning(f"Skipping '{csv_path}' because it is empty or malformed.")
+            continue
+        except Exception as e:
+            _LOGGER.error(f"An unexpected error occurred while processing '{csv_path}': {e}")
+            continue
+
+    # --- Step 4: Finalize and Return ---
+    final_features = sorted(list(master_feature_set))
+    if verbose:
+        _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
+
+    return final_features
+
+
 def info():
     _script_info(__all__)
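
Taken together, the two new utilities form a small feature-selection loop: aggregate per-model SHAP summaries, keep the features that clear the threshold, then reload only those columns. An end-to-end sketch with illustrative paths and an assumed "target" label column:

    from ml_tools.utilities import select_features_by_shap, load_dataframe

    # Each subdirectory of models/ is expected to hold a shap_summary.csv.
    selected = select_features_by_shap("models/", shap_threshold=1.0)

    df, name = load_dataframe(
        "data/train.csv",
        use_columns=selected + ["target"],  # keep the label column too
        kind="pandas",
    )
    print(f"{name}: reduced to {df.shape[1]} columns")
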