dragon-ml-toolbox 10.7.0__py3-none-any.whl → 10.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/RECORD +15 -15
- ml_tools/ML_datasetmaster.py +27 -20
- ml_tools/ML_evaluation.py +6 -4
- ml_tools/ML_models.py +11 -7
- ml_tools/ML_scaler.py +6 -4
- ml_tools/SQL.py +4 -2
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/keys.py +26 -3
- ml_tools/path_manager.py +33 -0
- ml_tools/utilities.py +242 -20
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-10.7.0.dist-info → dragon_ml_toolbox-10.9.0.dist-info}/RECORD
CHANGED

@@ -1,36 +1,36 @@
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
+dragon_ml_toolbox-10.9.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-10.9.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
 ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
 ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
 ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
-ml_tools/ML_datasetmaster.py,sha256=
-ml_tools/ML_evaluation.py,sha256=
+ml_tools/ML_datasetmaster.py,sha256=BMmdCVAZ-HSnnSPLzKla2TdZKvHkHj4t9A0V1Ba3i-I,30821
+ml_tools/ML_evaluation.py,sha256=q4_RsBjmidc_yDX-DQvpJW8RCHrOCJbgXKBORQdt-TM,16111
 ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
 ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=
+ml_tools/ML_models.py,sha256=FliuqGhxP7AWHCweTLlfssXFOjwvFhIYJsgj_w_-EI4,27901
 ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
-ml_tools/ML_scaler.py,sha256=
+ml_tools/ML_scaler.py,sha256=IrZsAr1xjvuLi8s5IKR-qbk2mS_awl3mn_xoXg5TJyA,7535
 ml_tools/ML_trainer.py,sha256=xw1zMgYpdqwsTt604xe3GTQNvpg6z6Ze-avmitGBFeU,23539
 ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
 ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
-ml_tools/SQL.py,sha256=
+ml_tools/SQL.py,sha256=givoz6CGWRUdqnBem3VGZxzGdo3ZbX00kyHNjzI8kWE,10803
 ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
 ml_tools/data_exploration.py,sha256=4McT2BR9muK4JVVTKUfvRyThe0m_o2vpy9RJ1f_1FeY,28692
-ml_tools/ensemble_evaluation.py,sha256=
+ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
 ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
 ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
-ml_tools/keys.py,sha256=
+ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
-ml_tools/path_manager.py,sha256=
-ml_tools/utilities.py,sha256=
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
+ml_tools/path_manager.py,sha256=wLJlz3Y9_1-LB9em4B2VYDCVuTOX2eOc7D6hbbebjgM,14990
+ml_tools/utilities.py,sha256=30z0x1aDLyBGzF98_tgSaxwFafYwQS-GTFzXHopBSGc,29105
+dragon_ml_toolbox-10.9.0.dist-info/METADATA,sha256=NK8z4StYOVR0ByF_l-vNjyrFgbb2qddBa6lOzlQsZrg,6968
+dragon_ml_toolbox-10.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.9.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.9.0.dist-info/RECORD,,
ml_tools/ML_datasetmaster.py
CHANGED
@@ -15,6 +15,8 @@ from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
+from .keys import DatasetKeys
+

 __all__ = [
     "DatasetMaker",
@@ -91,6 +93,7 @@ class _BaseDatasetMaker(ABC):
         self.scaler: Optional[PytorchScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
+        self._target_names: List[str] = []
         self._X_train_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
@@ -142,6 +145,10 @@ class _BaseDatasetMaker(ABC):
     @property
     def feature_names(self) -> list[str]:
         return self._feature_names
+
+    @property
+    def target_names(self) -> list[str]:
+        return self._target_names

     @property
     def id(self) -> Optional[str]:
@@ -162,10 +169,17 @@ class _BaseDatasetMaker(ABC):
         """Saves a list of feature names as a text file"""
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
-                          filename=
-                          verbose=verbose)
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)
+
+    def save_target_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """Saves a list of target names as a text file"""
+        save_list_strings(list_strings=self._target_names,
+                          directory=directory,
+                          filename=DatasetKeys.TARGET_NAMES,
+                          verbose=verbose)

-    def save_scaler(self, save_dir: Union[str, Path]):
+    def save_scaler(self, save_dir: Union[str, Path], verbose: bool=True) -> None:
         """
         Saves the fitted PytorchScaler's state to a .pth file.

@@ -178,14 +192,15 @@ class _BaseDatasetMaker(ABC):
             _LOGGER.error("No scaler was fitted or provided.")
             raise RuntimeError()
         if not self.id:
-            _LOGGER.error("Must set the `id` before saving scaler.")
+            _LOGGER.error("Must set the dataset `id` before saving scaler.")
             raise ValueError()
         save_path = make_fullpath(save_dir, make=True, enforce="directory")
         sanitized_id = sanitize_filename(self.id)
-        filename = f"
+        filename = f"{DatasetKeys.SCALER_PREFIX}{sanitized_id}.pth"
         filepath = save_path / filename
-        self.scaler.save(filepath)
-
+        self.scaler.save(filepath, verbose=False)
+        if verbose:
+            _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")


 # Single target dataset
@@ -203,7 +218,7 @@ class DatasetMaker(_BaseDatasetMaker):
     `train_dataset` -> PyTorch Dataset
     `test_dataset` -> PyTorch Dataset
     `feature_names` -> list[str]
-    `
+    `target_names` -> list[str]
     `id` -> str

     The ID can be manually set to any string if needed, it is the target name by default.
@@ -231,8 +246,8 @@ class DatasetMaker(_BaseDatasetMaker):
         features = pandas_df.iloc[:, :-1]
         target = pandas_df.iloc[:, -1]
         self._feature_names = features.columns.tolist()
-        self.
-        self._id = self.
+        self._target_names = [str(target.name)]
+        self._id = self._target_names[0]

         # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
@@ -249,12 +264,8 @@ class DatasetMaker(_BaseDatasetMaker):
         )

         # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=
-
-    @property
-    def target_name(self) -> str:
-        return self._target_name
+        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)


 # --- New Multi-Target Class ---
@@ -303,10 +314,6 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

-    @property
-    def target_names(self) -> list[str]:
-        return self._target_names
-

 # --- Private Base Class ---
 class _BaseMaker(ABC):
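The dataset-maker base class now exposes target names alongside feature names and can persist both, plus a quieter scaler save. A minimal usage sketch against the new surface (the maker's constructor arguments are not part of this diff, so the helper below takes an already-built maker; the output directory is hypothetical):

```python
from pathlib import Path
from ml_tools.ML_datasetmaster import DatasetMaker

def export_dataset_artifacts(maker: DatasetMaker, out_dir: Path) -> None:
    """Persist the per-dataset artifacts that 10.9.0 modules expect to find together."""
    print(maker.feature_names)                # existing property
    print(maker.target_names)                 # new: list[str], shared by single- and multi-target makers
    maker.save_target_names(out_dir)          # new: writes target names using DatasetKeys.TARGET_NAMES
    maker.save_scaler(out_dir, verbose=True)  # filename is now DatasetKeys.SCALER_PREFIX + sanitized id + ".pth"
```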
ml_tools/ML_evaluation.py
CHANGED
@@ -22,6 +22,7 @@ from .path_manager import make_fullpath
 from ._logger import _LOGGER
 from typing import Union, Optional, List
 from ._script_info import _script_info
+from .keys import SHAPKeys


 __all__ = [
@@ -333,7 +334,8 @@ def shap_summary_plot(model,
     plt.close()

     # Save Summary Data to CSV
-
+    shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+    summary_path = save_dir_path / shap_summary_filename
     # Ensure the array is 1D before creating the DataFrame
     mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()

@@ -341,9 +343,9 @@ def shap_summary_plot(model,
         feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]

     summary_df = pd.DataFrame({
-
-
-    }).sort_values(
+        SHAPKeys.FEATURE_COLUMN: feature_names,
+        SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+    }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)

     summary_df.to_csv(summary_path, index=False)

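`shap_summary_plot` now derives the summary CSV's name and column headers from `SHAPKeys` instead of hard-coded strings. A small sketch of reading that file back with the same constants (the output directory name is an assumption):

```python
from pathlib import Path
import pandas as pd
from ml_tools.keys import SHAPKeys

save_dir = Path("evaluation_output")                   # assumed; whatever was passed to shap_summary_plot
summary_csv = save_dir / (SHAPKeys.SAVENAME + ".csv")  # resolves to shap_summary.csv

summary = pd.read_csv(summary_csv)
top10 = summary.nlargest(10, SHAPKeys.SHAP_VALUE_COLUMN)[SHAPKeys.FEATURE_COLUMN]
print(top10.tolist())
```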
ml_tools/ML_models.py
CHANGED
@@ -6,7 +6,7 @@ import json
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
 from ._script_info import _script_info
-from .keys import
+from .keys import PytorchModelArchitectureKeys


 __all__ = [
@@ -29,11 +29,14 @@ class _ArchitectureHandlerMixin:
             raise AttributeError()

         path_dir = make_fullpath(directory, make=True, enforce="directory")
-
+
+        json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+
+        full_path = path_dir / json_filename

         config = {
-
-
+            PytorchModelArchitectureKeys.MODEL: self.__class__.__name__,
+            PytorchModelArchitectureKeys.CONFIG: self.get_architecture_config() # type: ignore
         }

         with open(full_path, 'w') as f:
@@ -48,7 +51,8 @@ class _ArchitectureHandlerMixin:
         user_path = make_fullpath(file_or_dir)

         if user_path.is_dir():
-
+            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+            target_path = make_fullpath(user_path / json_filename, enforce="file")
         elif user_path.is_file():
             target_path = user_path
         else:
@@ -58,8 +62,8 @@ class _ArchitectureHandlerMixin:
         with open(target_path, 'r') as f:
             saved_data = json.load(f)

-        saved_class_name = saved_data[
-        config = saved_data[
+        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
+        config = saved_data[PytorchModelArchitectureKeys.CONFIG]

         if saved_class_name != cls.__name__:
             _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
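With `PytorchModelArchitectureKeys`, the mixin saves the architecture as `architecture.json` with a `model_class` entry and a `config` entry. A minimal sketch of reading such a file outside the mixin (the directory is hypothetical):

```python
import json
from pathlib import Path
from ml_tools.keys import PytorchModelArchitectureKeys

def read_architecture(model_dir: Path) -> tuple[str, dict]:
    """Return (class name, constructor config) from a saved architecture file."""
    arch_file = model_dir / (PytorchModelArchitectureKeys.SAVENAME + ".json")  # architecture.json
    with open(arch_file, "r") as f:
        saved = json.load(f)
    return saved[PytorchModelArchitectureKeys.MODEL], saved[PytorchModelArchitectureKeys.CONFIG]
```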
ml_tools/ML_scaler.py
CHANGED
@@ -149,7 +149,7 @@ class PytorchScaler:

         return data_clone

-    def save(self, filepath: Union[str, Path]):
+    def save(self, filepath: Union[str, Path], verbose: bool=True):
         """
         Saves the scaler's state (mean, std, indices) to a .pth file.

@@ -163,10 +163,11 @@ class PytorchScaler:
             'continuous_feature_indices': self.continuous_feature_indices
         }
         torch.save(state, path_obj)
-
+        if verbose:
+            _LOGGER.info(f"PytorchScaler state saved to '{path_obj.name}'.")

     @staticmethod
-    def load(filepath: Union[str, Path]) -> 'PytorchScaler':
+    def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
         """
         Loads a scaler's state from a .pth file.

@@ -178,7 +179,8 @@ class PytorchScaler:
         """
         path_obj = make_fullpath(filepath, enforce="file")
         state = torch.load(path_obj)
-
+        if verbose:
+            _LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
         return PytorchScaler(
             mean=state['mean'],
             std=state['std'],
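`PytorchScaler.save` and `PytorchScaler.load` gain a `verbose` flag so callers (such as `save_scaler` above) can suppress the per-file log lines. A minimal sketch, with hypothetical file paths:

```python
from ml_tools.ML_scaler import PytorchScaler

def resave_quietly(src: str, dst: str) -> None:
    scaler = PytorchScaler.load(src, verbose=False)  # new: skip the "state loaded" log line
    scaler.save(dst, verbose=True)                   # default behaviour still logs the save
```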
ml_tools/SQL.py
CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Union, Dict, Any, Optional, List, Literal
 from ._logger import _LOGGER
 from ._script_info import _script_info
-from .path_manager import make_fullpath
+from .path_manager import make_fullpath, sanitize_filename


 __all__ = [
@@ -94,11 +94,13 @@ class DatabaseManager:
         if not self.cursor:
             _LOGGER.error("Database connection is not open.")
             raise sqlite3.Error()
+
+        sanitized_table_name = sanitize_filename(table_name)

         columns_def = ", ".join([f'"{col_name}" {col_type}' for col_name, col_type in schema.items()])
         exists_clause = "IF NOT EXISTS" if if_not_exists else ""

-        query = f"CREATE TABLE {exists_clause} {
+        query = f"CREATE TABLE {exists_clause} {sanitized_table_name} ({columns_def})"

         _LOGGER.info(f"➡️ Executing: {query}")
         self.cursor.execute(query)
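The table-creation path now runs `table_name` through `sanitize_filename` before interpolating it into the CREATE TABLE statement. The standalone sketch below illustrates the same defensive pattern with plain `sqlite3`; it is not the package's method, and `sanitize_identifier` is only a stand-in for `sanitize_filename`, whose exact rules are not shown in this diff:

```python
import re
import sqlite3

def sanitize_identifier(name: str) -> str:
    # stand-in: collapse anything that is not alphanumeric or underscore
    return re.sub(r"[^0-9a-zA-Z_]", "_", name)

def create_table(conn: sqlite3.Connection, table_name: str, schema: dict[str, str]) -> None:
    columns_def = ", ".join(f'"{col}" {ctype}' for col, ctype in schema.items())
    query = f"CREATE TABLE IF NOT EXISTS {sanitize_identifier(table_name)} ({columns_def})"
    conn.execute(query)

conn = sqlite3.connect(":memory:")
create_table(conn, "results 2024; DROP TABLE x", {"id": "INTEGER", "score": "REAL"})
```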
ml_tools/ensemble_evaluation.py
CHANGED
@@ -25,6 +25,7 @@ from typing import Union, Optional, Literal
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import SHAPKeys


 __all__ = [
@@ -472,7 +473,7 @@ def get_shap_values(
         save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
-    global_save_path = make_fullpath(save_dir, make=True)
+    global_save_path = make_fullpath(save_dir, make=True, enforce="directory")

     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -539,6 +540,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name} (Class {class_name})"
             )
+
+            # Save the summary data for the current class
+            summary_save_path = global_save_path / f"SHAP_{sanitized_target_name}_{class_name}.csv"
+            _save_summary_csv(
+                shap_values_for_summary=class_shap,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )
+
         else:
             values = shap_values[1] if isinstance(shap_values, list) else shap_values
             for plot_type in ["bar", "dot"]:
@@ -549,6 +559,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+            # Save the summary data for the positive class
+            shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+            summary_save_path = global_save_path / shap_summary_filename
+            _save_summary_csv(
+                shap_values_for_summary=values,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )

     def _plot_for_regression(shap_values):
         for plot_type in ["bar", "dot"]:
@@ -559,6 +578,34 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data to a CSV file
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=shap_values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
+
+    def _save_summary_csv(shap_values_for_summary: np.ndarray, feature_names: list[str], save_path: Path):
+        """Calculates and saves the SHAP summary data to a CSV file."""
+        mean_abs_shap = np.abs(shap_values_for_summary).mean(axis=0)
+
+        # Create default feature names if none are provided
+        current_feature_names = feature_names
+        if current_feature_names is None:
+            current_feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+
+        summary_df.to_csv(save_path, index=False)
+        # print(f"📝 SHAP summary data saved as '{save_path.name}'")
+
+
     #START_O

     explainer = shap.TreeExplainer(model)
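The new `_save_summary_csv` helper writes, for each target (and for each class in the multiclass branch), the mean absolute SHAP value per feature sorted in descending order. A self-contained sketch of that computation with toy numbers:

```python
import numpy as np
import pandas as pd
from ml_tools.keys import SHAPKeys

def shap_summary_frame(shap_values: np.ndarray, feature_names: list[str]) -> pd.DataFrame:
    """Mean |SHAP| per feature for a (n_samples, n_features) array, sorted descending."""
    mean_abs = np.abs(shap_values).mean(axis=0)
    return (pd.DataFrame({SHAPKeys.FEATURE_COLUMN: feature_names,
                          SHAPKeys.SHAP_VALUE_COLUMN: mean_abs})
              .sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False))

frame = shap_summary_frame(np.array([[0.2, -1.5], [0.1, 2.0]]), ["age", "dose"])
print(frame)  # "dose" ranks first (1.75), then "age" (0.15)
```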
ml_tools/keys.py
CHANGED
@@ -38,11 +38,34 @@ class PyTorchInferenceKeys:
     PROBABILITIES = "probabilities"


-class
-    """Keys for saving and loading
+class PytorchModelArchitectureKeys:
+    """Keys for saving and loading model architecture."""
     MODEL = 'model_class'
     CONFIG = "config"
-    SAVENAME = "architecture
+    SAVENAME = "architecture"
+
+
+class PytorchArtifactPathKeys:
+    """Keys for model artifact paths."""
+    FEATURES_PATH = "feature_names_path"
+    TARGETS_PATH = "target_names_path"
+    ARCHITECTURE_PATH = "model_architecture_path"
+    WEIGHTS_PATH = "model_weights_path"
+    SCALER_PATH = "scaler_path"
+
+
+class DatasetKeys:
+    """Keys for saving dataset artifacts"""
+    FEATURE_NAMES = "feature_names"
+    TARGET_NAMES = "target_names"
+    SCALER_PREFIX = "scaler_"
+
+
+class SHAPKeys:
+    """Keys for SHAP functions"""
+    FEATURE_COLUMN = "feature"
+    SHAP_VALUE_COLUMN = "mean_abs_shap_value"
+    SAVENAME = "shap_summary"


 class _OneHotOtherPlaceholder:
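The new key classes centralize the artifact names that the other modules in this release build their filenames from. A quick sketch of the strings they resolve to:

```python
from ml_tools.keys import DatasetKeys, PytorchModelArchitectureKeys, SHAPKeys

dataset_id = "toxicity"                                     # hypothetical dataset id
print(PytorchModelArchitectureKeys.SAVENAME + ".json")      # architecture.json
print(DatasetKeys.SCALER_PREFIX + dataset_id + ".pth")      # scaler_toxicity.pth
print(SHAPKeys.SAVENAME + ".csv")                           # shap_summary.csv
print(DatasetKeys.FEATURE_NAMES, DatasetKeys.TARGET_NAMES)  # feature_names target_names
```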
ml_tools/path_manager.py
CHANGED
@@ -13,6 +13,7 @@ __all__ = [
     "sanitize_filename",
     "list_csv_paths",
     "list_files_by_extension",
+    "list_subdirectories"
 ]


@@ -385,5 +386,37 @@ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose:
     return name_path_dict


+def list_subdirectories(root_dir: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
+    """
+    Scans a directory and returns a dictionary of its immediate subdirectories.
+
+    Args:
+        root_dir (str | Path): The path to the directory to scan.
+        verbose (bool): If True, prints the number of directories found.
+
+    Returns:
+        dict[str, Path]: A dictionary mapping subdirectory names (str) to their full Path objects.
+    """
+    root_path = make_fullpath(root_dir, enforce="directory")
+
+    directories = [p.resolve() for p in root_path.iterdir() if p.is_dir()]
+
+    if len(directories) < 1:
+        _LOGGER.error(f"No subdirectories found inside '{root_path}'")
+        raise IOError()
+
+    if verbose:
+        count = len(directories)
+        # Use pluralization for better readability
+        plural = 'ies' if count != 1 else 'y'
+        print(f"Found {count} subdirector{plural} in '{root_path.name}'.")
+
+    # Create a dictionary where the key is the directory's name (a string)
+    # and the value is the full Path object.
+    dir_map = {p.name: p for p in directories}
+
+    return dir_map
+
+
 def info():
     _script_info(__all__)
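A minimal usage sketch of the new `list_subdirectories` helper (the directory name is hypothetical); it returns a name-to-Path mapping of immediate subdirectories and raises `IOError` if there are none:

```python
from ml_tools.path_manager import list_subdirectories

runs = list_subdirectories("training_results", verbose=True)  # assumed directory of model runs
for name, path in runs.items():
    print(name, "->", path)
```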
ml_tools/utilities.py
CHANGED
@@ -6,9 +6,10 @@ from pathlib import Path
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, overload
 import joblib
 from joblib.externals.loky.process_executor import TerminatedWorkerError
-from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
+from .path_manager import sanitize_filename, make_fullpath, list_csv_paths, list_files_by_extension, list_subdirectories
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys


 # Keep track of available tools
@@ -24,7 +25,9 @@ __all__ = [
     "deserialize_object",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
-    "train_dataset_yielder"
+    "train_dataset_yielder",
+    "find_model_artifacts",
+    "select_features_by_shap"
 ]


@@ -32,6 +35,7 @@ __all__ = [
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -42,7 +46,8 @@ def load_dataframe(
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
-
+    use_columns: Optional[list[str]] = None,
+    kind: Literal["polars"] = "polars",
     all_strings: bool = False,
     verbose: bool = True
 ) -> Tuple[pl.DataFrame, str]:
@@ -50,6 +55,7 @@ def load_dataframe(

 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -58,11 +64,13 @@ def load_dataframe(
     Load a CSV file into a DataFrame and extract its base name.

     Can load data as either a pandas or a polars DataFrame. Allows for loading all
-    columns as string types to prevent type inference errors.
+    columns or a subset of columns as string types to prevent type inference errors.

     Args:
         df_path (str, Path):
             The path to the CSV file.
+        use_columns (list[str] | None):
+            If provided, only these columns will be loaded from the CSV.
         kind ("pandas", "polars"):
             The type of DataFrame to load. Defaults to "pandas".
         all_strings (bool):
@@ -76,28 +84,43 @@ def load_dataframe(

     Raises:
         FileNotFoundError: If the file does not exist at the given path.
-        ValueError: If the DataFrame is empty
+        ValueError: If the DataFrame is empty, an invalid 'kind' is provided, or a column in 'use_columns' is not found in the file.
     """
     path = make_fullpath(df_path)

     df_name = path.stem

-
-    if
-
-
-
-
-
-
-
+    try:
+        if kind == "pandas":
+            pd_kwargs: dict[str,Any]
+            pd_kwargs = {'encoding': 'utf-8'}
+            if use_columns:
+                pd_kwargs['usecols'] = use_columns
+            if all_strings:
+                pd_kwargs['dtype'] = str
+
+            df = pd.read_csv(path, **pd_kwargs)
+
+        elif kind == "polars":
+            pl_kwargs: dict[str,Any]
+            pl_kwargs = {}
+            if use_columns:
+                pl_kwargs['columns'] = use_columns
+
+            if all_strings:
+                pl_kwargs['infer_schema'] = False
+            else:
+                pl_kwargs['infer_schema_length'] = 1000
+
+            df = pl.read_csv(path, **pl_kwargs)
+
         else:
-
-
+            _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+            raise ValueError()

-
-        _LOGGER.error(f"
-        raise
+    except (ValueError, pl.exceptions.ColumnNotFoundError) as e:
+        _LOGGER.error(f"Failed to load '{df_name}'. A specified column may not exist in the file.")
+        raise e

     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
@@ -109,7 +132,6 @@ def load_dataframe(

     return df, df_name # type: ignore

-
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
|
|
|
560
582
|
yield (df_features, df_target, feature_names, target_col)
|
|
561
583
|
|
|
562
584
|
|
|
585
|
+
def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, verbose: bool=False) -> list[dict[str,Any]]:
|
|
586
|
+
"""
|
|
587
|
+
Scans subdirectories to find paths to model weights, target names, feature names, and model architecture. Optionally an scaler path if `load_scaler` is True.
|
|
588
|
+
|
|
589
|
+
This function operates on a specific directory structure. It expects the
|
|
590
|
+
`target_directory` to contain one or more subdirectories, where each
|
|
591
|
+
subdirectory represents a single trained model result.
|
|
592
|
+
|
|
593
|
+
The expected directory structure for each model is as follows:
|
|
594
|
+
```
|
|
595
|
+
target_directory
|
|
596
|
+
├── model_1
|
|
597
|
+
│ ├── *.pth
|
|
598
|
+
│ ├── scaler_*.pth (Required if `load_scaler` is True)
|
|
599
|
+
│ ├── feature_names.txt
|
|
600
|
+
│ ├── target_names.txt
|
|
601
|
+
│ └── architecture.json
|
|
602
|
+
└── model_2/
|
|
603
|
+
└── ...
|
|
604
|
+
```
|
|
605
|
+
|
|
606
|
+
Args:
|
|
607
|
+
target_directory (str | Path): The path to the root directory that contains model subdirectories.
|
|
608
|
+
load_scaler (bool): If True, the function requires and searches for a scaler file (`.pth`) in each model subdirectory.
|
|
609
|
+
verbose (bool): If True, enables detailed logging during the file paths search process.
|
|
610
|
+
|
|
611
|
+
Returns:
|
|
612
|
+
(list[dict[str, Path]]): A list of dictionaries, where each dictionary
|
|
613
|
+
corresponds to a model found in a subdirectory. The dictionary
|
|
614
|
+
maps standardized keys to the absolute paths of the model's
|
|
615
|
+
artifacts (weights, architecture, features, targets, and scaler).
|
|
616
|
+
The scaler path will be `None` if `load_scaler` is False.
|
|
617
|
+
"""
|
|
618
|
+
# validate directory
|
|
619
|
+
root_path = make_fullpath(target_directory, enforce="directory")
|
|
620
|
+
|
|
621
|
+
# store results
|
|
622
|
+
all_artifacts: list[dict] = list()
|
|
623
|
+
|
|
624
|
+
# find model directories
|
|
625
|
+
result_dirs_dict = list_subdirectories(root_dir=root_path, verbose=verbose)
|
|
626
|
+
for dir_name, dir_path in result_dirs_dict.items():
|
|
627
|
+
# find files
|
|
628
|
+
model_pth_dict = list_files_by_extension(directory=dir_path, extension="pth", verbose=verbose)
|
|
629
|
+
|
|
630
|
+
# restriction
|
|
631
|
+
if load_scaler:
|
|
632
|
+
if len(model_pth_dict) != 2:
|
|
633
|
+
_LOGGER.error(f"Directory {dir_path} should contain exactly 2 '.pth' files: scaler and weights.")
|
|
634
|
+
raise IOError()
|
|
635
|
+
else:
|
|
636
|
+
if len(model_pth_dict) != 1:
|
|
637
|
+
_LOGGER.error(f"Directory {dir_path} should contain exactly 1 '.pth' file: weights.")
|
|
638
|
+
raise IOError()
|
|
639
|
+
|
|
640
|
+
##### Scaler and Weights #####
|
|
641
|
+
scaler_path = None
|
|
642
|
+
weights_path = None
|
|
643
|
+
|
|
644
|
+
# load weights and scaler if present
|
|
645
|
+
for pth_filename, pth_path in model_pth_dict.items():
|
|
646
|
+
if load_scaler and pth_filename.lower().startswith(DatasetKeys.SCALER_PREFIX):
|
|
647
|
+
scaler_path = pth_path
|
|
648
|
+
else:
|
|
649
|
+
weights_path = pth_path
|
|
650
|
+
|
|
651
|
+
# validation
|
|
652
|
+
if not weights_path:
|
|
653
|
+
_LOGGER.error(f"Error parsing the model weights path from '{dir_name}'")
|
|
654
|
+
raise IOError()
|
|
655
|
+
|
|
656
|
+
if load_scaler and not scaler_path:
|
|
657
|
+
_LOGGER.error(f"Error parsing the scaler path from '{dir_name}'")
|
|
658
|
+
raise IOError()
|
|
659
|
+
|
|
660
|
+
##### Target and Feature names #####
|
|
661
|
+
target_names_path = None
|
|
662
|
+
feature_names_path = None
|
|
663
|
+
|
|
664
|
+
# load feature and target names
|
|
665
|
+
model_txt_dict = list_files_by_extension(directory=dir_path, extension="txt", verbose=verbose)
|
|
666
|
+
|
|
667
|
+
for txt_filename, txt_path in model_txt_dict.items():
|
|
668
|
+
if txt_filename == DatasetKeys.FEATURE_NAMES:
|
|
669
|
+
feature_names_path = txt_path
|
|
670
|
+
elif txt_filename == DatasetKeys.TARGET_NAMES:
|
|
671
|
+
target_names_path = txt_path
|
|
672
|
+
|
|
673
|
+
# validation
|
|
674
|
+
if not target_names_path or not feature_names_path:
|
|
675
|
+
_LOGGER.error(f"Error parsing features path or targets path from '{dir_name}'")
|
|
676
|
+
raise IOError()
|
|
677
|
+
|
|
678
|
+
##### load model architecture path #####
|
|
679
|
+
architecture_path = None
|
|
680
|
+
|
|
681
|
+
model_json_dict = list_files_by_extension(directory=dir_path, extension="json", verbose=verbose)
|
|
682
|
+
|
|
683
|
+
for json_filename, json_path in model_json_dict.items():
|
|
684
|
+
if json_filename == PytorchModelArchitectureKeys.SAVENAME:
|
|
685
|
+
architecture_path = json_path
|
|
686
|
+
|
|
687
|
+
# validation
|
|
688
|
+
if not architecture_path:
|
|
689
|
+
_LOGGER.error(f"Error parsing the model architecture path from '{dir_name}'")
|
|
690
|
+
raise IOError()
|
|
691
|
+
|
|
692
|
+
##### Paths dictionary #####
|
|
693
|
+
parsing_dict = {
|
|
694
|
+
PytorchArtifactPathKeys.WEIGHTS_PATH: weights_path,
|
|
695
|
+
PytorchArtifactPathKeys.ARCHITECTURE_PATH: architecture_path,
|
|
696
|
+
PytorchArtifactPathKeys.FEATURES_PATH: feature_names_path,
|
|
697
|
+
PytorchArtifactPathKeys.TARGETS_PATH: target_names_path,
|
|
698
|
+
PytorchArtifactPathKeys.SCALER_PATH: scaler_path
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
all_artifacts.append(parsing_dict)
|
|
702
|
+
|
|
703
|
+
return all_artifacts
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def select_features_by_shap(
|
|
707
|
+
root_directory: Union[str, Path],
|
|
708
|
+
shap_threshold: float = 1.0,
|
|
709
|
+
verbose: bool = True) -> list[str]:
|
|
710
|
+
"""
|
|
711
|
+
Scans subdirectories to find SHAP summary CSVs, then extracts feature
|
|
712
|
+
names whose mean absolute SHAP value meets a specified threshold.
|
|
713
|
+
|
|
714
|
+
This function is useful for automated feature selection based on feature
|
|
715
|
+
importance scores aggregated from multiple models.
|
|
716
|
+
|
|
717
|
+
Args:
|
|
718
|
+
root_directory (Union[str, Path]):
|
|
719
|
+
The path to the root directory that contains model subdirectories.
|
|
720
|
+
shap_threshold (float):
|
|
721
|
+
The minimum mean absolute SHAP value for a feature to be included
|
|
722
|
+
in the final list.
|
|
723
|
+
|
|
724
|
+
Returns:
|
|
725
|
+
list[str]:
|
|
726
|
+
A single, sorted list of unique feature names that meet the
|
|
727
|
+
threshold criteria across all found files.
|
|
728
|
+
"""
|
|
729
|
+
if verbose:
|
|
730
|
+
_LOGGER.info(f"Starting feature selection with SHAP threshold >= {shap_threshold}")
|
|
731
|
+
root_path = make_fullpath(root_directory, enforce="directory")
|
|
732
|
+
|
|
733
|
+
# --- Step 2: Directory and File Discovery ---
|
|
734
|
+
subdirectories = list_subdirectories(root_dir=root_path, verbose=False)
|
|
735
|
+
|
|
736
|
+
shap_filename = SHAPKeys.SAVENAME + ".csv"
|
|
737
|
+
|
|
738
|
+
valid_csv_paths = []
|
|
739
|
+
for dir_name, dir_path in subdirectories.items():
|
|
740
|
+
expected_path = dir_path / shap_filename
|
|
741
|
+
if expected_path.is_file():
|
|
742
|
+
valid_csv_paths.append(expected_path)
|
|
743
|
+
else:
|
|
744
|
+
_LOGGER.warning(f"No '{shap_filename}' found in subdirectory '{dir_name}'.")
|
|
745
|
+
|
|
746
|
+
if not valid_csv_paths:
|
|
747
|
+
_LOGGER.error(f"Process halted: No '{shap_filename}' files were found in any subdirectory.")
|
|
748
|
+
return []
|
|
749
|
+
|
|
750
|
+
if verbose:
|
|
751
|
+
_LOGGER.info(f"Found {len(valid_csv_paths)} SHAP summary files to process.")
|
|
752
|
+
|
|
753
|
+
# --- Step 3: Data Processing and Feature Extraction ---
|
|
754
|
+
master_feature_set = set()
|
|
755
|
+
for csv_path in valid_csv_paths:
|
|
756
|
+
try:
|
|
757
|
+
df, _ = load_dataframe(csv_path, kind="pandas", verbose=False)
|
|
758
|
+
|
|
759
|
+
# Validate required columns
|
|
760
|
+
required_cols = {SHAPKeys.FEATURE_COLUMN, SHAPKeys.SHAP_VALUE_COLUMN}
|
|
761
|
+
if not required_cols.issubset(df.columns):
|
|
762
|
+
_LOGGER.warning(f"Skipping '{csv_path}': missing required columns.")
|
|
763
|
+
continue
|
|
764
|
+
|
|
765
|
+
# Filter by threshold and extract features
|
|
766
|
+
filtered_df = df[df[SHAPKeys.SHAP_VALUE_COLUMN] >= shap_threshold]
|
|
767
|
+
features = filtered_df[SHAPKeys.FEATURE_COLUMN].tolist()
|
|
768
|
+
master_feature_set.update(features)
|
|
769
|
+
|
|
770
|
+
except (ValueError, pd.errors.EmptyDataError):
|
|
771
|
+
_LOGGER.warning(f"Skipping '{csv_path}' because it is empty or malformed.")
|
|
772
|
+
continue
|
|
773
|
+
except Exception as e:
|
|
774
|
+
_LOGGER.error(f"An unexpected error occurred while processing '{csv_path}': {e}")
|
|
775
|
+
continue
|
|
776
|
+
|
|
777
|
+
# --- Step 4: Finalize and Return ---
|
|
778
|
+
final_features = sorted(list(master_feature_set))
|
|
779
|
+
if verbose:
|
|
780
|
+
_LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
|
|
781
|
+
|
|
782
|
+
return final_features
|
|
783
|
+
|
|
784
|
+
|
|
563
785
|
def info():
|
|
564
786
|
_script_info(__all__)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|