PyPI - dragon-ml-toolbox - Versions diffs - 13.3.2__tar.gz → 13.5.0__tar.gz - Mend

dragon-ml-toolbox 13.3.2 (tar.gz) → 13.5.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (46)

{dragon_ml_toolbox-13.3.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-13.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.3.2
+Version: 13.5.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.3.2
+Version: 13.5.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0}/ml_tools/ML_datasetmaster.py RENAMED Viewed

@@ -126,8 +126,8 @@ class _BaseDatasetMaker(ABC):
         else:
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
-        X_train_values = X_train.values
-        X_test_values = X_test.values
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()
         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
@@ -253,26 +253,42 @@ class DatasetMaker(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
                 The pre-processed input DataFrame containing all columns. (features and single target).
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            kind (Literal["regression", "classification"]):
+            kind ("regression" | "classification"):
                 The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number of generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance, if None a new scaler will be created.
         """
         super().__init__()
-        self.scaler = scaler
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
         # --- 1. Identify features (from schema) ---
         self._feature_names = list(schema.feature_names)
@@ -310,9 +326,14 @@ class DatasetMaker(_BaseDatasetMaker):
         label_dtype = torch.float32 if kind == "regression" else torch.int64
         # --- 4. Scale (using the schema) ---
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, schema
-        )
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
@@ -336,9 +357,9 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
@@ -348,20 +369,35 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                 List of target column names.
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance.
         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
         This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-        self.scaler = scaler
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
         # --- 1. Get features and targets from schema/args ---
         self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
@@ -403,9 +439,14 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         label_dtype = torch.float32
         # --- 4. Scale (using the schema) ---
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, schema
-        )
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0}/ml_tools/ML_evaluation.py RENAMED Viewed

@@ -258,7 +258,7 @@ def shap_summary_plot(model,
                       feature_names: Optional[list[str]],
                       save_dir: Union[str, Path],
                       device: torch.device = torch.device('cpu'),
-                      explainer_type: Literal['deep', 'kernel'] = 'deep'):
+                      explainer_type: Literal['deep', 'kernel'] = 'kernel'):
     """
     Calculates SHAP values and saves summary plots and data.
@@ -270,7 +270,7 @@ def shap_summary_plot(model,
         save_dir (str | Path): Directory to save SHAP artifacts.
         device (torch.device): The torch device for SHAP calculations.
         explainer_type (Literal['deep', 'kernel']): The explainer to use.
-            - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient for
+            - 'deep': Uses shap.DeepExplainer. Fast and efficient for
               PyTorch models.
             - 'kernel': Uses shap.KernelExplainer. Model-agnostic but EXTREMELY
               slow and memory-intensive.
@@ -285,7 +285,7 @@ def shap_summary_plot(model,
     instances_to_explain_np = None
     if explainer_type == 'deep':
-        # --- 1. Use DeepExplainer (Preferred) ---
+        # --- 1. Use DeepExplainer  ---
         # Ensure data is torch.Tensor
         if isinstance(background_data, np.ndarray):
@@ -309,10 +309,9 @@ def shap_summary_plot(model,
         instances_to_explain_np = instances_to_explain.cpu().numpy()
     elif explainer_type == 'kernel':
-        # --- 2. Use KernelExplainer (Slow Fallback) ---
+        # --- 2. Use KernelExplainer ---
         _LOGGER.warning(
-            "Using KernelExplainer. This is memory-intensive and slow. "
-            "Consider reducing 'n_samples' if the process terminates unexpectedly."
+            "KernelExplainer is memory-intensive and slow. Consider reducing the number of instances to explain if the process terminates unexpectedly."
         )
         # Ensure data is np.ndarray
@@ -348,14 +347,26 @@ def shap_summary_plot(model,
     else:
         _LOGGER.error(f"Invalid explainer_type: '{explainer_type}'. Must be 'deep' or 'kernel'.")
         raise ValueError()
+    if not isinstance(shap_values, list) and shap_values.ndim == 3 and shap_values.shape[2] == 1:
+        # _LOGGER.info("Squeezing SHAP values from (N, F, 1) to (N, F) for regression plot.")
+        shap_values = shap_values.squeeze(-1)
     # --- 3. Plotting and Saving ---
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
     plt.ioff()
+    # Convert instances to a DataFrame. robust way to ensure SHAP correctly maps values to feature names.
+    if feature_names is None:
+        # Create generic names if none were provided
+        num_features = instances_to_explain_np.shape[1]
+        feature_names = [f'feature_{i}' for i in range(num_features)]
+    instances_df = pd.DataFrame(instances_to_explain_np, columns=feature_names)
     # Save Bar Plot
     bar_path = save_dir_path / "shap_bar_plot.svg"
-    shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
+    shap.summary_plot(shap_values, instances_df, plot_type="bar", show=False)
     ax = plt.gca()
     ax.set_xlabel("SHAP Value Impact", labelpad=10)
     plt.title("SHAP Feature Importance")
@@ -366,7 +377,7 @@ def shap_summary_plot(model,
     # Save Dot Plot
     dot_path = save_dir_path / "shap_dot_plot.svg"
-    shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
+    shap.summary_plot(shap_values, instances_df, plot_type="dot", show=False)
     ax = plt.gca()
     ax.set_xlabel("SHAP Value Impact", labelpad=10)
     if plt.gcf().axes and len(plt.gcf().axes) > 1:
@@ -389,9 +400,6 @@ def shap_summary_plot(model,
         mean_abs_shap = np.abs(shap_values).mean(axis=0)
     mean_abs_shap = mean_abs_shap.flatten()
-    if feature_names is None:
-        feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
     summary_df = pd.DataFrame({
         SHAPKeys.FEATURE_COLUMN: feature_names,
@@ -401,7 +409,7 @@ def shap_summary_plot(model,
     summary_df.to_csv(summary_path, index=False)
     _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
-    plt.ion()
+    plt.ion()
 def plot_attention_importance(weights: List[torch.Tensor], feature_names: Optional[List[str]], save_dir: Union[str, Path], top_n: int = 10):

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0}/ml_tools/ML_evaluation_multi.py RENAMED Viewed

@@ -235,7 +235,7 @@ def multi_target_shap_summary_plot(
     target_names: List[str],
     save_dir: Union[str, Path],
     device: torch.device = torch.device('cpu'),
-    explainer_type: Literal['deep', 'kernel'] = 'deep'
+    explainer_type: Literal['deep', 'kernel'] = 'kernel'
 ):
     """
     Calculates SHAP values for a multi-target model and saves summary plots and data for each target.
@@ -249,7 +249,7 @@ def multi_target_shap_summary_plot(
         save_dir (str | Path): Directory to save SHAP artifacts.
         device (torch.device): The torch device for SHAP calculations.
         explainer_type (Literal['deep', 'kernel']): The explainer to use.
-            - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient.
+            - 'deep': Uses shap.DeepExplainer. Fast and efficient.
             - 'kernel': Uses shap.KernelExplainer. Model-agnostic but slow and memory-intensive.
     """
     _LOGGER.info(f"--- Multi-Target SHAP Value Explanation (Using: {explainer_type.upper()}Explainer) ---")
@@ -260,7 +260,7 @@ def multi_target_shap_summary_plot(
     instances_to_explain_np = None
     if explainer_type == 'deep':
-        # --- 1. Use DeepExplainer (Preferred) ---
+        # --- 1. Use DeepExplainer ---
         # Ensure data is torch.Tensor
         if isinstance(background_data, np.ndarray):
@@ -285,10 +285,9 @@ def multi_target_shap_summary_plot(
         instances_to_explain_np = instances_to_explain.cpu().numpy()
     elif explainer_type == 'kernel':
-        # --- 2. Use KernelExplainer (Slow Fallback) ---
+        # --- 2. Use KernelExplainer  ---
         _LOGGER.warning(
-            "Using KernelExplainer. This is memory-intensive and slow. "
-            "Consider reducing 'n_samples' if the process terminates."
+            "KernelExplainer is memory-intensive and slow. Consider reducing the number of instances to explain if the process terminates unexpectedly."
         )
         # Convert all data to numpy

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0}/ml_tools/ML_trainer.py RENAMED Viewed

@@ -9,7 +9,7 @@ from .ML_callbacks import Callback, History, TqdmProgressBar, ModelCheckpoint
 from .ML_evaluation import classification_metrics, regression_metrics, plot_losses, shap_summary_plot, plot_attention_importance
 from .ML_evaluation_multi import multi_target_regression_metrics, multi_label_classification_metrics, multi_target_shap_summary_plot
 from ._script_info import _script_info
-from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys, DatasetKeys
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
@@ -408,7 +408,7 @@ class MLTrainer:
                 n_samples: int = 300,
                 feature_names: Optional[List[str]] = None,
                 target_names: Optional[List[str]] = None,
-                explainer_type: Literal['deep', 'kernel'] = 'deep'):
+                explainer_type: Literal['deep', 'kernel'] = 'kernel'):
         """
         Explains model predictions using SHAP and saves all artifacts.
@@ -422,11 +422,11 @@ class MLTrainer:
             explain_dataset (Dataset | None): A specific dataset to explain.
                                                  If None, the trainer's test dataset is used.
             n_samples (int): The number of samples to use for both background and explanation.
-            feature_names (list[str] | None): Feature names.
+            feature_names (list[str] | None): Feature names. If None, the names will be extracted from the Dataset and raise an error on failure.
             target_names (list[str] | None): Target names for multi-target tasks.
             save_dir (str | Path): Directory to save all SHAP artifacts.
             explainer_type (Literal['deep', 'kernel']): The explainer to use.
-                - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient for PyTorch models.
+                - 'deep': Uses shap.DeepExplainer. Fast and efficient for PyTorch models.
                 - 'kernel': Uses shap.KernelExplainer. Model-agnostic but EXTREMELY slow and memory-intensive. Use with a very low 'n_samples'< 100.
         """
         # Internal helper to create a dataloader and get a random sample
@@ -474,10 +474,10 @@ class MLTrainer:
         # attempt to get feature names
         if feature_names is None:
             # _LOGGER.info("`feature_names` not provided. Attempting to extract from dataset...")
-            if hasattr(target_dataset, "feature_names"):
+            if hasattr(target_dataset, DatasetKeys.FEATURE_NAMES):
                 feature_names = target_dataset.feature_names # type: ignore
             else:
-                _LOGGER.error("Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a `feature_names` attribute.")
+                _LOGGER.error(f"Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a '{DatasetKeys.FEATURE_NAMES}' attribute.")
                 raise ValueError()
         # move model to device
@@ -498,7 +498,7 @@ class MLTrainer:
             # try to get target names
             if target_names is None:
                 target_names = []
-                if hasattr(target_dataset, 'target_names'):
+                if hasattr(target_dataset, DatasetKeys.TARGET_NAMES):
                     target_names = target_dataset.target_names # type: ignore
                 else:
                     # Infer number of targets from the model's output layer
@@ -549,7 +549,7 @@ class MLTrainer:
                 yield attention_weights
     def explain_attention(self, save_dir: Union[str, Path],
-                          feature_names: Optional[List[str]],
+                          feature_names: Optional[List[str]] = None,
                           explain_dataset: Optional[Dataset] = None,
                           plot_n_features: int = 10):
         """
@@ -559,7 +559,7 @@ class MLTrainer:
         Args:
             save_dir (str | Path): Directory to save the plot and summary data.
-            feature_names (List[str] | None): Names for the features for plot labeling. If not given, generic names will be used.
+            feature_names (List[str] | None): Names for the features for plot labeling. If None, the names will be extracted from the Dataset and raise an error on failure.
             explain_dataset (Dataset, optional): A specific dataset to explain. If None, the trainer's test dataset is used.
             plot_n_features (int): Number of top features to plot.
         """
@@ -580,6 +580,14 @@ class MLTrainer:
             _LOGGER.error("The explanation dataset is empty or invalid. Skipping attention analysis.")
             return
+        # Get feature names
+        if feature_names is None:
+            if hasattr(dataset_to_use, DatasetKeys.FEATURE_NAMES):
+                feature_names = dataset_to_use.feature_names # type: ignore
+            else:
+                _LOGGER.error(f"Could not extract `feature_names` from the dataset for attention plot. It must be provided if the dataset object does not have a '{DatasetKeys.FEATURE_NAMES}' attribute.")
+                raise ValueError()
         explain_loader = DataLoader(
             dataset=dataset_to_use, batch_size=32, shuffle=False,
             num_workers=0 if self.device.type == 'mps' else self.dataloader_workers,

{dragon_ml_toolbox-13.3.2 → dragon_ml_toolbox-13.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "13.3.2"
+version = "13.5.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }