dragon-ml-toolbox 12.13.0__py3-none-any.whl → 13.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


dragon_ml_toolbox-13.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.13.0
+Version: 13.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-13.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,41 @@
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
+ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
+ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
+ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
+ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
+ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
+ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
+ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
+ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
+ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
+ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
+ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
+ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
+ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
+ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
+ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
+ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
+ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
+ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
+ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
+ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
+ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
+ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
+ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
+ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
+ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
ml_tools/ML_callbacks.py CHANGED
@@ -5,7 +5,7 @@ from typing import Union, Literal, Optional
 from pathlib import Path
 
 from .path_manager import make_fullpath, sanitize_filename
-from .keys import PyTorchLogKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
 from ._logger import _LOGGER
 from ._script_info import _script_info
 
@@ -189,7 +189,7 @@ class EarlyStopping(Callback):
 
 class ModelCheckpoint(Callback):
     """
-    Saves the model weights to a directory with automated filename generation and rotation.
+    Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
     """
     def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
                  save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
@@ -200,7 +200,7 @@ class ModelCheckpoint(Callback):
         Args:
             save_dir (str): Directory where checkpoint files will be saved.
             checkpoint_name (str| None): If None, the filename will include the epoch and score.
-            monitor (str): Metric to monitor for `save_best_only=True`.
+            monitor (str): Metric to monitor.
             save_best_only (bool): If true, save only the best model.
             mode (str): One of {'auto', 'min', 'max'}.
             verbose (int): Verbosity mode.
@@ -270,15 +270,29 @@ class ModelCheckpoint(Callback):
             if self.verbose > 0:
                 _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
 
+            # Update best score *before* saving
+            self.best = current
+
+            # Create a comprehensive checkpoint dictionary
+            checkpoint_data = {
+                PyTorchCheckpointKeys.EPOCH: epoch,
+                PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.BEST_SCORE: self.best,
+            }
+
+            # Check for scheduler
+            if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+                checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
             # Save the new best model
-            torch.save(self.trainer.model.state_dict(), new_filepath) # type: ignore
+            torch.save(checkpoint_data, new_filepath)
 
             # Delete the old best model file
             if self.last_best_filepath and self.last_best_filepath.exists():
                 self.last_best_filepath.unlink()
 
             # Update state
-            self.best = current
             self.last_best_filepath = new_filepath
 
     def _save_rolling_checkpoints(self, epoch, logs):
@@ -292,7 +306,19 @@ class ModelCheckpoint(Callback):
 
         if self.verbose > 0:
             _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
-        torch.save(self.trainer.model.state_dict(), filepath) # type: ignore
+
+        # Create a comprehensive checkpoint dictionary
+        checkpoint_data = {
+            PyTorchCheckpointKeys.EPOCH: epoch,
+            PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
+        }
+
+        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
+        torch.save(checkpoint_data, filepath)
 
         self.saved_checkpoints.append(filepath)
 
@@ -309,19 +335,25 @@ class LRScheduler(Callback):
     """
    Callback to manage a PyTorch learning rate scheduler.
     """
-    def __init__(self, scheduler, monitor: Optional[str] = None):
+    def __init__(self, scheduler, monitor: Optional[str] = PyTorchLogKeys.VAL_LOSS):
         """
         This callback automatically calls the scheduler's `step()` method at the
         end of each epoch. It also logs a message when the learning rate changes.
 
         Args:
             scheduler: An initialized PyTorch learning rate scheduler.
-            monitor (str, optional): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
+            monitor (str): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
         """
         super().__init__()
         self.scheduler = scheduler
         self.monitor = monitor
         self.previous_lr = None
+
+    def set_trainer(self, trainer):
+        """This is called by the Trainer to associate itself with the callback."""
+        super().set_trainer(trainer)
+        # Register the scheduler with the trainer so it can be added to the checkpoint
+        self.trainer.scheduler = self.scheduler # type: ignore
 
     def on_train_begin(self, logs=None):
         """Store the initial learning rate."""
ml_tools/ML_datasetmaster.py CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (not covered by the type hints)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
 
         self._feature_names = feature_names
         self._target_names = target_names
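
Context for this hunk: .to_numpy() is the pandas-recommended accessor (the older .values it replaces is soft-deprecated), and the new elif branch is what lets multi-target labels arrive as a DataFrame. A standalone illustration of the conversion, independent of the package:

import pandas as pd
import torch

labels_df = pd.DataFrame({"t1": [0.1, 0.7], "t2": [1.2, 0.3]})

# A 2-D DataFrame becomes an (N, n_targets) float tensor, ready for
# nn.MSELoss or nn.BCEWithLogitsLoss.
labels = torch.tensor(labels_df.to_numpy(), dtype=torch.float32)
print(labels.shape)  # torch.Size([2, 2])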
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-        """Internal helper to fit and apply a PytorchScaler."""
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
+
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
         X_train_values = X_train.values
         X_test_values = X_test.values
 
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler
 
-        # --- 1. Identify features and target (single-target logic) ---
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_names = [str(target.name)]
-        self._id = self._target_names[0]
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
 
-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # --- 3. Scale ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
        )
 
-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
 
-# --- New Multi-Target Class ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            target_columns (list[str]): List of target column names.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+        For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+        This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler
 
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-        features = pandas_df[self._feature_names]
-        target = pandas_df[self._target_names]
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
        )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-        label_dtype = torch.float32
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
        )
 
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
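A usage sketch of the schema-driven API introduced here. The FeatureSchema constructor call below is an assumption: this diff only shows that the object exposes feature_names and continuous_feature_names and normally comes from the data_exploration helpers, so mirror whatever those helpers actually produce.

import pandas as pd
from ml_tools._schema import FeatureSchema
from ml_tools.ML_datasetmaster import DatasetMaker

df = pd.DataFrame({
    "age": [25, 32, 47, 51, 38, 29],
    "income": [40_000, 52_000, 88_000, 61_000, 45_000, 73_000],
    "is_smoker": [0, 1, 0, 1, 0, 1],
    "label": [0, 1, 1, 0, 0, 1],  # the single non-feature column becomes the target
})

# Hypothetical construction; field names mirror the attributes used in the diff.
schema = FeatureSchema(
    feature_names=["age", "income", "is_smoker"],
    continuous_feature_names=["age", "income"],
)

# The target is inferred by set difference: exactly one column not in the schema.
maker = DatasetMaker(df, schema=schema, kind="classification")

DatasetMakerMulti is used the same way, with an explicit target_columns list that is validated against the schema.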
ml_tools/ML_evaluation.py CHANGED
@@ -19,6 +19,7 @@ import torch
 import shap
 from pathlib import Path
 from typing import Union, Optional, List, Literal
+import warnings
 
 from .path_manager import make_fullpath
 from ._logger import _LOGGER
@@ -298,8 +299,11 @@ def shap_summary_plot(model,
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
-
-    explainer = shap.DeepExplainer(model, background_data)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
     # print("Calculating SHAP values with DeepExplainer...")
     shap_values = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
ml_tools/ML_evaluation_multi.py CHANGED
@@ -20,6 +20,7 @@ from sklearn.metrics import (
 )
 from pathlib import Path
 from typing import Union, List, Literal
+import warnings
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
@@ -273,9 +274,12 @@ def multi_target_shap_summary_plot(
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
-
-    explainer = shap.DeepExplainer(model, background_data)
-    print("Calculating SHAP values with DeepExplainer...")
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
+    # print("Calculating SHAP values with DeepExplainer...")
     # DeepExplainer returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
@@ -304,7 +308,7 @@ def multi_target_shap_summary_plot(
             return output.cpu().numpy() # Return full multi-output array
 
     explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
-    print("Calculating SHAP values with KernelExplainer...")
+    # print("Calculating SHAP values with KernelExplainer...")
     # KernelExplainer also returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
     # instances_to_explain_np is already set
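
Both evaluation modules apply the same pattern: build shap.DeepExplainer under warnings.catch_warnings() so its UserWarning noise is silenced without mutating global warning filters. A self-contained illustration of the pattern; the noisy function stands in for the explainer construction:

import warnings

def build_explainer():
    # Stand-in for shap.DeepExplainer(model, background_data), which can
    # emit UserWarnings on some torch/shap version combinations.
    warnings.warn("deprecated internals", UserWarning)
    return object()

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    explainer = build_explainer()  # warning suppressed here

# Outside the block, the previous warning filters are restored.
warnings.warn("this one is still visible", UserWarning)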
ml_tools/ML_inference.py CHANGED
@@ -9,7 +9,7 @@ from .ML_scaler import PytorchScaler
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
-from .keys import PyTorchInferenceKeys
+from .keys import PyTorchInferenceKeys, PyTorchCheckpointKeys
 
 
 __all__ = [
@@ -56,11 +56,21 @@ class _BaseInferenceHandler(ABC):
         model_p = make_fullpath(state_dict, enforce="file")
 
         try:
-            # Load the state dictionary and apply it to the model structure
-            self.model.load_state_dict(torch.load(model_p, map_location=self.device))
+            # Load whatever is in the file
+            loaded_data = torch.load(model_p, map_location=self.device)
+
+            # Check if it's the new checkpoint dictionary or an old weights-only file
+            if isinstance(loaded_data, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded_data:
+                # It's a new training checkpoint, extract the weights
+                self.model.load_state_dict(loaded_data[PyTorchCheckpointKeys.MODEL_STATE])
+            else:
+                # It's an old-style file (or just a state_dict), load it directly
+                self.model.load_state_dict(loaded_data)
+
+            _LOGGER.info(f"Model state loaded from '{model_p.name}'.")
+
             self.model.to(self.device)
             self.model.eval() # Set the model to evaluation mode
-            _LOGGER.info(f"Model state loaded from '{model_p.name}' and set to evaluation mode.")
         except Exception as e:
             _LOGGER.error(f"Failed to load model state from '{model_p}': {e}")
             raise
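
The loader therefore accepts both formats: 13.1.0 training checkpoints and pre-13.x weights-only files. A condensed sketch of the same dispatch; load_weights is an illustrative name, not a function the package exports:

import torch
from torch import nn
from ml_tools.keys import PyTorchCheckpointKeys

def load_weights(model: nn.Module, path: str, device: str = "cpu") -> nn.Module:
    loaded = torch.load(path, map_location=device)
    # New-style training checkpoint: a dict carrying the weights under MODEL_STATE.
    if isinstance(loaded, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded:
        model.load_state_dict(loaded[PyTorchCheckpointKeys.MODEL_STATE])
    else:
        # Old-style file: a bare state_dict saved by pre-13.x ModelCheckpoint.
        model.load_state_dict(loaded)
    return model.to(device).eval()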