PyPI - dragon-ml-toolbox - Versions diffs - 7.0.0__py3-none-any.whl → 8.1.0__py3-none-any.whl - Mend

dragon-ml-toolbox 7.0.0__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (14)

{dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 7.0.0
+Version: 8.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -147,6 +147,7 @@ ensemble_learning
 ETL_engineering
 ML_callbacks
 ML_datasetmaster
+ML_evaluation_multi
 ML_evaluation
 ML_inference
 ML_models

{dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/RECORD RENAMED Viewed

@@ -1,25 +1,27 @@
-dragon_ml_toolbox-7.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-7.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+dragon_ml_toolbox-8.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-8.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
 ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
-ml_tools/ML_datasetmaster.py,sha256=dqda1E4ZEdSBwpRpdCO7pTLWcD6hzp-LzwreDRlNQdg,26452
-ml_tools/ML_evaluation.py,sha256=lTynGYqp3nAKc3kJDc7DDuWLYXW395zbWAZqTicQm9U,15949
-ml_tools/ML_inference.py,sha256=ctgVtpsnQw9OZ-lxIRih93u2JRHBq5zzVtaYOQrZ5eA,14465
-ml_tools/ML_models.py,sha256=NhlRHL_eoAQDih59SpoCmdNXLiLK6ZzS02tV_vYqko0,28159
+ml_tools/ML_datasetmaster.py,sha256=tN-GBPEwXRWFBT8r8K0v9b3Bd77DhqSH5FkjDP6BHTw,28847
+ml_tools/ML_evaluation.py,sha256=BER5dOvSTySNzO92gm8tIpqJ5vT-s0iHMmaoly1uUH8,16018
+ml_tools/ML_evaluation_multi.py,sha256=uVtKGYWgOLv34Xj_jz6E_HAYzNb0HwRbMwA8oFZWpUk,12395
+ml_tools/ML_inference.py,sha256=hwtAdyDCE1xtqLgJgyOTAPck0eTmkOCJK1cM_IJSdck,22824
+ml_tools/ML_models.py,sha256=xZiSFh7S6eitl-VjjvNpsikojDvurK8n_ueLEh6_5pM,27979
 ml_tools/ML_optimization.py,sha256=GX-qZ2mCI3gWRCTP5w7lXrZpfGle3J_mE0O68seIoio,13475
 ml_tools/ML_scaler.py,sha256=pGkp1nUpeuoBvbq5hUkieQdxex6kNef1mEbeS_HUCJs,7471
-ml_tools/ML_trainer.py,sha256=DrGF7fTbYNOa34v5lN__770F59ZJ_75t3vnW_8GRfwA,18255
+ml_tools/ML_trainer.py,sha256=6JSmEQaCPSo-S_5plNBTPw-SYgzZpyMNwiqpShJf7qU,23726
 ml_tools/PSO_optimization.py,sha256=9Y074d-B5h4Wvp9YPiy6KAeXM-Yv6Il3gWalKvOLVgo,22705
 ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
 ml_tools/SQL.py,sha256=bkSTmMV4CtEqa67hApYWaRxTqwAlKIc5_b28P1bnDwg,10475
 ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
+ml_tools/_ML_optimization_multi.py,sha256=DrNG3Vf1uUw-3CpYfXREgSGuR4dTpLWY1F3R9j-PYqQ,9816
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
-ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
+ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
 ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
 ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
 ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
@@ -28,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
-dragon_ml_toolbox-7.0.0.dist-info/METADATA,sha256=FrQMXquPof2JQ7Kt5B7q7yytVJ2rI8fDKFbytcvJ4nU,6758
-dragon_ml_toolbox-7.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-7.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-7.0.0.dist-info/RECORD,,
+dragon_ml_toolbox-8.1.0.dist-info/METADATA,sha256=qGTl4__H1ZsbyJHtExcDt14i8ziWXpEy2WaRAELPmTI,6778
+dragon_ml_toolbox-8.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-8.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-8.1.0.dist-info/RECORD,,

ml_tools/ML_datasetmaster.py CHANGED Viewed

@@ -18,6 +18,7 @@ from .ML_scaler import PytorchScaler
 __all__ = [
     "DatasetMaker",
+    "DatasetMakerMulti",
     "VisionDatasetMaker",
     "SequenceMaker",
     "ResizeAspectFill",
@@ -57,71 +58,26 @@ class _PytorchDataset(Dataset):
         return self.features[index], self.labels[index]
-# Streamlined DatasetMaker version
-class DatasetMaker:
+# --- Abstract Base Class (New) ---
+# --- Abstract Base Class (Corrected) ---
+class _BaseDatasetMaker(ABC):
     """
-    A simplified dataset maker for pre-processed, numerical pandas DataFrames.
-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
-    Attributes:
-        `scaler` -> PytorchScaler | None
-        `train_dataset` -> PyTorch Dataset
-        `test_dataset`  -> PyTorch Dataset
-        `feature_names` -> list[str]
-        `target_name`   -> str
-        `id` -> str | None
-    The ID can be manually set to any string if needed, it is `None` by default.
+    Abstract base class for dataset makers. Contains shared logic for
+    splitting, scaling, and accessing datasets to reduce code duplication.
     """
-    def __init__(self,
-                 pandas_df: pandas.DataFrame,
-                 kind: Literal["regression", "classification"],
-                 test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
-        """
-        Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
-        """
-        # Validation
-        if not isinstance(pandas_df, pandas.DataFrame):
-            raise TypeError("Input must be a pandas.DataFrame.")
-        if kind not in ["regression", "classification"]:
-            raise ValueError("`kind` must be 'regression' or 'classification'.")
-        # 1. Identify features and target
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_name = str(target.name)
-        #set id
+    def __init__(self):
+        self._train_ds: Optional[Dataset] = None
+        self._test_ds: Optional[Dataset] = None
+        self.scaler: Optional[PytorchScaler] = None
         self._id: Optional[str] = None
-        # set scaler
-        self.scaler = scaler
-        # 2. Split the data
-        X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
-        )
-        self._X_train_shape = X_train.shape
-        self._X_test_shape = X_test.shape
-        self._y_train_shape = y_train.shape
-        self._y_test_shape = y_test.shape
-        # 3. Handle Column to Index Conversion
+        self._feature_names: List[str] = []
+        self._X_train_shape = (0,0)
+        self._X_test_shape = (0,0)
+        self._y_train_shape = (0,)
+        self._y_test_shape = (0,)
+    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
+        """Internal helper to fit and apply a PytorchScaler."""
         continuous_feature_indices: Optional[List[int]] = None
         if continuous_feature_columns:
             if all(isinstance(c, str) for c in continuous_feature_columns):
@@ -129,108 +85,201 @@ class DatasetMaker:
                 try:
                     continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
                 except KeyError as e:
-                    raise ValueError(f"Feature column '{e.args[0]}' not found in DataFrame.")
+                    raise ValueError(f"Feature column '{e.args[0]}' not found.")
             elif all(isinstance(c, int) for c in continuous_feature_columns):
                 continuous_feature_indices = continuous_feature_columns # type: ignore
             else:
                 raise TypeError("`continuous_feature_columns` must be a list of all strings or all integers.")
-        # 4. Handle Scaling
         X_train_values = X_train.values
         X_test_values = X_test.values
-        # If no scaler is provided, fit a new one from the training data
-        if self.scaler is None:
-            if continuous_feature_indices:
-                _LOGGER.info("Feature indices provided. Fitting a new PytorchScaler on training data.")
-                # A temporary dataset is needed for the PytorchScaler.fit method
-                temp_label_dtype = torch.float32 if kind == "regression" else torch.int64
-                temp_train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=temp_label_dtype)
-                self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
-        # If a scaler exists (either passed in or just fitted), apply it
+        if self.scaler is None and continuous_feature_indices:
+            _LOGGER.info("Fitting a new PytorchScaler on training data.")
+            temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
+            self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
         if self.scaler and self.scaler.mean_ is not None:
             _LOGGER.info("Applying scaler transformation to train and test feature sets.")
             X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
             X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-            # Convert back to numpy for the _PytorchDataset class
-            X_train_values = X_train_tensor.numpy()
-            X_test_values = X_test_tensor.numpy()
+            return X_train_tensor.numpy(), X_test_tensor.numpy()
-        # 5. Convert to final PyTorch Datasets
-        label_dtype = torch.float32 if kind == "regression" else torch.int64
-        self._train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=label_dtype)
-        self._test_ds = _PytorchDataset(X_test_values, y_test.values, labels_dtype=label_dtype)
+        return X_train_values, X_test_values
     @property
     def train_dataset(self) -> Dataset:
-        """Returns the training PyTorch dataset."""
+        if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
         return self._train_ds
     @property
     def test_dataset(self) -> Dataset:
-        """Returns the testing PyTorch dataset."""
+        if self._test_ds is None: raise RuntimeError("Dataset not yet created.")
         return self._test_ds
     @property
     def feature_names(self) -> list[str]:
-        """Returns the list of feature column names."""
         return self._feature_names
-    @property
-    def target_name(self) -> str:
-        """Returns the name of the target column."""
-        return self._target_name
     @property
     def id(self) -> Optional[str]:
-        """Returns the object identifier if any."""
         return self._id
     @id.setter
     def id(self, dataset_id: str):
-        """Sets the ID value"""
-        if not isinstance(dataset_id, str):
-            raise ValueError(f"Dataset ID '{type(dataset_id)}' is not a string.")
+        if not isinstance(dataset_id, str): raise ValueError("ID must be a string.")
         self._id = dataset_id
     def dataframes_info(self) -> None:
-        """Prints the shape information of the split pandas DataFrames."""
-        print("--- Original DataFrame Shapes After Split ---")
-        print(f"  X_train shape: {self._X_train_shape}")
-        print(f"  y_train shape: {self._y_train_shape}\n")
-        print(f"  X_test shape:  {self._X_test_shape}")
-        print(f"  y_test shape:  {self._y_test_shape}")
-        print("-------------------------------------------")
+        print("--- DataFrame Shapes After Split ---")
+        print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+        print(f"  X_test shape:  {self._X_test_shape}, y_test shape:  {self._y_test_shape}")
+        print("------------------------------------")
     def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """Saves a list of feature names as a text file"""
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
                           filename="feature_names",
-                          verbose=verbose)
+                          verbose=verbose)
     def save_scaler(self, save_dir: Union[str, Path]):
         """
         Saves the fitted PytorchScaler's state to a .pth file.
-        The filename is automatically generated based on the target name.
+        The filename is automatically generated based on the dataset id.
         Args:
             save_dir (str | Path): The directory where the scaler will be saved.
         """
-        if not self.scaler:
-            _LOGGER.error("❌ No scaler was fitted or provided.")
-            return
+        if not self.scaler: raise RuntimeError("No scaler was fitted or provided.")
+        if not self.id: raise ValueError("Must set the `id` before saving scaler.")
         save_path = make_fullpath(save_dir, make=True, enforce="directory")
-        # Sanitize the target name for use in a filename
-        sanitized_target = sanitize_filename(self.target_name)
-        filename = f"scaler_{sanitized_target}.pth"
+        sanitized_id = sanitize_filename(self.id)
+        filename = f"scaler_{sanitized_id}.pth"
         filepath = save_path / filename
         self.scaler.save(filepath)
+        _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
+# Single target dataset
+class DatasetMaker(_BaseDatasetMaker):
+    """
+    Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
+    This class takes a DataFrame, automatically splits it into training and
+    testing sets, and converts them into PyTorch Datasets. It assumes the
+    target variable is the last column. It can also create, apply, and
+    save a PytorchScaler for standardizing continuous features.
+    Attributes:
+        `scaler` -> PytorchScaler | None
+        `train_dataset` -> PyTorch Dataset
+        `test_dataset`  -> PyTorch Dataset
+        `feature_names` -> list[str]
+        `target_name`   -> str
+        `id` -> str
+    The ID can be manually set to any string if needed, it is the target name by default.
+    """
+    def __init__(self,
+                 pandas_df: pandas.DataFrame,
+                 kind: Literal["regression", "classification"],
+                 test_size: float = 0.2,
+                 random_state: int = 42,
+                 scaler: Optional[PytorchScaler] = None,
+                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+        """
+        Args:
+            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
+            test_size (float): The proportion of the dataset to allocate to the test split.
+            random_state (int): The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+        """
+        super().__init__()
+        self.scaler = scaler
+        # --- 1. Identify features and target (single-target logic) ---
+        features = pandas_df.iloc[:, :-1]
+        target = pandas_df.iloc[:, -1]
+        self._feature_names = features.columns.tolist()
+        self._target_name = str(target.name)
+        self._id = self._target_name
+        # --- 2. Split ---
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+        label_dtype = torch.float32 if kind == "regression" else torch.int64
+        # --- 3. Scale ---
+        X_train_final, X_test_final = self._prepare_scaler(
+            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+        )
+        # --- 4. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
+        self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
+    @property
+    def target_name(self) -> str:
+        return self._target_name
+# --- New Multi-Target Class ---
+class DatasetMakerMulti(_BaseDatasetMaker):
+    """
+    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    """
+    def __init__(self,
+                 pandas_df: pandas.DataFrame,
+                 target_columns: List[str],
+                 test_size: float = 0.2,
+                 random_state: int = 42,
+                 scaler: Optional[PytorchScaler] = None,
+                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+        """
+        Args:
+            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+            target_columns (list[str]): List of target column names.
+            test_size (float): The proportion of the dataset to allocate to the test split.
+            random_state (int): The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+        """
+        super().__init__()
+        self.scaler = scaler
+        self._target_names = target_columns
+        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
+        features = pandas_df[self._feature_names]
+        target = pandas_df[self._target_names]
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+        label_dtype = torch.float32
+        X_train_final, X_test_final = self._prepare_scaler(
+            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+        )
+        self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)
+    @property
+    def target_names(self) -> list[str]:
+        return self._target_names
 # --- Private Base Class ---

ml_tools/ML_evaluation.py CHANGED Viewed

@@ -249,8 +249,11 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
     plt.close(fig_hist)
-def shap_summary_plot(model, background_data: Union[torch.Tensor,np.ndarray], instances_to_explain: Union[torch.Tensor,np.ndarray],
-                      feature_names: Optional[list[str]], save_dir: Union[str, Path]):
+def shap_summary_plot(model,
+                      background_data: Union[torch.Tensor,np.ndarray],
+                      instances_to_explain: Union[torch.Tensor,np.ndarray],
+                      feature_names: Optional[list[str]],
+                      save_dir: Union[str, Path]):
     """
     Calculates SHAP values and saves summary plots and data.

dragon-ml-toolbox 7.0.0__py3-none-any.whl → 8.1.0__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 7.0.0__py3-none-any.whl → 8.1.0__py3-none-any.whl