dragon-ml-toolbox 10.6.0.tar.gz → 10.8.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-10.6.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.8.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_datasetmaster.py +50 -23
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_models.py +11 -7
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_scaler.py +7 -5
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_trainer.py +3 -7
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/keys.py +21 -5
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/path_manager.py +33 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/utilities.py +125 -2
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/LICENSE +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/README.md +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-10.6.0 → dragon_ml_toolbox-10.8.0}/setup.cfg +0 -0
ml_tools/ML_datasetmaster.py (+50 -23)

```diff
@@ -15,6 +15,8 @@ from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
+from .keys import DatasetKeys
+
 
 __all__ = [
     "DatasetMaker",
```
```diff
@@ -34,7 +36,9 @@ class _PytorchDataset(Dataset):
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
                  labels: Union[numpy.ndarray, pandas.Series],
                  labels_dtype: torch.dtype,
-                 features_dtype: torch.dtype = torch.float32
+                 features_dtype: torch.dtype = torch.float32,
+                 feature_names: Optional[List[str]] = None,
+                 target_names: Optional[List[str]] = None):
         """
         integer labels for classification.
 
```
```diff
@@ -50,12 +54,30 @@ class _PytorchDataset(Dataset):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
         else:
             self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+
+        self._feature_names = feature_names
+        self._target_names = target_names
 
     def __len__(self):
         return len(self.features)
 
     def __getitem__(self, index):
         return self.features[index], self.labels[index]
+
+    @property
+    def feature_names(self):
+        if self._feature_names is not None:
+            return self._feature_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any feature names.")
+            raise ValueError()
+
+    @property
+    def target_names(self):
+        if self._target_names is not None:
+            return self._target_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
 
 
 # --- Abstract Base Class (New) ---
```
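The new accessors fail loudly when a dataset was built without metadata. Below is a minimal standalone sketch of this guarded-property pattern; `NamedDataset` and the logger setup are illustrative, not part of the package:

```python
# Standalone sketch of the guarded-property pattern added to _PytorchDataset:
# return the stored names when present, otherwise log and raise.
import logging
from typing import List, Optional

_LOGGER = logging.getLogger("ml_tools")

class NamedDataset:
    def __init__(self, feature_names: Optional[List[str]] = None):
        self._feature_names = feature_names

    @property
    def feature_names(self) -> List[str]:
        if self._feature_names is not None:
            return self._feature_names
        _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any feature names.")
        raise ValueError()

ds = NamedDataset(feature_names=["x1", "x2"])
print(ds.feature_names)  # ['x1', 'x2']
```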
```diff
@@ -71,6 +93,7 @@ class _BaseDatasetMaker(ABC):
         self.scaler: Optional[PytorchScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
+        self._target_names: List[str] = []
         self._X_train_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
```
```diff
@@ -122,6 +145,10 @@ class _BaseDatasetMaker(ABC):
     @property
     def feature_names(self) -> list[str]:
         return self._feature_names
+
+    @property
+    def target_names(self) -> list[str]:
+        return self._target_names
 
     @property
     def id(self) -> Optional[str]:
```
```diff
@@ -142,10 +169,17 @@ class _BaseDatasetMaker(ABC):
         """Saves a list of feature names as a text file"""
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
-                          filename=
-                          verbose=verbose)
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)
+
+    def save_target_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """Saves a list of target names as a text file"""
+        save_list_strings(list_strings=self._target_names,
+                          directory=directory,
+                          filename=DatasetKeys.TARGET_NAMES,
+                          verbose=verbose)
 
-    def save_scaler(self, save_dir: Union[str, Path]):
+    def save_scaler(self, save_dir: Union[str, Path], verbose: bool=True) -> None:
         """
         Saves the fitted PytorchScaler's state to a .pth file.
 
```
```diff
@@ -158,14 +192,15 @@ class _BaseDatasetMaker(ABC):
             _LOGGER.error("No scaler was fitted or provided.")
             raise RuntimeError()
         if not self.id:
-            _LOGGER.error("Must set the `id` before saving scaler.")
+            _LOGGER.error("Must set the dataset `id` before saving scaler.")
             raise ValueError()
         save_path = make_fullpath(save_dir, make=True, enforce="directory")
         sanitized_id = sanitize_filename(self.id)
-        filename = f"
+        filename = f"{DatasetKeys.SCALER_PREFIX}{sanitized_id}.pth"
         filepath = save_path / filename
-        self.scaler.save(filepath)
-
+        self.scaler.save(filepath, verbose=False)
+        if verbose:
+            _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
 
 
 # Single target dataset
```
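Together with `save_feature_names`, these helpers now derive every filename from `DatasetKeys` (defined in keys.py below). A standalone sketch of the naming convention, with the key values copied from this diff and re-declared locally so the snippet runs on its own:

```python
# Filename conventions used by the save_* helpers above.
FEATURE_NAMES = "feature_names"   # -> feature_names.txt
TARGET_NAMES = "target_names"     # -> target_names.txt
SCALER_PREFIX = "scaler_"

def scaler_filename(sanitized_id: str) -> str:
    # mirrors: filename = f"{DatasetKeys.SCALER_PREFIX}{sanitized_id}.pth"
    return f"{SCALER_PREFIX}{sanitized_id}.pth"

print(scaler_filename("price"))  # scaler_price.pth
```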
```diff
@@ -183,7 +218,7 @@ class DatasetMaker(_BaseDatasetMaker):
     `train_dataset` -> PyTorch Dataset
     `test_dataset` -> PyTorch Dataset
     `feature_names` -> list[str]
-    `
+    `target_names` -> list[str]
     `id` -> str
 
     The ID can be manually set to any string if needed, it is the target name by default.
```
```diff
@@ -211,8 +246,8 @@ class DatasetMaker(_BaseDatasetMaker):
         features = pandas_df.iloc[:, :-1]
         target = pandas_df.iloc[:, -1]
         self._feature_names = features.columns.tolist()
-        self.
-        self._id = self.
+        self._target_names = [str(target.name)]
+        self._id = self._target_names[0]
 
         # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
```
```diff
@@ -229,12 +264,8 @@ class DatasetMaker(_BaseDatasetMaker):
         )
 
         # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
-
-    @property
-    def target_name(self) -> str:
-        return self._target_name
+        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
 
 # --- New Multi-Target Class ---
```
```diff
@@ -280,12 +311,8 @@ class DatasetMakerMulti(_BaseDatasetMaker):
             X_train, y_train, X_test, label_dtype, continuous_feature_columns
         )
 
-        self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
-        self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)
-
-    @property
-    def target_names(self) -> list[str]:
-        return self._target_names
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
 
 # --- Private Base Class ---
```
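For callers, the visible change is that `DatasetMaker.target_name` (a str) is gone and both maker classes expose `target_names` (a list[str]) from the shared base. A hedged migration sketch follows; the constructor call is an assumption, since the full `DatasetMaker` signature is not part of this diff, and only the properties shown here are guaranteed:

```python
# Hedged migration sketch (assumes dragon-ml-toolbox is installed and that
# DatasetMaker accepts a DataFrame whose last column is the target; verify
# the real constructor signature before use).
import pandas as pd
from ml_tools.ML_datasetmaster import DatasetMaker

df = pd.DataFrame({"x1": [0.1, 0.2, 0.3, 0.4],
                   "x2": [1.0, 2.0, 3.0, 4.0],
                   "y":  [0.0, 1.0, 0.0, 1.0]})
maker = DatasetMaker(df)                 # hypothetical call

# 10.6.0: name = maker.target_name      # removed in 10.8.0
print(maker.target_names)                # ['y']
print(maker.train_dataset.target_names)  # the datasets now carry names too
```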
ml_tools/ML_models.py (+11 -7)

```diff
@@ -6,7 +6,7 @@ import json
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
 from ._script_info import _script_info
-from .keys import
+from .keys import PytorchModelArchitectureKeys
 
 
 __all__ = [
```
```diff
@@ -29,11 +29,14 @@ class _ArchitectureHandlerMixin:
             raise AttributeError()
 
         path_dir = make_fullpath(directory, make=True, enforce="directory")
-
+
+        json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+
+        full_path = path_dir / json_filename
 
         config = {
-
-
+            PytorchModelArchitectureKeys.MODEL: self.__class__.__name__,
+            PytorchModelArchitectureKeys.CONFIG: self.get_architecture_config() # type: ignore
         }
 
         with open(full_path, 'w') as f:
```
```diff
@@ -48,7 +51,8 @@ class _ArchitectureHandlerMixin:
         user_path = make_fullpath(file_or_dir)
 
         if user_path.is_dir():
-
+            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+            target_path = make_fullpath(user_path / json_filename, enforce="file")
         elif user_path.is_file():
             target_path = user_path
         else:
```
```diff
@@ -58,8 +62,8 @@ class _ArchitectureHandlerMixin:
         with open(target_path, 'r') as f:
             saved_data = json.load(f)
 
-        saved_class_name = saved_data[
-        config = saved_data[
+        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
+        config = saved_data[PytorchModelArchitectureKeys.CONFIG]
 
         if saved_class_name != cls.__name__:
             _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
```
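The architecture file is plain JSON keyed by `PytorchModelArchitectureKeys`. A simplified standalone sketch of the on-disk format; the helper name and config values are illustrative, not package API:

```python
# Standalone sketch of the architecture.json format written by the mixin;
# key values are mirrored from PytorchModelArchitectureKeys in this diff.
import json
from pathlib import Path

MODEL, CONFIG, SAVENAME = "model_class", "config", "architecture"

def save_architecture(directory: Path, class_name: str, config: dict) -> Path:
    directory.mkdir(parents=True, exist_ok=True)
    full_path = directory / (SAVENAME + ".json")
    with open(full_path, "w") as f:
        json.dump({MODEL: class_name, CONFIG: config}, f)
    return full_path

path = save_architecture(Path("demo_model"), "MLP", {"in_features": 8})
print(json.loads(path.read_text()))
# {'model_class': 'MLP', 'config': {'in_features': 8}}
```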
ml_tools/ML_scaler.py (+7 -5)

```diff
@@ -149,24 +149,25 @@ class PytorchScaler:
 
         return data_clone
 
-    def save(self, filepath: Union[str, Path]):
+    def save(self, filepath: Union[str, Path], verbose: bool=True):
         """
         Saves the scaler's state (mean, std, indices) to a .pth file.
 
         Args:
             filepath (str | Path): The path to save the file.
         """
-        path_obj = make_fullpath(filepath)
+        path_obj = make_fullpath(filepath, make=True, enforce="file")
         state = {
             'mean': self.mean_,
             'std': self.std_,
             'continuous_feature_indices': self.continuous_feature_indices
         }
         torch.save(state, path_obj)
-
+        if verbose:
+            _LOGGER.info(f"PytorchScaler state saved to '{path_obj.name}'.")
 
     @staticmethod
-    def load(filepath: Union[str, Path]) -> 'PytorchScaler':
+    def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
         """
         Loads a scaler's state from a .pth file.
 
```
```diff
@@ -178,7 +179,8 @@ class PytorchScaler:
         """
         path_obj = make_fullpath(filepath, enforce="file")
         state = torch.load(path_obj)
-
+        if verbose:
+            _LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
         return PytorchScaler(
             mean=state['mean'],
             std=state['std'],
```
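Both methods keep `verbose=True` as the default, so existing callers gain the log lines without code changes; pass `verbose=False` to silence them, as `save_scaler` now does internally. A hedged round-trip sketch; the constructor keywords mirror the state dict restored by `load`, but are otherwise an assumption:

```python
# Hedged round-trip sketch (assumes dragon-ml-toolbox and torch are installed;
# constructor keywords mirror the state dict keys shown in load() above).
import torch
from ml_tools.ML_scaler import PytorchScaler

scaler = PytorchScaler(mean=torch.zeros(3), std=torch.ones(3),
                       continuous_feature_indices=[0, 1, 2])
scaler.save("scaler_demo.pth")  # logs: PytorchScaler state saved to 'scaler_demo.pth'.
restored = PytorchScaler.load("scaler_demo.pth", verbose=False)  # silent
```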
ml_tools/ML_trainer.py (+3 -7)

```diff
@@ -357,7 +357,7 @@ class MLTrainer:
                 If None, the trainer's test dataset is used.
             n_samples (int): The number of samples to use for both background and explanation.
             feature_names (list[str] | None): Feature names.
-            target_names (list[str] | None): Target names
+            target_names (list[str] | None): Target names for multi-target tasks.
             save_dir (str | Path): Directory to save all SHAP artifacts.
         """
         # Internal helper to create a dataloader and get a random sample
```
```diff
@@ -408,12 +408,8 @@ class MLTrainer:
             if hasattr(target_dataset, "feature_names"):
                 feature_names = target_dataset.feature_names # type: ignore
             else:
-
-
-                    feature_names = target_dataset.dataset.feature_names # type: ignore
-                except AttributeError:
-                    _LOGGER.error("Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a `feature_names` attribute.")
-                    raise ValueError()
+                _LOGGER.error("Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a `feature_names` attribute.")
+                raise ValueError()
 
         # 3. Call the plotting function
         if self.kind in ["regression", "classification"]:
```
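The deleted branch tried `target_dataset.dataset.feature_names`, i.e. unwrapping a `torch.utils.data.Subset`; 10.8.0 errors immediately instead. The standalone snippet below (only torch required) demonstrates why the fallback existed: `Subset` does not forward attributes of the wrapped dataset:

```python
# Why the removed fallback mattered: Subset hides the wrapped dataset's
# attributes, so hasattr(subset, "feature_names") is False after wrapping.
from torch.utils.data import Dataset, Subset

class Toy(Dataset):
    feature_names = ["x1", "x2"]
    def __len__(self): return 2
    def __getitem__(self, i): return i

subset = Subset(Toy(), [0])
print(hasattr(subset, "feature_names"))  # False -> 10.8.0 raises ValueError
print(subset.dataset.feature_names)      # ['x1', 'x2'] (the old fallback path)
```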
ml_tools/keys.py (+21 -5)

```diff
@@ -38,11 +38,27 @@ class PyTorchInferenceKeys:
     PROBABILITIES = "probabilities"
 
 
-class
-    """Keys for saving and loading
-    MODEL = 'model_class'
-    CONFIG = "config"
-    SAVENAME = "architecture
+class PytorchModelArchitectureKeys:
+    """Keys for saving and loading model architecture."""
+    MODEL = 'model_class'
+    CONFIG = "config"
+    SAVENAME = "architecture"
+
+
+class PytorchArtifactPathKeys:
+    """Keys for model artifact paths."""
+    FEATURES_PATH = "feature_names_path"
+    TARGETS_PATH = "target_names_path"
+    ARCHITECTURE_PATH = "model_architecture_path"
+    WEIGHTS_PATH = "model_weights_path"
+    SCALER_PATH = "scaler_path"
+
+
+class DatasetKeys:
+    """Keys for saving dataset artifacts"""
+    FEATURE_NAMES = "feature_names"
+    TARGET_NAMES = "target_names"
+    SCALER_PREFIX = "scaler_"
 
 
 class _OneHotOtherPlaceholder:
```
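These key classes are the contract between the `save_*` helpers above and `find_model_artifacts` in utilities.py below. A short sketch of how the values compose into on-disk names; the `.txt` extension for the name lists is inferred from the artifact layout documented later in this diff:

```python
# Hedged sketch (assumes ml_tools.keys is importable as shown in this diff).
from ml_tools.keys import DatasetKeys, PytorchModelArchitectureKeys

print(DatasetKeys.FEATURE_NAMES + ".txt")               # feature_names.txt
print(DatasetKeys.TARGET_NAMES + ".txt")                # target_names.txt
print(f"{DatasetKeys.SCALER_PREFIX}my_target.pth")      # scaler_my_target.pth
print(PytorchModelArchitectureKeys.SAVENAME + ".json")  # architecture.json
```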
ml_tools/path_manager.py (+33 -0)

```diff
@@ -13,6 +13,7 @@ __all__ = [
     "sanitize_filename",
     "list_csv_paths",
     "list_files_by_extension",
+    "list_subdirectories"
 ]
 
 
```
```diff
@@ -385,5 +386,37 @@ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose:
     return name_path_dict
 
 
+def list_subdirectories(root_dir: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
+    """
+    Scans a directory and returns a dictionary of its immediate subdirectories.
+
+    Args:
+        root_dir (str | Path): The path to the directory to scan.
+        verbose (bool): If True, prints the number of directories found.
+
+    Returns:
+        dict[str, Path]: A dictionary mapping subdirectory names (str) to their full Path objects.
+    """
+    root_path = make_fullpath(root_dir, enforce="directory")
+
+    directories = [p.resolve() for p in root_path.iterdir() if p.is_dir()]
+
+    if len(directories) < 1:
+        _LOGGER.error(f"No subdirectories found inside '{root_path}'")
+        raise IOError()
+
+    if verbose:
+        count = len(directories)
+        # Use pluralization for better readability
+        plural = 'ies' if count != 1 else 'y'
+        print(f"Found {count} subdirector{plural} in '{root_path.name}'.")
+
+    # Create a dictionary where the key is the directory's name (a string)
+    # and the value is the full Path object.
+    dir_map = {p.name: p for p in directories}
+
+    return dir_map
+
+
 def info():
     _script_info(__all__)
```
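A usage sketch for the new helper (assumes the package is importable as `ml_tools`; the temporary tree makes the example self-contained):

```python
import tempfile
from pathlib import Path
from ml_tools.path_manager import list_subdirectories

root = Path(tempfile.mkdtemp())
(root / "model_1").mkdir()
(root / "model_2").mkdir()

subdirs = list_subdirectories(root)  # prints: Found 2 subdirectories in '...'
print(sorted(subdirs))               # ['model_1', 'model_2']
```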
ml_tools/utilities.py (+125 -2)

```diff
@@ -6,9 +6,10 @@ from pathlib import Path
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, overload
 import joblib
 from joblib.externals.loky.process_executor import TerminatedWorkerError
-from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
+from .path_manager import sanitize_filename, make_fullpath, list_csv_paths, list_files_by_extension, list_subdirectories
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys
 
 
 # Keep track of available tools
```
```diff
@@ -24,7 +25,8 @@ __all__ = [
     "deserialize_object",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
-    "train_dataset_yielder"
+    "train_dataset_yielder",
+    "find_model_artifacts"
 ]
 
 
```
````diff
@@ -560,5 +562,126 @@ def train_dataset_yielder(
     yield (df_features, df_target, feature_names, target_col)
 
 
+def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, verbose: bool=False) -> list[dict[str,Any]]:
+    """
+    Scans subdirectories to find paths to model weights, target names, feature names, and model architecture. Optionally a scaler path if `load_scaler` is True.
+
+    This function operates on a specific directory structure. It expects the
+    `target_directory` to contain one or more subdirectories, where each
+    subdirectory represents a single trained model result.
+
+    The expected directory structure for each model is as follows:
+    ```
+    target_directory
+    ├── model_1
+    │   ├── *.pth
+    │   ├── scaler_*.pth (Required if `load_scaler` is True)
+    │   ├── feature_names.txt
+    │   ├── target_names.txt
+    │   └── architecture.json
+    └── model_2/
+        └── ...
+    ```
+
+    Args:
+        target_directory (str | Path): The path to the root directory that contains model subdirectories.
+        load_scaler (bool): If True, the function requires and searches for a scaler file (`.pth`) in each model subdirectory.
+        verbose (bool): If True, enables detailed logging during the file paths search process.
+
+    Returns:
+        (list[dict[str, Path]]): A list of dictionaries, where each dictionary
+        corresponds to a model found in a subdirectory. The dictionary
+        maps standardized keys to the absolute paths of the model's
+        artifacts (weights, architecture, features, targets, and scaler).
+        The scaler path will be `None` if `load_scaler` is False.
+    """
+    # validate directory
+    root_path = make_fullpath(target_directory, enforce="directory")
+
+    # store results
+    all_artifacts: list[dict] = list()
+
+    # find model directories
+    result_dirs_dict = list_subdirectories(root_dir=root_path, verbose=verbose)
+    for dir_name, dir_path in result_dirs_dict.items():
+        # find files
+        model_pth_dict = list_files_by_extension(directory=dir_path, extension="pth", verbose=verbose)
+
+        # restriction
+        if load_scaler:
+            if len(model_pth_dict) != 2:
+                _LOGGER.error(f"Directory {dir_path} should contain exactly 2 '.pth' files: scaler and weights.")
+                raise IOError()
+        else:
+            if len(model_pth_dict) != 1:
+                _LOGGER.error(f"Directory {dir_path} should contain exactly 1 '.pth' file: weights.")
+                raise IOError()
+
+        ##### Scaler and Weights #####
+        scaler_path = None
+        weights_path = None
+
+        # load weights and scaler if present
+        for pth_filename, pth_path in model_pth_dict.items():
+            if load_scaler and pth_filename.lower().startswith(DatasetKeys.SCALER_PREFIX):
+                scaler_path = pth_path
+            else:
+                weights_path = pth_path
+
+        # validation
+        if not weights_path:
+            _LOGGER.error(f"Error parsing the model weights path from '{dir_name}'")
+            raise IOError()
+
+        if load_scaler and not scaler_path:
+            _LOGGER.error(f"Error parsing the scaler path from '{dir_name}'")
+            raise IOError()
+
+        ##### Target and Feature names #####
+        target_names_path = None
+        feature_names_path = None
+
+        # load feature and target names
+        model_txt_dict = list_files_by_extension(directory=dir_path, extension="txt", verbose=verbose)
+
+        for txt_filename, txt_path in model_txt_dict.items():
+            if txt_filename == DatasetKeys.FEATURE_NAMES:
+                feature_names_path = txt_path
+            elif txt_filename == DatasetKeys.TARGET_NAMES:
+                target_names_path = txt_path
+
+        # validation
+        if not target_names_path or not feature_names_path:
+            _LOGGER.error(f"Error parsing features path or targets path from '{dir_name}'")
+            raise IOError()
+
+        ##### load model architecture path #####
+        architecture_path = None
+
+        model_json_dict = list_files_by_extension(directory=dir_path, extension="json", verbose=verbose)
+
+        for json_filename, json_path in model_json_dict.items():
+            if json_filename == PytorchModelArchitectureKeys.SAVENAME:
+                architecture_path = json_path
+
+        # validation
+        if not architecture_path:
+            _LOGGER.error(f"Error parsing the model architecture path from '{dir_name}'")
+            raise IOError()
+
+        ##### Paths dictionary #####
+        parsing_dict = {
+            PytorchArtifactPathKeys.WEIGHTS_PATH: weights_path,
+            PytorchArtifactPathKeys.ARCHITECTURE_PATH: architecture_path,
+            PytorchArtifactPathKeys.FEATURES_PATH: feature_names_path,
+            PytorchArtifactPathKeys.TARGETS_PATH: target_names_path,
+            PytorchArtifactPathKeys.SCALER_PATH: scaler_path
+        }
+
+        all_artifacts.append(parsing_dict)
+
+    return all_artifacts
+
+
 def info():
     _script_info(__all__)
````
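A usage sketch for the new function (assumes `results/` follows the layout in the docstring above, e.g. as produced by the `save_*` helpers from ML_datasetmaster.py plus the architecture mixin):

```python
from ml_tools.utilities import find_model_artifacts
from ml_tools.keys import PytorchArtifactPathKeys

for artifact in find_model_artifacts("results", load_scaler=True):
    print(artifact[PytorchArtifactPathKeys.WEIGHTS_PATH])       # model_*/...pth
    print(artifact[PytorchArtifactPathKeys.ARCHITECTURE_PATH])  # .../architecture.json
    print(artifact[PytorchArtifactPathKeys.SCALER_PATH])        # .../scaler_*.pth
```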