dragon-ml-toolbox 13.3.0__py3-none-any.whl → 16.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +20 -6
  2. dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
  3. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
  4. ml_tools/ETL_cleaning.py +20 -20
  5. ml_tools/ETL_engineering.py +23 -25
  6. ml_tools/GUI_tools.py +20 -20
  7. ml_tools/MICE_imputation.py +207 -5
  8. ml_tools/ML_callbacks.py +43 -26
  9. ml_tools/ML_configuration.py +788 -0
  10. ml_tools/ML_datasetmaster.py +303 -448
  11. ml_tools/ML_evaluation.py +351 -93
  12. ml_tools/ML_evaluation_multi.py +139 -42
  13. ml_tools/ML_inference.py +290 -209
  14. ml_tools/ML_models.py +33 -106
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +12 -12
  17. ml_tools/ML_scaler.py +11 -11
  18. ml_tools/ML_sequence_datasetmaster.py +341 -0
  19. ml_tools/ML_sequence_evaluation.py +219 -0
  20. ml_tools/ML_sequence_inference.py +391 -0
  21. ml_tools/ML_sequence_models.py +139 -0
  22. ml_tools/ML_trainer.py +1604 -179
  23. ml_tools/ML_utilities.py +351 -4
  24. ml_tools/ML_vision_datasetmaster.py +1540 -0
  25. ml_tools/ML_vision_evaluation.py +284 -0
  26. ml_tools/ML_vision_inference.py +405 -0
  27. ml_tools/ML_vision_models.py +641 -0
  28. ml_tools/ML_vision_transformers.py +284 -0
  29. ml_tools/PSO_optimization.py +6 -6
  30. ml_tools/SQL.py +4 -4
  31. ml_tools/_keys.py +171 -0
  32. ml_tools/_schema.py +1 -1
  33. ml_tools/custom_logger.py +37 -14
  34. ml_tools/data_exploration.py +502 -93
  35. ml_tools/ensemble_evaluation.py +54 -11
  36. ml_tools/ensemble_inference.py +7 -33
  37. ml_tools/ensemble_learning.py +1 -1
  38. ml_tools/math_utilities.py +1 -1
  39. ml_tools/optimization_tools.py +2 -2
  40. ml_tools/path_manager.py +5 -5
  41. ml_tools/serde.py +2 -2
  42. ml_tools/utilities.py +192 -4
  43. dragon_ml_toolbox-13.3.0.dist-info/RECORD +0 -41
  44. ml_tools/RNN_forecast.py +0 -56
  45. ml_tools/keys.py +0 -87
  46. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
  47. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
  48. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/MICE_imputation.py CHANGED
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
  from typing import Optional, Union

  from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
- from .math_utilities import threshold_binary_values
+ from .math_utilities import threshold_binary_values, discretize_categorical_values
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._logger import _LOGGER
  from ._script_info import _script_info
+ from ._schema import FeatureSchema


  __all__ = [
+     "DragonMICE",
      "apply_mice",
      "save_imputed_datasets",
-     "get_na_column_names",
      "get_convergence_diagnostic",
      "get_imputed_distributions",
-     "run_mice_pipeline"
+     "run_mice_pipeline",
  ]


@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df


  #Get names of features that had missing values before imputation
- def get_na_column_names(df: pd.DataFrame):
+ def _get_na_column_names(df: pd.DataFrame):
      return [col for col in df.columns if df[col].isna().any()]


@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]

      save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

-     imputed_column_names = get_na_column_names(df=df)
+     imputed_column_names = _get_na_column_names(df=df)

      get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
      return df_feats, df_targets


+ # modern implementation
+ class DragonMICE:
+     """
+     A modern MICE imputation pipeline that uses a FeatureSchema
+     to correctly discretize categorical features after imputation.
+     """
+     def __init__(self,
+                  schema: FeatureSchema,
+                  iterations: int=20,
+                  resulting_datasets: int = 1,
+                  random_state: int = 101):
+
+         self.schema = schema
+         self.random_state = random_state
+         self.iterations = iterations
+         self.resulting_datasets = resulting_datasets
+
+         # --- Store schema info ---
+
+         # 1. Categorical info
+         if not self.schema.categorical_index_map:
+             _LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
+             self.cat_info = {}
+         else:
+             self.cat_info = self.schema.categorical_index_map
+
+         # 2. Ordered feature names (critical for index mapping)
+         self.ordered_features = list(self.schema.feature_names)
+
+         # 3. Names of categorical features
+         self.categorical_features = list(self.schema.categorical_feature_names)
+
+         _LOGGER.info(f"DragonMICE initialized. Found {len(self.cat_info)} categorical features to discretize.")
+
+     def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Applies schema-based discretization to a completed dataframe.
+
+         This method works around the behavior of `discretize_categorical_values`
+         (which returns a full int32 array) by:
+         1. Calling it on the full, ordered feature array.
+         2. Extracting *only* the valid discretized categorical columns.
+         3. Updating the original float dataframe with these integer values.
+         """
+         # If no categorical features are defined, return the df as-is.
+         if not self.cat_info:
+             return imputed_df
+
+         try:
+             # 1. Ensure DataFrame columns match the schema order
+             #    This is critical for the index-based categorical_info
+             df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
+
+             # 2. Convert to NumPy array
+             array_ordered = df_ordered.to_numpy()
+
+             # 3. Apply discretization utility (which returns a full int32 array)
+             #    This array has *correct* categorical values but *truncated* continuous values.
+             discretized_array_int32 = discretize_categorical_values(
+                 array_ordered,
+                 self.cat_info,
+                 start_at_zero=True # Assuming 0-based indexing
+             )
+
+             # 4. Create a new DF from the int32 array, keeping the categorical columns.
+             df_discretized_cats = pd.DataFrame(
+                 discretized_array_int32,
+                 columns=self.ordered_features,
+                 index=df_ordered.index # <-- Critical: align index
+             )[self.categorical_features] # <-- Select only cat features
+
+             # 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
+             final_df = df_ordered.copy()
+
+             # 6. Use .update() to "paste" the integer categorical values
+             #    over the old float categorical values. Continuous floats are unaffected.
+             final_df.update(df_discretized_cats)
+
+             return final_df
+
+         except Exception as e:
+             _LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
+             raise
+
+     def _run_mice(self,
+                   df: pd.DataFrame,
+                   df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+         """
+         Runs the MICE kernel and applies schema-based post-processing.
+
+         Parameters:
+             df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
+             df_name (str): The base name for the dataset.
+
+         Returns:
+             tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+                 - The trained MICE kernel
+                 - A list of imputed and processed DataFrames
+                 - A list of names for the new DataFrames
+         """
+         # Ensure input df only contains features from the schema and is in the correct order.
+         try:
+             df_feats = df[self.ordered_features]
+         except KeyError as e:
+             _LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
+             raise
+
+         # 1. Initialize kernel
+         kernel = mf.ImputationKernel(
+             data=df_feats,
+             num_datasets=self.resulting_datasets,
+             random_state=self.random_state
+         )
+
+         _LOGGER.info("➡️ Schema-based MICE imputation running...")
+
+         # 2. Perform MICE
+         kernel.mice(self.iterations)
+
+         # 3. Retrieve, process, and collect datasets
+         imputed_datasets = []
+         for i in range(self.resulting_datasets):
+             # complete_data returns a pd.DataFrame
+             completed_df = kernel.complete_data(dataset=i)
+
+             # Apply our new discretization and ordering
+             processed_df = self._post_process(completed_df)
+             imputed_datasets.append(processed_df)
+
+         if not imputed_datasets:
+             _LOGGER.error("No imputed datasets were generated.")
+             raise ValueError()
+
+         # 4. Generate names
+         if self.resulting_datasets == 1:
+             imputed_dataset_names = [f"{df_name}_MICE"]
+         else:
+             imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
+
+         # 5. Validate indexes
+         for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+             assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
+             assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
+
+         _LOGGER.info("Schema-based MICE imputation complete.")
+
+         return kernel, imputed_datasets, imputed_dataset_names
+
+     def run_pipeline(self,
+                      df_path_or_dir: Union[str,Path],
+                      save_datasets_dir: Union[str,Path],
+                      save_metrics_dir: Union[str,Path],
+                      ):
+         """
+         Runs the complete MICE imputation pipeline.
+
+         This method automates the entire workflow:
+         1. Loads data from a CSV file path or a directory with CSV files.
+         2. Separates features and targets based on the `FeatureSchema`.
+         3. Runs the MICE algorithm on the feature set.
+         4. Applies schema-based post-processing to discretize categorical features.
+         5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
+         6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
+
+         Parameters
+         ----------
+         df_path_or_dir : [str,Path]
+             Path to a single CSV file or a directory containing multiple CSV files to impute.
+         save_datasets_dir : [str,Path]
+             Directory where the final imputed and processed dataset(s) will be saved as CSVs.
+         save_metrics_dir : [str,Path]
+             Directory where convergence and distribution plots will be saved.
+         """
+         # Check paths
+         save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+         save_metrics_path = make_fullpath(save_metrics_dir, make=True)
+
+         input_path = make_fullpath(df_path_or_dir)
+         if input_path.is_file():
+             all_file_paths = [input_path]
+         else:
+             all_file_paths = list(list_csv_paths(input_path).values())
+
+         for df_path in all_file_paths:
+
+             df, df_name = load_dataframe(df_path=df_path, kind="pandas")
+
+             df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
+             df_targets = df.drop(columns=self.schema.feature_names)
+
+             imputed_column_names = _get_na_column_names(df=df_features)
+
+             kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
+
+             save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+
+             get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
+
+             get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
+
+
  def info():
      _script_info(__all__)
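
The new `DragonMICE` class bundles the schema-aware workflow (load CSVs, split features from targets, run MICE, discretize categoricals, save datasets and diagnostic plots) behind a single `run_pipeline` call. A minimal usage sketch based only on the signatures added in this diff; the `FeatureSchema` construction and all paths are placeholders, not part of the package:

    from pathlib import Path
    from ml_tools.MICE_imputation import DragonMICE
    from ml_tools._schema import FeatureSchema

    # Placeholder: a FeatureSchema is built elsewhere in the toolbox and must carry
    # feature_names, categorical_feature_names, and categorical_index_map.
    schema: FeatureSchema = ...  # hypothetical, supplied by the caller

    imputer = DragonMICE(
        schema=schema,
        iterations=20,           # MICE iterations per dataset
        resulting_datasets=1,    # number of completed datasets to generate
        random_state=101,
    )

    # Imputes each CSV found at the input path, then writes the processed datasets
    # (re-joined with targets) plus convergence and distribution plots.
    imputer.run_pipeline(
        df_path_or_dir=Path("data/raw"),         # single CSV or a directory of CSVs (placeholder path)
        save_datasets_dir=Path("data/imputed"),  # placeholder path
        save_metrics_dir=Path("reports/mice"),   # placeholder path
    )
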
ml_tools/ML_callbacks.py CHANGED
@@ -4,23 +4,22 @@ from tqdm.auto import tqdm
  from typing import Union, Literal, Optional
  from pathlib import Path

- from .path_manager import make_fullpath, sanitize_filename
- from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
+ from .path_manager import make_fullpath
+ from ._keys import PyTorchLogKeys, PyTorchCheckpointKeys
  from ._logger import _LOGGER
  from ._script_info import _script_info


  __all__ = [
-     "Callback",
      "History",
      "TqdmProgressBar",
-     "EarlyStopping",
-     "ModelCheckpoint",
-     "LRScheduler"
+     "DragonEarlyStopping",
+     "DragonModelCheckpoint",
+     "DragonLRScheduler"
  ]


- class Callback:
+ class _Callback:
      """
      Abstract base class used to build new callbacks.

@@ -60,7 +59,7 @@ class Callback:
          pass


- class History(Callback):
+ class History(_Callback):
      """
      Callback that records events into a `history` dictionary.

@@ -79,7 +78,7 @@ class History(Callback):
              self.trainer.history.setdefault(k, []).append(v) # type: ignore


- class TqdmProgressBar(Callback):
+ class TqdmProgressBar(_Callback):
      """Callback that provides a tqdm progress bar for training."""
      def __init__(self):
          self.epoch_bar = None
@@ -110,7 +109,7 @@ class TqdmProgressBar(Callback):
          self.epoch_bar.close() # type: ignore


- class EarlyStopping(Callback):
+ class DragonEarlyStopping(_Callback):
      """
      Stop training when a monitored metric has stopped improving.
      """
@@ -187,11 +186,11 @@ class EarlyStopping(Callback):
              _LOGGER.info(f"Epoch {epoch+1}: early stopping after {self.wait} epochs with no improvement.")


- class ModelCheckpoint(Callback):
+ class DragonModelCheckpoint(_Callback):
      """
      Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
      """
-     def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
+     def __init__(self, save_dir: Union[str,Path], monitor: str = PyTorchLogKeys.VAL_LOSS,
                   save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
          """
          - If `save_best_only` is True, it saves the single best model, deleting the previous best.
@@ -199,7 +198,6 @@ class ModelCheckpoint(Callback):

          Args:
              save_dir (str): Directory where checkpoint files will be saved.
-             checkpoint_name (str| None): If None, the filename will include the epoch and score.
              monitor (str): Metric to monitor.
              save_best_only (bool): If true, save only the best model.
              mode (str): One of {'auto', 'min', 'max'}.
@@ -215,9 +213,8 @@ class ModelCheckpoint(Callback):
          self.monitor = monitor
          self.save_best_only = save_best_only
          self.verbose = verbose
-         if checkpoint_name:
-             checkpoint_name = sanitize_filename(checkpoint_name)
-         self.checkpoint_name = checkpoint_name
+         self._latest_checkpoint_path = None
+         self._checkpoint_name = PyTorchCheckpointKeys.CHECKPOINT_NAME

          # State variables to be managed during training
          self.saved_checkpoints = []
@@ -261,10 +258,7 @@ class ModelCheckpoint(Callback):
          old_best_str = f"{self.best:.4f}" if self.best not in [np.inf, -np.inf] else "inf"

          # Create a descriptive filename
-         if self.checkpoint_name is None:
-             filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
-         else:
-             filename = f"epoch{epoch}_{self.checkpoint_name}.pth"
+         filename = f"epoch{epoch}_{self._checkpoint_name}_{current:.4f}.pth"
          new_filepath = self.save_dir / filename

          if self.verbose > 0:
@@ -279,6 +273,7 @@ class ModelCheckpoint(Callback):
              PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
              PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
              PyTorchCheckpointKeys.BEST_SCORE: self.best,
+             PyTorchCheckpointKeys.HISTORY: self.trainer.history, # type: ignore
          }

          # Check for scheduler
@@ -287,6 +282,7 @@ class ModelCheckpoint(Callback):

          # Save the new best model
          torch.save(checkpoint_data, new_filepath)
+         self._latest_checkpoint_path = new_filepath

          # Delete the old best model file
          if self.last_best_filepath and self.last_best_filepath.exists():
@@ -298,10 +294,8 @@ class ModelCheckpoint(Callback):
      def _save_rolling_checkpoints(self, epoch, logs):
          """Saves the latest model and keeps only the most recent ones."""
          current = logs.get(self.monitor)
-         if self.checkpoint_name is None:
-             filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
-         else:
-             filename = f"epoch{epoch}_{self.checkpoint_name}.pth"
+
+         filename = f"epoch{epoch}_{self._checkpoint_name}_{current:.4f}.pth"
          filepath = self.save_dir / filename

          if self.verbose > 0:
@@ -313,12 +307,15 @@ class ModelCheckpoint(Callback):
              PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
              PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
              PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
+             PyTorchCheckpointKeys.HISTORY: self.trainer.history, # type: ignore
          }

          if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
              checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore

          torch.save(checkpoint_data, filepath)
+
+         self._latest_checkpoint_path = filepath

          self.saved_checkpoints.append(filepath)

@@ -330,8 +327,16 @@ class ModelCheckpoint(Callback):
                  _LOGGER.info(f" -> Deleting old checkpoint: {file_to_delete.name}")
              file_to_delete.unlink()

+     @property
+     def best_checkpoint_path(self):
+         if self._latest_checkpoint_path:
+             return self._latest_checkpoint_path
+         else:
+             _LOGGER.error("No checkpoint paths saved.")
+             raise ValueError()
+

- class LRScheduler(Callback):
+ class DragonLRScheduler(_Callback):
      """
      Callback to manage a PyTorch learning rate scheduler.
      """
@@ -361,6 +366,8 @@ class LRScheduler(Callback):

      def on_epoch_end(self, epoch, logs=None):
          """Step the scheduler and log any change in learning rate."""
+         logs = logs or {}
+
          # For schedulers that need a metric (e.g., val_loss)
          if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
              if self.monitor is None:
@@ -376,12 +383,22 @@ class LRScheduler(Callback):
          # For all other schedulers
          else:
              self.scheduler.step()
+
+         # Get the current learning rate
+         current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore

          # Log the change if the LR was updated
-         current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
          if current_lr != self.previous_lr:
              _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
          self.previous_lr = current_lr
+
+         # --- Add LR to logs and history ---
+         # Add to the logs dict for any subsequent callbacks
+         logs[PyTorchLogKeys.LEARNING_RATE] = current_lr
+
+         # Also add directly to the trainer's history dict
+         if hasattr(self.trainer, 'history'):
+             self.trainer.history.setdefault(PyTorchLogKeys.LEARNING_RATE, []).append(current_lr) # type: ignore


  def info():
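
Downstream code that previously imported `EarlyStopping`, `ModelCheckpoint`, or `LRScheduler` now needs the `Dragon*` names, and checkpoint file naming no longer accepts a custom `checkpoint_name`. A minimal sketch of the checkpoint callback as its signature appears in this diff; the save directory is a placeholder, and consuming the saved file with `torch.load` is an assumption about downstream usage, not shown in this excerpt:

    import torch
    from ml_tools.ML_callbacks import DragonModelCheckpoint

    # checkpoint_name was removed: filenames now follow
    # "epoch{n}_{internal checkpoint name}_{monitored score}.pth".
    checkpoint_cb = DragonModelCheckpoint(
        save_dir="checkpoints",   # placeholder directory
        save_best_only=True,      # keep a single best checkpoint, rotating out the previous one
        mode="auto",
        verbose=1,
    )

    # ... pass checkpoint_cb to the trainer (the wiring lives in ml_tools/ML_trainer.py,
    # which is not part of this excerpt), then after training:
    best_path = checkpoint_cb.best_checkpoint_path  # property added in this version

    # The saved dict now also carries the trainer history alongside the model state,
    # optimizer state, optional scheduler state, and best score.
    checkpoint = torch.load(best_path)
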