dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/METADATA +7 -4
- dragon_ml_toolbox-3.0.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +8 -7
- ml_tools/GUI_tools.py +24 -25
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.4.0.dist-info/RECORD +0 -22
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/ML_tutorial.py
ADDED
```python
import json
from typing import Literal, Optional, Union
from pathlib import Path
from .logger import _LOGGER
from .utilities import make_fullpath, sanitize_filename


__all__ = [
    "generate_notebook"
]

def _get_notebook_content(kind: str):
    """Helper function to generate the cell content for the notebook."""

    # --- Common Cells ---
    imports_cell = {
        "cell_type": "code",
        "source": [
            "import torch\n",
            "from torch import nn\n",
            "from torch.utils.data import TensorDataset, DataLoader\n",
            "import numpy as np\n",
            "from pathlib import Path\n",
            "\n",
            "# Import from dragon_ml_toolbox\n",
            "from ml_tools.ML_trainer import MyTrainer\n",
            "from ml_tools.ML_callbacks import EarlyStopping, ModelCheckpoint\n",
            "from ml_tools.utilities import LogKeys"
        ]
    }

    device_cell = {
        "cell_type": "code",
        "source": [
            "import torch\n",
            "if torch.cuda.is_available():\n",
            "    device = 'cuda'\n",
            "elif torch.backends.mps.is_available():\n",
            "    device = 'mps'\n",
            "else:\n",
            "    device = 'cpu'\n",
            "\n",
            "print(f'Using device: {device}')"
        ]
    }

    model_definition_cell = {
        "cell_type": "markdown",
        "source": [
            "### 3. Define the Model, Criterion, and Optimizer\n",
            "Next, we define a simple neural network for our task. We also need to choose a loss function (`criterion`) and an `optimizer`."
        ]
    }

    callbacks_cell = {
        "cell_type": "code",
        "source": [
            "# Define callbacks for training\n",
            "model_filepath = 'best_model.pth'\n",
            "monitor_metric = LogKeys.VAL_LOSS\n",
            "\n",
            "model_checkpoint = ModelCheckpoint(\n",
            "    filepath=model_filepath, \n",
            "    save_best_only=True, \n",
            "    monitor=monitor_metric, \n",
            "    mode='min'\n",
            ")\n",
            "\n",
            "early_stopping = EarlyStopping(\n",
            "    patience=10, \n",
            "    monitor=monitor_metric, \n",
            "    mode='min'\n",
            ")"
        ]
    }

    trainer_instantiation_cell = {
        "cell_type": "code",
        "source": [
            "trainer = MyTrainer(\n",
            "    model=model,\n",
            "    train_dataset=train_dataset,\n",
            "    test_dataset=test_dataset,\n",
            f"    kind='{kind}',\n",
            "    criterion=criterion,\n",
            "    optimizer=optimizer,\n",
            "    device=device,\n",
            "    callbacks=[model_checkpoint, early_stopping]\n",
            ")"
        ]
    }

    fit_cell = {
        "cell_type": "code",
        "source": [
            "history = trainer.fit(epochs=100, batch_size=16)"
        ]
    }

    evaluation_cell = {
        "cell_type": "code",
        "source": [
            "save_dir = Path('tutorial_results')\n",
            "\n",
            "# The evaluate method will automatically use the test_loader.\n",
            "# First, we load the best weights saved by ModelCheckpoint.\n",
            "model_path = Path(model_filepath)\n",
            "if model_path.exists():\n",
            "    print(f'Loading best model from {model_path}')\n",
            "    trainer.model.load_state_dict(torch.load(model_path))\n",
            "\n",
            "print('\\n--- Evaluating Model ---')\n",
            "# All evaluation artifacts will be saved in the 'evaluation' subdirectory.\n",
            "trainer.evaluate(save_dir=save_dir / 'evaluation')"
        ]
    }

    explanation_cell = {
        "cell_type": "code",
        "source": [
            "print('\\n--- Explaining Model ---')\n",
            "# We can also generate SHAP plots to explain the model's predictions.\n",
            "# All SHAP artifacts will be saved in the 'explanation' subdirectory.\n",
            "trainer.explain(\n",
            "    background_loader=trainer.train_loader,\n",
            "    explain_loader=trainer.test_loader,\n",
            "    save_dir=save_dir / 'explanation'\n",
            ")"
        ]
    }


    # --- Task-Specific Cells ---
    if kind == 'classification':
        title = "Classification Tutorial"
        data_prep_source = [
            "### 2. Prepare the Data\n",
            "For this example, we'll generate some simple, linearly separable mock data for a binary classification task. We'll then wrap it in PyTorch `TensorDataset` objects."
        ]
        data_creation_source = [
            "from sklearn.datasets import make_classification\n",
            "from sklearn.model_selection import train_test_split\n",
            "\n",
            "X, y = make_classification(n_samples=200, n_features=10, n_informative=5, n_redundant=0, random_state=42)\n",
            "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
            "\n",
            "# Convert to PyTorch tensors\n",
            "X_train = torch.FloatTensor(X_train)\n",
            "y_train = torch.LongTensor(y_train)\n",
            "X_test = torch.FloatTensor(X_test)\n",
            "y_test = torch.LongTensor(y_test)\n",
            "\n",
            "# Create TensorDatasets\n",
            "train_dataset = TensorDataset(X_train, y_train)\n",
            "test_dataset = TensorDataset(X_test, y_test)"
        ]
        model_creation_source = [
            "class SimpleClassifier(nn.Module):\n",
            "    def __init__(self, input_features, num_classes):\n",
            "        super().__init__()\n",
            "        self.layer_1 = nn.Linear(input_features, 32)\n",
            "        self.layer_2 = nn.Linear(32, num_classes)\n",
            "        self.relu = nn.ReLU()\n",
            "    \n",
            "    def forward(self, x):\n",
            "        return self.layer_2(self.relu(self.layer_1(x)))\n",
            "\n",
            "model = SimpleClassifier(input_features=10, num_classes=2)\n",
            "criterion = nn.CrossEntropyLoss()\n",
            "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
        ]

    elif kind == 'regression':
        title = "Regression Tutorial"
        data_prep_source = [
            "### 2. Prepare the Data\n",
            "For this example, we'll generate some simple mock data for a regression task. We'll then wrap it in PyTorch `TensorDataset` objects."
        ]
        data_creation_source = [
            "from sklearn.datasets import make_regression\n",
            "from sklearn.model_selection import train_test_split\n",
            "\n",
            "X, y = make_regression(n_samples=200, n_features=5, noise=15, random_state=42)\n",
            "y = y.reshape(-1, 1) # Reshape for compatibility with MSELoss\n",
            "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
            "\n",
            "# Convert to PyTorch tensors\n",
            "X_train = torch.FloatTensor(X_train)\n",
            "y_train = torch.FloatTensor(y_train)\n",
            "X_test = torch.FloatTensor(X_test)\n",
            "y_test = torch.FloatTensor(y_test)\n",
            "\n",
            "# Create TensorDatasets\n",
            "train_dataset = TensorDataset(X_train, y_train)\n",
            "test_dataset = TensorDataset(X_test, y_test)"
        ]
        model_creation_source = [
            "class SimpleRegressor(nn.Module):\n",
            "    def __init__(self, input_features, output_features):\n",
            "        super().__init__()\n",
            "        self.layer_1 = nn.Linear(input_features, 32)\n",
            "        self.layer_2 = nn.Linear(32, output_features)\n",
            "        self.relu = nn.ReLU()\n",
            "    \n",
            "    def forward(self, x):\n",
            "        return self.layer_2(self.relu(self.layer_1(x)))\n",
            "\n",
            "model = SimpleRegressor(input_features=5, output_features=1)\n",
            "criterion = nn.MSELoss()\n",
            "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
        ]
    else:
        raise ValueError("kind must be 'classification' or 'regression'")

    # --- Assemble Notebook ---
    cells = [
        {"cell_type": "markdown", "source": [f"# Dragon ML Toolbox - {title}\n", "This notebook demonstrates how to use the `MyTrainer` class for a complete training and evaluation workflow."]},
        {"cell_type": "markdown", "source": ["### 1. Imports\n", "First, let's import all the necessary components."]},
        imports_cell,
        {"cell_type": "markdown", "source": data_prep_source},
        {"cell_type": "code", "source": data_creation_source},
        model_definition_cell,
        {"cell_type": "code", "source": model_creation_source},
        {"cell_type": "markdown", "source": ["### 4. Configure Callbacks\n", "We'll set up `ModelCheckpoint` to save the best model and `EarlyStopping` to prevent overfitting."]},
        callbacks_cell,
        {"cell_type": "markdown", "source": ["### 5. Initialize the Trainer\n", "First, we'll determine the best device to run on. Then, we can instantiate `MyTrainer` with all our components."]},
        device_cell,
        trainer_instantiation_cell,
        {"cell_type": "markdown", "source": ["### 6. Train the Model\n", "Call the `.fit()` method to start training."]},
        fit_cell,
        {"cell_type": "markdown", "source": ["### 7. Evaluate the Model\n", "Finally, call the `.evaluate()` method to see the performance report and save all plots and metrics."]},
        evaluation_cell,
        {"cell_type": "markdown", "source": ["### 8. Explain the Model\n", "We can also use the `.explain()` method to generate and save SHAP plots for model interpretability."]},
        explanation_cell,
    ]

    # Add execution counts to code cells
    for cell in cells:
        if cell['cell_type'] == 'code':
            cell['execution_count'] = None
            cell['metadata'] = {}
            cell['outputs'] = []

    return cells


def generate_notebook(kind: Literal['classification', 'regression'] = 'classification', filepath: Optional[Union[str, Path]] = None):
    """
    Generates a tutorial Jupyter Notebook (.ipynb) file.

    This function creates a complete, runnable notebook with mock data,
    a simple model, and a full training/evaluation cycle using MyTrainer.

    Args:
        kind (str): The type of tutorial to generate, either 'classification' or 'regression'.
        filepath (str | Path | None): The path to save the notebook file.
            If None, defaults to 'classification_tutorial.ipynb' or
            'regression_tutorial.ipynb' in the current directory.
    """
    if kind not in ["classification", "regression"]:
        raise ValueError("kind must be 'classification' or 'regression'")

    if filepath is None:
        sanitized_filepath = f"{kind}_tutorial.ipynb"
    else:
        sanitized_filepath = sanitize_filename(str(filepath))

    # check suffix
    if not sanitized_filepath.endswith(".ipynb"):
        sanitized_filepath = sanitized_filepath + ".ipynb"

    new_filepath = make_fullpath(sanitized_filepath, make=True)

    _LOGGER.info(f"Generating {kind} tutorial notebook at: {new_filepath}")

    cells = _get_notebook_content(kind)

    notebook = {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.10.0"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2
    }

    try:
        with open(new_filepath, 'w') as f:
            json.dump(notebook, f, indent=2)
        _LOGGER.info("Notebook generated successfully.")
    except Exception as e:
        _LOGGER.error(f"Error generating notebook: {e}")
```
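The new module exposes a single public function, `generate_notebook`. A minimal usage sketch, assuming `dragon-ml-toolbox` 3.0.0 is installed (the call signature is taken from the definition above):

```python
# Usage sketch for the new ML_tutorial module. The signature comes from
# the generate_notebook definition shown above; nothing else is assumed.
from ml_tools.ML_tutorial import generate_notebook

# Writes 'classification_tutorial.ipynb' to the current directory.
generate_notebook(kind='classification')

# Explicit task and output name; a missing '.ipynb' suffix is appended.
generate_notebook(kind='regression', filepath='regression_demo')
```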
ml_tools/PSO_optimization.py
CHANGED
```diff
@@ -7,20 +7,23 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import
+from .utilities import (
+    _script_info,
+    list_csv_paths,
+    threshold_binary_values,
+    threshold_binary_values_batch,
+    deserialize_object,
+    list_files_by_extension,
+    save_dataframe,
+    make_fullpath,
+    yield_dataframes_from_dir,
+    sanitize_filename)
 import torch
 from tqdm import trange
-import logging
 import matplotlib.pyplot as plt
 import seaborn as sns
 from collections import defaultdict
-
-# Configure logger
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(asctime)s] [%(levelname)s] - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S"
-)
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -304,7 +307,7 @@ def run_pso(lower_boundaries: list[float],
     else:
         device = torch.device("cpu")
 
-
+    _LOGGER.info(f"Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
@@ -352,7 +355,7 @@ def run_pso(lower_boundaries: list[float],
     save_results_path = make_fullpath(save_results_dir, make=True)
     _save_results(features, target, save_dir=save_results_path, target_name=target_name)
 
-    return features, target
+    return features, target # type: ignore
 
 
 def _pso(func: ObjectiveFunction,
@@ -526,19 +529,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         If True, generates comparative plots with distributions colored by their source target.
     """
     mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
-
+    _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
 
+    # Check results_dir
+    results_path = make_fullpath(results_dir)
+    # make output path
     output_path = make_fullpath(save_dir, make=True)
-
+
+    all_csvs = list_csv_paths(results_path)
 
-    if not
-
+    if not all_csvs:
+        _LOGGER.warning("No data found. No plots will be generated.")
         return
 
     # --- MODE 1: Color-coded plots by target ---
     if color_by_target:
         data_to_plot = []
-        for df, df_name in
+        for df, df_name in yield_dataframes_from_dir(results_path):
             # Assumes last col is target, rest are features
             melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
             # Sanitize target name for cleaner legend labels
@@ -547,7 +554,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
 
         long_df = pd.concat(data_to_plot, ignore_index=True)
         features = long_df['feature'].unique()
-
+        _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
 
        for feature_name in features:
            plt.figure(figsize=(12, 7))
@@ -569,12 +576,12 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     # --- MODE 2: Aggregate plot ---
     else:
         feature_distributions = defaultdict(list)
-        for df, _ in
+        for df, _ in yield_dataframes_from_dir(results_path):
             feature_columns = df.iloc[:, :-1]
             for feature_name in feature_columns:
                 feature_distributions[feature_name].extend(df[feature_name].tolist())
 
-
+        _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
        for feature_name, values in feature_distributions.items():
            plt.figure(figsize=(12, 7))
            sns.histplot(x=values, kde=True, bins='auto', stat="density")
@@ -589,7 +596,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         plt.savefig(plot_filename, bbox_inches='tight')
         plt.close()
 
-
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
```
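The recurring theme of this release is visible here: each module's private `logging.basicConfig` setup is replaced by a shared `_LOGGER` imported from `ml_tools.logger`. That module's contents are not shown in this diff (the summary lists it as +45 −8), so the sketch below is only an assumption of what such a shared logger typically looks like, reusing the format string removed above:

```python
# Hypothetical sketch of the shared-logger pattern adopted in 3.0.0.
# ml_tools/logger.py is not shown in this diff, so the logger name and
# configuration below are assumptions, not the package's actual code.
import logging

_LOGGER = logging.getLogger("dragon_ml_toolbox")

if not _LOGGER.handlers:  # configure once, even when imported by many modules
    _handler = logging.StreamHandler()
    _handler.setFormatter(
        logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s",
                          datefmt="%Y-%m-%d %H:%M:%S")
    )
    _LOGGER.addHandler(_handler)
    _LOGGER.setLevel(logging.INFO)
```

Centralizing the handler avoids the main pitfall of per-module `logging.basicConfig`: whichever module imports first wins, and the rest silently no-op.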
ml_tools/RNN_forecast.py
ADDED
```python
import torch
from torch import nn
import numpy as np

__all__ = [
    "rnn_forecast"
]

def rnn_forecast(model: nn.Module, start_sequence: torch.Tensor, steps: int, device: str = 'cpu'):
    """
    Runs a sequential forecast for a trained RNN-based model.

    This function iteratively predicts future time steps, where each new prediction
    is generated by feeding the previous prediction back into the model.

    Args:
        model (nn.Module): The trained PyTorch RNN model (e.g., LSTM, GRU).
        start_sequence (torch.Tensor): The initial sequence to start the forecast from.
            Shape should be (sequence_length, num_features).
        steps (int): The number of future time steps to predict.
        device (str, optional): The device to run the forecast on ('cpu', 'cuda', 'mps').
            Defaults to 'cpu'.

    Returns:
        np.ndarray: A numpy array containing the forecasted values.
    """
    model.eval()
    model.to(device)

    predictions = []
    current_sequence = start_sequence.to(device)

    with torch.no_grad():
        for _ in range(steps):
            # Get the model's prediction for the current sequence
            output = model(current_sequence.unsqueeze(0))  # Add batch dimension

            # The prediction is the last element of the output sequence
            next_pred = output[0, -1, :].view(1, -1)

            # Store the prediction
            predictions.append(next_pred.cpu().numpy())

            # Update the sequence for the next iteration:
            # Drop the first element and append the new prediction
            current_sequence = torch.cat([current_sequence[1:], next_pred], dim=0)

    # Concatenate all predictions and flatten the array for easy use
    return np.concatenate(predictions).flatten()
```
ml_tools/VIF_factor.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from statsmodels.tools.tools import add_constant
 import warnings
 from pathlib import Path
 from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -54,20 +55,20 @@ def compute_vif(
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features and verbose:
-
+            _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
                 if verbose:
-
+                    _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore and verbose:
-
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -167,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
 
     # Identify features to drop
    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
 
     result_df = df.drop(columns=to_drop)
 
     if result_df.empty:
-
+        _LOGGER.warning(f"\t⚠️ All columns were dropped.")
 
     return result_df, to_drop
 
```
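A usage sketch tying the two public functions together. The exact signature of `compute_vif` is not fully visible in this diff, so the call below is an assumption; what is grounded in the hunks above is that `drop_vif_based` consumes a DataFrame with `feature` and `VIF` columns and returns the reduced frame plus the dropped column names:

```python
import numpy as np
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

# Toy frame with a deliberately collinear column pair.
rng = np.random.default_rng(42)
df = pd.DataFrame({'a': rng.normal(size=100), 'c': rng.normal(size=100)})
df['b'] = df['a'] * 2 + rng.normal(scale=0.01, size=100)  # nearly collinear with 'a'

# Assumption: compute_vif accepts the frame directly and returns the
# 'feature'/'VIF' table that drop_vif_based indexes above.
vif_df = compute_vif(df, verbose=True)
reduced_df, dropped = drop_vif_based(df, vif_df, threshold=10)
print(dropped)  # expect the collinear column(s) to be flagged
```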