dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff reflects the content of publicly available package versions released to a supported public registry, and is provided for informational purposes only.
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/METADATA +7 -4
- dragon_ml_toolbox-3.1.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +49 -19
- ml_tools/GUI_tools.py +24 -25
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/data_exploration.py +2 -2
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.4.0.dist-info/RECORD +0 -22
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/PSO_optimization.py
CHANGED

```diff
@@ -7,20 +7,23 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import
+from .utilities import (
+    _script_info,
+    list_csv_paths,
+    threshold_binary_values,
+    threshold_binary_values_batch,
+    deserialize_object,
+    list_files_by_extension,
+    save_dataframe,
+    make_fullpath,
+    yield_dataframes_from_dir,
+    sanitize_filename)
 import torch
 from tqdm import trange
-import logging
 import matplotlib.pyplot as plt
 import seaborn as sns
 from collections import defaultdict
-
-# Configure logger
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(asctime)s] [%(levelname)s] - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S"
-)
+from .logger import _LOGGER
 
 
 __all__ = [
```
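The module-level `logging.basicConfig` call is replaced by a shared `_LOGGER` imported from `ml_tools/logger.py`, which this release also reworks (+45 -8) but whose body is not shown in this diff. A minimal sketch of what such a shared logger module might look like, reusing the format string the old code configured (the names and structure here are assumptions, not the package's actual implementation):

```python
# Hypothetical sketch of ml_tools/logger.py; the real module is not in this diff.
import logging

_LOGGER = logging.getLogger("dragon_ml_toolbox")
if not _LOGGER.handlers:  # avoid attaching duplicate handlers on repeated imports
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter(
        "[%(asctime)s] [%(levelname)s] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    ))
    _LOGGER.addHandler(_handler)
    _LOGGER.setLevel(logging.INFO)
```

Centralizing the logger this way avoids each module calling `logging.basicConfig`, which mutates the root logger globally.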
```diff
@@ -304,7 +307,7 @@ def run_pso(lower_boundaries: list[float],
     else:
         device = torch.device("cpu")
 
-
+    _LOGGER.info(f"Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
```
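Only the `else` branch of the device selection is visible in the hunk; the branches above it fall outside the context window. One plausible shape for the full resolution, assuming the usual CUDA/MPS fallback order (this is a reconstruction, not the actual `run_pso` code):

```python
import torch

# Assumed reconstruction of the branches above the `else` shown in the hunk.
def _resolve_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():  # Apple Silicon
        return torch.device("mps")
    return torch.device("cpu")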
```diff
@@ -352,7 +355,7 @@ def run_pso(lower_boundaries: list[float],
         save_results_path = make_fullpath(save_results_dir, make=True)
         _save_results(features, target, save_dir=save_results_path, target_name=target_name)
 
-    return features, target
+    return features, target # type: ignore
 
 
 def _pso(func: ObjectiveFunction,
```
```diff
@@ -526,19 +529,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         If True, generates comparative plots with distributions colored by their source target.
     """
     mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
-
+    _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
 
+    # Check results_dir
+    results_path = make_fullpath(results_dir)
+    # make output path
     output_path = make_fullpath(save_dir, make=True)
-
+
+    all_csvs = list_csv_paths(results_path)
 
-    if not
-
+    if not all_csvs:
+        _LOGGER.warning("No data found. No plots will be generated.")
         return
 
     # --- MODE 1: Color-coded plots by target ---
     if color_by_target:
         data_to_plot = []
-        for df, df_name in
+        for df, df_name in yield_dataframes_from_dir(results_path):
             # Assumes last col is target, rest are features
             melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
             # Sanitize target name for cleaner legend labels
@@ -547,7 +554,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
 
         long_df = pd.concat(data_to_plot, ignore_index=True)
         features = long_df['feature'].unique()
-
+        _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
 
         for feature_name in features:
             plt.figure(figsize=(12, 7))
@@ -569,12 +576,12 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     # --- MODE 2: Aggregate plot ---
     else:
         feature_distributions = defaultdict(list)
-        for df, _ in
+        for df, _ in yield_dataframes_from_dir(results_path):
             feature_columns = df.iloc[:, :-1]
             for feature_name in feature_columns:
                 feature_distributions[feature_name].extend(df[feature_name].tolist())
 
-
+        _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
         for feature_name, values in feature_distributions.items():
             plt.figure(figsize=(12, 7))
             sns.histplot(x=values, kde=True, bins='auto', stat="density")
@@ -589,7 +596,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
             plt.savefig(plot_filename, bbox_inches='tight')
             plt.close()
 
-
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
```
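Both plotting modes now route file discovery through `list_csv_paths` and `yield_dataframes_from_dir` from `ml_tools/utilities.py`, whose bodies are not part of this diff. A hypothetical minimal version consistent with how they are called here (a CSV path listing plus a `(DataFrame, name)` generator); the real implementations may differ:

```python
# Hypothetical minimal helpers, inferred only from their call sites above.
from pathlib import Path
from typing import Iterator, Tuple, Union
import pandas as pd

def list_csv_paths(directory: Union[str, Path]) -> list[Path]:
    # Sorted so plot aggregation order is deterministic across runs
    return sorted(Path(directory).glob("*.csv"))

def yield_dataframes_from_dir(directory: Union[str, Path]) -> Iterator[Tuple[pd.DataFrame, str]]:
    for csv_path in list_csv_paths(directory):
        yield pd.read_csv(csv_path), csv_path.stem
```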
ml_tools/RNN_forecast.py
ADDED

```diff
@@ -0,0 +1,49 @@
+import torch
+from torch import nn
+import numpy as np
+
+__all__ = [
+    "rnn_forecast"
+]
+
+def rnn_forecast(model: nn.Module, start_sequence: torch.Tensor, steps: int, device: str = 'cpu'):
+    """
+    Runs a sequential forecast for a trained RNN-based model.
+
+    This function iteratively predicts future time steps, where each new prediction
+    is generated by feeding the previous prediction back into the model.
+
+    Args:
+        model (nn.Module): The trained PyTorch RNN model (e.g., LSTM, GRU).
+        start_sequence (torch.Tensor): The initial sequence to start the forecast from.
+                                       Shape should be (sequence_length, num_features).
+        steps (int): The number of future time steps to predict.
+        device (str, optional): The device to run the forecast on ('cpu', 'cuda', 'mps').
+                                Defaults to 'cpu'.
+
+    Returns:
+        np.ndarray: A numpy array containing the forecasted values.
+    """
+    model.eval()
+    model.to(device)
+
+    predictions = []
+    current_sequence = start_sequence.to(device)
+
+    with torch.no_grad():
+        for _ in range(steps):
+            # Get the model's prediction for the current sequence
+            output = model(current_sequence.unsqueeze(0)) # Add batch dimension
+
+            # The prediction is the last element of the output sequence
+            next_pred = output[0, -1, :].view(1, -1)
+
+            # Store the prediction
+            predictions.append(next_pred.cpu().numpy())
+
+            # Update the sequence for the next iteration:
+            # Drop the first element and append the new prediction
+            current_sequence = torch.cat([current_sequence[1:], next_pred], dim=0)
+
+    # Concatenate all predictions and flatten the array for easy use
+    return np.concatenate(predictions).flatten()
```
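Because `rnn_forecast` indexes the model output as `output[0, -1, :]`, it expects `forward` to return a single tensor of shape `(batch, seq_len, num_features)`. A minimal usage sketch with a hypothetical wrapper model (not part of the package) that satisfies this contract:

```python
import torch
from torch import nn
from ml_tools.RNN_forecast import rnn_forecast

# Hypothetical wrapper: nn.LSTM alone returns (output, (h, c)), so a thin
# module is needed to expose a plain (batch, seq_len, num_features) tensor.
class TinyForecaster(nn.Module):
    def __init__(self, num_features: int = 1, hidden_size: int = 16):
        super().__init__()
        self.lstm = nn.LSTM(num_features, hidden_size, batch_first=True)
        self.head = nn.Linear(hidden_size, num_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, _ = self.lstm(x)   # (batch, seq_len, hidden_size)
        return self.head(out)   # (batch, seq_len, num_features)

model = TinyForecaster()        # untrained, for illustration only
start = torch.randn(24, 1)      # (sequence_length, num_features)
forecast = rnn_forecast(model, start_sequence=start, steps=12)
print(forecast.shape)           # (12,)
```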
ml_tools/VIF_factor.py
CHANGED

```diff
@@ -8,6 +8,7 @@ from statsmodels.tools.tools import add_constant
 import warnings
 from pathlib import Path
 from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -54,20 +55,20 @@ def compute_vif(
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features and verbose:
-
+            _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
                 if verbose:
-
+                    _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore and verbose:
-
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -167,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
 
     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
 
     result_df = df.drop(columns=to_drop)
 
     if result_df.empty:
-
+        _LOGGER.warning(f"\t⚠️ All columns were dropped.")
 
     return result_df, to_drop
 
```
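For context on what `compute_vif` is filtering columns for: the variance inflation factor itself comes from statsmodels (the file already imports `add_constant` from `statsmodels.tools.tools`). A standalone sketch of the core computation, independent of the package's exact implementation:

```python
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Standalone sketch of the core VIF computation; compute_vif() in this
# package wraps the same idea with column filtering, logging, and plotting.
def vif_table(df: pd.DataFrame) -> pd.DataFrame:
    numeric = df.select_dtypes(include="number")
    X = add_constant(numeric, has_constant="add")  # intercept for proper OLS fits
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    table = pd.DataFrame({"feature": X.columns, "VIF": vifs})
    return table[table["feature"] != "const"].reset_index(drop=True)
```

A VIF above ~10 (the default threshold in `drop_vif_based`) is a common rule of thumb for problematic multicollinearity.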
ml_tools/data_exploration.py
CHANGED

```diff
@@ -587,14 +587,14 @@ def standardize_percentages(
     Standardizes numeric columns containing mixed-format percentages.
 
     This function cleans columns where percentages might be entered as whole
-    numbers (
+    numbers (55) and as proportions (0.55). It assumes values
     between 0 and 1 are proportions and multiplies them by 100.
 
     Args:
         df (pd.Dataframe): The input pandas DataFrame.
         columns (list[str]): A list of column names to standardize.
         treat_one_as_proportion (bool):
-            - If True (default): The value `1` is treated as a proportion and converted to `100
+            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.
 
```
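The docstring fix spells out the ambiguity the function resolves: a value of exactly `1` can mean either a proportion (100%) or a literal 1%. A hedged sketch of per-column logic consistent with the corrected docstring (the function body itself is not shown in this diff):

```python
import pandas as pd

# Sketch of the per-column rule the docstring describes; the actual
# standardize_percentages() implementation is not part of this diff.
def _standardize_column(s: pd.Series, treat_one_as_proportion: bool = True,
                        round_digits: int = 2) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    # Values strictly between 0 and 1 are always proportions; the value 1
    # itself is ambiguous and is resolved by treat_one_as_proportion.
    if treat_one_as_proportion:
        mask = (s > 0) & (s <= 1)
    else:
        mask = (s > 0) & (s < 1)
    return s.mask(mask, s * 100).round(round_digits)
```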