dragon-ml-toolbox 4.5.0__tar.gz → 5.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (37)
  1. {dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-5.1.0}/PKG-INFO +5 -2
  2. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/README.md +3 -1
  3. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +5 -2
  4. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +3 -1
  5. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
  6. dragon_ml_toolbox-4.5.0/ml_tools/datasetmaster.py → dragon_ml_toolbox-5.1.0/ml_tools/ML_datasetmaster.py +91 -1
  7. dragon_ml_toolbox-5.1.0/ml_tools/ML_optimization.py +236 -0
  8. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/PSO_optimization.py +8 -141
  9. dragon_ml_toolbox-5.1.0/ml_tools/optimization_tools.py +137 -0
  10. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/pyproject.toml +4 -3
  11. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/LICENSE +0 -0
  12. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/LICENSE-THIRD-PARTY.md +0 -0
  13. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ETL_engineering.py +0 -0
  16. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/GUI_tools.py +0 -0
  17. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/MICE_imputation.py +0 -0
  18. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_callbacks.py +0 -0
  19. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_evaluation.py +0 -0
  20. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_trainer.py +0 -0
  22. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/RNN_forecast.py +0 -0
  23. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/SQL.py +0 -0
  24. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/VIF_factor.py +0 -0
  25. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/__init__.py +0 -0
  26. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_logger.py +0 -0
  27. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_pytorch_models.py +0 -0
  28. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_script_info.py +0 -0
  29. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/custom_logger.py +0 -0
  30. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/data_exploration.py +0 -0
  31. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ensemble_inference.py +0 -0
  32. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ensemble_learning.py +0 -0
  33. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/handle_excel.py +0 -0
  34. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/keys.py +0 -0
  35. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/path_manager.py +0 -0
  36. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/utilities.py +0 -0
  37. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/setup.cfg +0 -0
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/PKG-INFO
+++ dragon_ml_toolbox-5.1.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.5.0
+Version: 5.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
 Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
+Requires-Dist: evotorch; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -265,5 +268,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/README.md
+++ dragon_ml_toolbox-5.1.0/README.md
@@ -124,6 +124,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -144,6 +145,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -185,5 +187,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/PKG-INFO
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.5.0
+Version: 5.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
 Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
+Requires-Dist: evotorch; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -265,5 +268,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/SOURCES.txt
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/SOURCES.txt
@@ -11,8 +11,10 @@ ml_tools/ETL_engineering.py
 ml_tools/GUI_tools.py
 ml_tools/MICE_imputation.py
 ml_tools/ML_callbacks.py
+ml_tools/ML_datasetmaster.py
 ml_tools/ML_evaluation.py
 ml_tools/ML_inference.py
+ml_tools/ML_optimization.py
 ml_tools/ML_trainer.py
 ml_tools/PSO_optimization.py
 ml_tools/RNN_forecast.py
@@ -24,10 +26,10 @@ ml_tools/_pytorch_models.py
 ml_tools/_script_info.py
 ml_tools/custom_logger.py
 ml_tools/data_exploration.py
-ml_tools/datasetmaster.py
 ml_tools/ensemble_inference.py
 ml_tools/ensemble_learning.py
 ml_tools/handle_excel.py
 ml_tools/keys.py
+ml_tools/optimization_tools.py
 ml_tools/path_manager.py
 ml_tools/utilities.py
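
The SOURCES manifest captures the module rename (`datasetmaster.py` → `ML_datasetmaster.py`) and the two new modules. Downstream imports need the new module path; a minimal sketch of the migration, using the class names listed in the module's `__all__` further down in this diff:

```python
# Before (4.5.0):
# from ml_tools.datasetmaster import DatasetMaker

# After (5.1.0) -- the module now carries the ML_ prefix:
from ml_tools.ML_datasetmaster import DatasetMaker, SimpleDatasetMaker
```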
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/requires.txt
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/requires.txt
@@ -18,6 +18,7 @@ lightgbm
 shap
 tqdm
 Pillow
+evotorch
 
 [base]
 pandas
--- dragon_ml_toolbox-4.5.0/ml_tools/datasetmaster.py
+++ dragon_ml_toolbox-5.1.0/ml_tools/ML_datasetmaster.py
@@ -21,6 +21,7 @@ from ._script_info import _script_info
 # --- public-facing API ---
 __all__ = [
     "DatasetMaker",
+    "SimpleDatasetMaker",
     "VisionDatasetMaker",
     "SequenceMaker",
     "ResizeAspectFill",
@@ -328,7 +329,7 @@ class DatasetMaker(_BaseMaker):
 
         return self.scaler.inverse_transform(data_np)
 
-    def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
+    def get_datasets(self) -> Tuple[Dataset, Dataset]:
         """Primary method to get the final PyTorch Datasets."""
         if not self._is_split:
             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
@@ -370,6 +371,95 @@ class DatasetMaker(_BaseMaker):
         return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
 
 
+# Streamlined DatasetMaker version
+class SimpleDatasetMaker:
+    """
+    A simplified dataset maker for pre-processed, numerical pandas DataFrames.
+
+    This class takes a DataFrame, automatically splits it into training and
+    testing sets, and converts them into PyTorch Datasets. It assumes the
+    target variable is the last column.
+
+    Args:
+        pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+        test_size (float): The proportion of the dataset to allocate to the
+            test split.
+        random_state (int): The seed for the random number generator for
+            reproducibility.
+        id (str | None): An optional object identifier.
+    """
+    def __init__(self, pandas_df: pandas.DataFrame, test_size: float = 0.2, random_state: int = 42, id: Optional[str]=None):
+        """
+        Attributes:
+            `train_dataset` -> PyTorch Dataset
+            `test_dataset` -> PyTorch Dataset
+            `feature_names` -> list[str]
+            `target_name` -> str
+            `id` -> str | None
+        """
+
+        if not isinstance(pandas_df, pandas.DataFrame):
+            raise TypeError("Input must be a pandas.DataFrame.")
+
+        # set id
+        self._id = id
+
+        # 1. Identify features and target
+        features = pandas_df.iloc[:, :-1]
+        target = pandas_df.iloc[:, -1]
+
+        self._feature_names = features.columns.tolist()
+        self._target_name = target.name
+
+        # 2. Split the data
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+
+        self._X_train_shape = X_train.shape
+        self._X_test_shape = X_test.shape
+        self._y_train_shape = y_train.shape
+        self._y_test_shape = y_test.shape
+
+        # 3. Convert to PyTorch Datasets
+        self._train_ds = _PytorchDataset(X_train.values, y_train.values)
+        self._test_ds = _PytorchDataset(X_test.values, y_test.values)
+
+    @property
+    def train_dataset(self) -> Dataset:
+        """Returns the training PyTorch dataset."""
+        return self._train_ds
+
+    @property
+    def test_dataset(self) -> Dataset:
+        """Returns the testing PyTorch dataset."""
+        return self._test_ds
+
+    @property
+    def feature_names(self) -> list[str]:
+        """Returns the list of feature column names."""
+        return self._feature_names
+
+    @property
+    def target_name(self) -> str:
+        """Returns the name of the target column."""
+        return str(self._target_name)
+
+    @property
+    def id(self) -> Optional[str]:
+        """Returns the object identifier, if any."""
+        return self._id
+
+    def dataframes_info(self) -> None:
+        """Prints the shape information of the split pandas DataFrames."""
+        print("--- Original DataFrame Shapes After Split ---")
+        print(f"  X_train shape: {self._X_train_shape}")
+        print(f"  y_train shape: {self._y_train_shape}\n")
+        print(f"  X_test shape: {self._X_test_shape}")
+        print(f"  y_test shape: {self._y_test_shape}")
+        print("-------------------------------------------")
+
+
 # --- VisionDatasetMaker ---
 class VisionDatasetMaker(_BaseMaker):
     """
--- /dev/null
+++ dragon_ml_toolbox-5.1.0/ml_tools/ML_optimization.py
@@ -0,0 +1,236 @@
+import torch
+import numpy
+import evotorch
+from evotorch.algorithms import CMAES, SteadyStateGA
+from evotorch.logging import StdOutLogger
+from typing import Literal, Union, Tuple, List, Optional
+from pathlib import Path
+from tqdm.auto import trange
+from contextlib import nullcontext
+
+from .path_manager import make_fullpath, sanitize_filename
+from ._logger import _LOGGER
+from ._script_info import _script_info
+from .ML_inference import PyTorchInferenceHandler
+from .keys import PyTorchInferenceKeys
+from .SQL import DatabaseManager
+from .optimization_tools import _save_result
+from .utilities import threshold_binary_values
+
+
+__all__ = [
+    "create_pytorch_problem",
+    "run_optimization"
+]
+
+
+def create_pytorch_problem(
+    handler: PyTorchInferenceHandler,
+    bounds: Tuple[List[float], List[float]],
+    binary_features: int,
+    task: Literal["minimize", "maximize"],
+    algorithm: Literal["CMAES", "GA"] = "CMAES",
+    verbose: bool = False,
+    **searcher_kwargs
+) -> Tuple[evotorch.Problem, evotorch.Searcher]:
+    """
+    Creates and configures an EvoTorch Problem and Searcher for a PyTorch model.
+
+    Args:
+        handler (PyTorchInferenceHandler): An initialized inference handler
+            containing the model and weights.
+        bounds (tuple[list[float], list[float]]): A tuple containing the lower
+            and upper bounds for the solution features.
+        binary_features (int): Number of binary features located at the END of
+            the feature vector. They will be automatically added to the bounds.
+        task (str): The optimization goal, either "minimize" or "maximize".
+        algorithm (str): The search algorithm to use, "CMAES" or "GA" (SteadyStateGA).
+        verbose (bool): Add an EvoTorch logger for real-time console updates.
+        **searcher_kwargs: Additional keyword arguments to pass to the
+            selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).
+
+    Returns:
+        A tuple containing the configured evotorch.Problem and evotorch.Searcher.
+    """
+    lower_bounds, upper_bounds = bounds
+
+    # add binary bounds
+    if binary_features > 0:
+        lower_bounds.extend([0.45] * binary_features)
+        upper_bounds.extend([0.55] * binary_features)
+
+    solution_length = len(lower_bounds)
+    device = handler.device
+
+    # Define the fitness function that EvoTorch will call.
+    @evotorch.decorators.to_tensor
+    @evotorch.decorators.on_aux_device(device)
+    def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
+        # Make a mutable copy of the solutions from the optimizer
+        processed_tensor = solution_tensor.clone()
+
+        # Apply thresholding if binary features are present
+        if binary_features > 0:
+            # Isolate the binary part of the tensor (the last n columns)
+            binary_part = processed_tensor[:, -binary_features:]
+
+            # Apply rounding to snap values to 0.0 or 1.0
+            processed_tensor[:, -binary_features:] = torch.round(binary_part)
+
+        # Use the processed tensor (with thresholded values) for prediction
+        predictions = handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
+        return predictions.flatten()
+
+    # Create the Problem instance.
+    problem = evotorch.Problem(
+        objective_sense=task,
+        objective_func=fitness_func,
+        solution_length=solution_length,
+        initial_bounds=(lower_bounds, upper_bounds),
+        device=device,
+    )
+
+    # Create the selected searcher instance.
+    if algorithm == "CMAES":
+        searcher = CMAES(problem, **searcher_kwargs)
+    elif algorithm == "GA":
+        searcher = SteadyStateGA(problem, **searcher_kwargs)
+    else:
+        raise ValueError(f"Unknown algorithm '{algorithm}'. Choose 'CMAES' or 'GA'.")
+
+    # Add a logger for real-time console updates.
+    # This gives the user immediate feedback on the optimization progress.
+    if verbose:
+        _ = StdOutLogger(searcher)
+
+    return problem, searcher
+
+
+def run_optimization(
+    problem: evotorch.Problem,
+    searcher: evotorch.Searcher,
+    num_generations: int,
+    target_name: str,
+    binary_features: int,
+    save_dir: Union[str, Path],
+    save_format: Literal['csv', 'sqlite', 'both'],
+    feature_names: Optional[List[str]],
+    repetitions: int = 1
+) -> Optional[dict]:
+    """
+    Runs the evolutionary optimization process, with support for multiple repetitions.
+
+    This function serves as the main engine for the optimization task. It takes a
+    configured Problem and a Searcher from EvoTorch and executes the optimization
+    for a specified number of generations.
+
+    It has two modes of operation:
+
+    1. **Single Run (repetitions=1):** Executes the optimization once, saves the
+       single best result to a CSV file, and returns it as a dictionary.
+    2. **Iterative Analysis (repetitions > 1):** Executes the optimization
+       multiple times. Results from each run are streamed incrementally to the
+       specified file formats (CSV and/or SQLite database). In this mode,
+       the function returns None.
+
+    Args:
+        problem (evotorch.Problem): The configured problem instance, which defines
+            the objective function, solution space, and optimization sense.
+        searcher (evotorch.Searcher): The configured searcher instance, which
+            contains the evolutionary algorithm (e.g., CMAES, GA).
+        num_generations (int): The total number of generations to run the
+            search algorithm for in each repetition.
+        target_name (str): Target name that is also used for the CSV filename
+            and the SQL table.
+        binary_features (int): Number of binary features located at the END of
+            the feature vector.
+        save_dir (str | Path): The directory where the result file(s) will be saved.
+        save_format (Literal['csv', 'sqlite', 'both']): The format for saving
+            results during iterative analysis.
+        feature_names (List[str] | None): Names of the solution features for
+            labeling the output files. If None, generic names like 'feature_0',
+            'feature_1', etc., will be created.
+        repetitions (int, optional): The number of independent times to run the
+            entire optimization process. Defaults to 1.
+
+    Returns:
+        Optional[dict]: A dictionary containing the best feature values and the
+            fitness score if `repetitions` is 1. Returns `None` if `repetitions`
+            is greater than 1, as results are streamed to files instead.
+    """
+    # preprocess paths
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+    sanitized_target_name = sanitize_filename(target_name)
+    if not sanitized_target_name.endswith(".csv"):
+        sanitized_target_name = sanitized_target_name + ".csv"
+
+    csv_path = save_path / sanitized_target_name
+
+    db_path = save_path / "Optimization.db"
+    db_table_name = target_name
+
+    # preprocess feature names
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(problem.solution_length)]
+
+    # --- SINGLE RUN LOGIC ---
+    if repetitions <= 1:
+        _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} for {num_generations} generations...")
+        for _ in trange(num_generations, desc="Optimizing"):
+            searcher.step()
+
+        best_solution_tensor, best_fitness = searcher.best
+        best_solution_np = best_solution_tensor.cpu().numpy()
+
+        # threshold binary features
+        if binary_features > 0:
+            best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+        else:
+            best_solution_thresholded = best_solution_np
+
+        result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+        result_dict[target_name] = best_fitness.item()
+
+        _save_result(result_dict, 'csv', csv_path)  # Single run defaults to CSV
+        _LOGGER.info(f"✅ Optimization complete. Best solution saved to '{csv_path.name}'")
+        return result_dict
+
+    # --- MULTIPLE REPETITIONS LOGIC ---
+    else:
+        _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
+
+        db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
+
+        with db_context as db_manager:
+            if db_manager:
+                schema = {name: "REAL" for name in feature_names}
+                schema[target_name] = "REAL"
+                db_manager.create_table(db_table_name, schema)
+
+            for i in trange(repetitions, desc="Repetitions"):
+                _LOGGER.info(f"--- Starting Repetition {i+1}/{repetitions} ---")
+
+                # CRITICAL: Re-initialize the searcher to ensure each run is independent
+                searcher.reset()
+
+                for _ in range(num_generations):  # Inner loop does not need a progress bar
+                    searcher.step()
+
+                best_solution_tensor, best_fitness = searcher.best
+                best_solution_np = best_solution_tensor.cpu().numpy()
+
+                # threshold binary features
+                if binary_features > 0:
+                    best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+                else:
+                    best_solution_thresholded = best_solution_np
+
+                result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+                result_dict[target_name] = best_fitness.item()
+
+                # Save each result incrementally
+                _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)
+
+        _LOGGER.info(f"✅ Optimal solution space analysis complete. Results saved to '{save_path}'")
+        return None
+
+
+def info():
+    _script_info(__all__)
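
Taken together, the two public functions above suggest the following workflow. This is a sketch, not documented usage: the `PyTorchInferenceHandler` construction is assumed (only its `device` attribute and `predict_batch` method appear in this diff), and the bounds, names, and hyperparameters are placeholders:

```python
from ml_tools.ML_optimization import create_pytorch_problem, run_optimization

# `handler` is assumed to be an already-initialized PyTorchInferenceHandler
# wrapping a trained regression model (construction not shown in this diff).
lower = [0.0, 0.0, 10.0]   # bounds for the continuous features
upper = [1.0, 5.0, 100.0]

problem, searcher = create_pytorch_problem(
    handler=handler,
    bounds=(lower, upper),
    binary_features=2,      # two 0/1 flags appended at the END of the vector
    task="maximize",
    algorithm="CMAES",
    stdev_init=0.5,         # forwarded to the CMAES constructor
)

# Single run: returns the best solution as a dict and writes one CSV row.
best = run_optimization(
    problem, searcher,
    num_generations=100,
    target_name="yield",
    binary_features=2,
    save_dir="./opt_results",
    save_format="csv",
    feature_names=None,     # auto-generates feature_0, feature_1, ...
    repetitions=1,
)
```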
--- dragon_ml_toolbox-4.5.0/ml_tools/PSO_optimization.py
+++ dragon_ml_toolbox-5.1.0/ml_tools/PSO_optimization.py
@@ -2,32 +2,27 @@ import numpy as np
 from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
-from typing import Literal, Union, Tuple, Dict, Optional, Any
-import pandas as pd
+from typing import Literal, Union, Tuple, Dict, Optional
 from copy import deepcopy
 from .utilities import (
     threshold_binary_values,
     threshold_binary_values_batch,
-    deserialize_object,
-    yield_dataframes_from_dir)
-from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension, list_csv_paths
+    deserialize_object)
+from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension
 import torch
 from tqdm import trange
-import matplotlib.pyplot as plt
-import seaborn as sns
 from ._logger import _LOGGER
 from .keys import ModelSaveKeys
 from ._script_info import _script_info
 from .SQL import DatabaseManager
 from contextlib import nullcontext
+from .optimization_tools import _save_result
 
 
 __all__ = [
     "ObjectiveFunction",
     "multiple_objective_functions_from_dir",
-    "parse_lower_upper_bounds",
-    "run_pso",
-    "plot_optimal_feature_distributions"
+    "run_pso"
 ]
 
 
@@ -170,18 +165,6 @@ def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise:
     return objective_functions, objective_function_names
 
 
-def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
-    """
-    Parse lower and upper boundaries, returning 2 lists:
-
-    `lower_bounds`, `upper_bounds`
-    """
-    lower = [low[0] for low in source.values()]
-    upper = [up[1] for up in source.values()]
-
-    return lower, upper
-
-
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
@@ -198,45 +181,6 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
     return names
 
 
-def _save_result(result_dict: dict,
-                 save_format: Literal['csv', 'sqlite', 'both'],
-                 csv_path: Path,
-                 db_manager: Optional[DatabaseManager] = None,
-                 db_table_name: Optional[str] = None):
-    """
-    Handles saving a single result to CSV, SQLite, or both.
-    """
-    # Save to CSV
-    if save_format in ['csv', 'both']:
-        _save_or_append_to_csv(result_dict, csv_path)
-
-    # Save to SQLite
-    if save_format in ['sqlite', 'both']:
-        if db_manager and db_table_name:
-            db_manager.insert_row(db_table_name, result_dict)
-        else:
-            _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
-
-
-def _save_or_append_to_csv(data_dict: dict, save_path: Path):
-    """
-    Saves or appends a dictionary of data as a single row to a CSV file.
-
-    If the file doesn't exist, it creates it and writes the header.
-    If the file exists, it appends the new data without the header.
-    """
-    df_row = pd.DataFrame([data_dict])
-
-    file_exists = save_path.exists()
-
-    df_row.to_csv(
-        save_path,
-        mode='a',                # 'a' for append mode
-        index=False,             # Don't write the DataFrame index
-        header=not file_exists   # Write header only if file does NOT exist
-    )
-
-
 def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int, save_format: Literal['csv', 'sqlite', 'both'], csv_path: Path, db_manager: Optional[DatabaseManager], db_table_name: str):
     """Helper for a single PSO run that also handles saving."""
     pso_args.update({"seed": random_state})
@@ -282,14 +226,14 @@ def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
             save_results_dir: Union[str,Path],
-            save_format: Literal['csv', 'sqlite', 'both'] = 'csv',
+            save_format: Literal['csv', 'sqlite', 'both'],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
            feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
             max_iterations: int=3000,
             random_state: int=101,
-            post_hoc_analysis: Optional[int]=10) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
+            post_hoc_analysis: Optional[int]=20) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
@@ -303,7 +247,7 @@ def run_pso(lower_boundaries: list[float],
         A callable object encapsulating a tree-based regression model.
     save_results_dir : str | Path
         Directory path to save the results CSV file.
-    save_format : {'csv', 'sqlite', 'both'}, default 'csv'
+    save_format : {'csv', 'sqlite', 'both'}
         The format for saving optimization results.
         - 'csv': Saves results to a CSV file.
         - 'sqlite': Saves results to an SQLite database file. ⚠️ If a database exists, new tables will be created using the target name.
@@ -578,83 +522,6 @@ def _pso(func: ObjectiveFunction,
     return best_position, best_score
 
 
-def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
-    """
-    Analyzes optimization results and plots the distribution of optimal values for each feature.
-
-    For features with more than two unique values, this function generates a color-coded
-    Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
-    showing relative frequency.
-
-    Parameters
-    ----------
-    results_dir : str or Path
-        The path to the directory containing the optimization result CSV files.
-    save_dir : str or Path
-        The directory where the output plots will be saved.
-    """
-    # Check results_dir and create output path
-    results_path = make_fullpath(results_dir)
-    output_path = make_fullpath(save_dir, make=True)
-
-    # Check that the directory contains csv files
-    list_csv_paths(results_path, verbose=False)
-
-    # --- Data Loading and Preparation ---
-    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
-    data_to_plot = []
-    for df, df_name in yield_dataframes_from_dir(results_path):
-        melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
-        melted_df['target'] = df_name.replace("Optimization_", "")
-        data_to_plot.append(melted_df)
-
-    long_df = pd.concat(data_to_plot, ignore_index=True)
-    features = long_df['feature'].unique()
-    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
-
-    # --- Plotting Loop ---
-    for feature_name in features:
-        plt.figure(figsize=(12, 7))
-        feature_df = long_df[long_df['feature'] == feature_name]
-
-        # Check if the feature is binary or constant
-        if feature_df['value'].nunique() <= 2:
-            # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
-            # This ensures the X-axis is clean (e.g., just 0 and 1).
-            norm_df = (feature_df.groupby('target')['value']
-                       .value_counts(normalize=True)
-                       .mul(100)
-                       .rename('percent')
-                       .reset_index())
-
-            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
-
-            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-            plt.ylabel("Frequency (%)", fontsize=12)
-            ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
-
-        else:
-            # PLOT 2: KDE plot for continuous values.
-            ax = sns.kdeplot(data=feature_df, x='value', hue='target',
-                             fill=True, alpha=0.1, warn_singular=False)
-
-            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-            plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
-
-        # --- Common settings for both plot types ---
-        plt.xlabel("Feature Value", fontsize=12)
-        plt.grid(axis='y', alpha=0.5, linestyle='--')
-
-        legend = ax.get_legend()
-        if legend:
-            legend.set_title('Target')
-
-        sanitized_feature_name = sanitize_filename(feature_name)
-        plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
-        plt.savefig(plot_filename, bbox_inches='tight')
-        plt.close()
-
-    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
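
For existing `run_pso` callers this diff carries two breaking changes: `save_format` lost its `'csv'` default and must now be passed explicitly, and `post_hoc_analysis` now defaults to 20 repetitions instead of 10. The parsing and plotting helpers moved to `optimization_tools` (next file). An updated call site might look like this sketch, where `objective` stands in for a previously constructed `ObjectiveFunction` and the boundaries are placeholders:

```python
from ml_tools.PSO_optimization import run_pso

# `objective` is assumed to be an ObjectiveFunction built beforehand.
run_pso(
    lower_boundaries=[0.0, 0.0],
    upper_boundaries=[1.0, 10.0],
    objective_function=objective,
    save_results_dir="./pso_results",
    save_format="csv",  # now required: 'csv', 'sqlite', or 'both'
)  # post_hoc_analysis now defaults to 20 repetitions instead of 10
```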
--- /dev/null
+++ dragon_ml_toolbox-5.1.0/ml_tools/optimization_tools.py
@@ -0,0 +1,137 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from typing import Union, Any, Literal, Optional
+from pathlib import Path
+import pandas as pd
+
+from .path_manager import make_fullpath, list_csv_paths, sanitize_filename
+from .utilities import yield_dataframes_from_dir
+from ._logger import _LOGGER
+from ._script_info import _script_info
+from .SQL import DatabaseManager
+
+
+__all__ = [
+    "parse_lower_upper_bounds",
+    "plot_optimal_feature_distributions"
+]
+
+
+def parse_lower_upper_bounds(source: dict[str, tuple[Any, Any]]):
+    """
+    Parse lower and upper boundaries, returning 2 lists:
+
+    `lower_bounds`, `upper_bounds`
+    """
+    lower = [low[0] for low in source.values()]
+    upper = [up[1] for up in source.values()]
+
+    return lower, upper
+
+
+def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
+    """
+    Analyzes optimization results and plots the distribution of optimal values for each feature.
+
+    For features with more than two unique values, this function generates a color-coded
+    Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+    showing relative frequency.
+
+    Parameters
+    ----------
+    results_dir : str or Path
+        The path to the directory containing the optimization result CSV files.
+    save_dir : str or Path
+        The directory where the output plots will be saved.
+    """
+    # Check results_dir and create output path
+    results_path = make_fullpath(results_dir)
+    output_path = make_fullpath(save_dir, make=True)
+
+    # Check that the directory contains csv files
+    list_csv_paths(results_path, verbose=False)
+
+    # --- Data Loading and Preparation ---
+    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+    data_to_plot = []
+    for df, df_name in yield_dataframes_from_dir(results_path):
+        melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+        melted_df['target'] = df_name.replace("Optimization_", "")
+        data_to_plot.append(melted_df)
+
+    long_df = pd.concat(data_to_plot, ignore_index=True)
+    features = long_df['feature'].unique()
+    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+    # --- Plotting Loop ---
+    for feature_name in features:
+        plt.figure(figsize=(12, 7))
+        feature_df = long_df[long_df['feature'] == feature_name]
+
+        # Check if the feature is binary or constant
+        if feature_df['value'].nunique() <= 2:
+            # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+            # This ensures the X-axis is clean (e.g., just 0 and 1).
+            norm_df = (feature_df.groupby('target')['value']
+                       .value_counts(normalize=True)
+                       .mul(100)
+                       .rename('percent')
+                       .reset_index())
+
+            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
+
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Frequency (%)", fontsize=12)
+            ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
+
+        else:
+            # PLOT 2: KDE plot for continuous values.
+            ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+                             fill=True, alpha=0.1, warn_singular=False)
+
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
+
+        # --- Common settings for both plot types ---
+        plt.xlabel("Feature Value", fontsize=12)
+        plt.grid(axis='y', alpha=0.5, linestyle='--')
+
+        legend = ax.get_legend()
+        if legend:
+            legend.set_title('Target')
+
+        sanitized_feature_name = sanitize_filename(feature_name)
+        plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+        plt.savefig(plot_filename, bbox_inches='tight')
+        plt.close()
+
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
+
+
+def _save_result(
+    result_dict: dict,
+    save_format: Literal['csv', 'sqlite', 'both'],
+    csv_path: Path,
+    db_manager: Optional[DatabaseManager] = None,
+    db_table_name: Optional[str] = None
+):
+    """
+    Private helper to handle saving a single result to CSV, SQLite, or both.
+    """
+    # Save to CSV
+    if save_format in ['csv', 'both']:
+        df_row = pd.DataFrame([result_dict])
+        file_exists = csv_path.exists()
+        df_row.to_csv(csv_path, mode='a', index=False, header=not file_exists)
+
+    # Save to SQLite
+    if save_format in ['sqlite', 'both']:
+        if db_manager and db_table_name:
+            db_manager.insert_row(db_table_name, result_dict)
+        else:
+            _LOGGER.warning("⚠️ SQLite saving requested but db_manager or table_name not provided.")
+
+
+def info():
+    _script_info(__all__)
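
The two relocated helpers are self-contained, so only the import path changes for existing users. `parse_lower_upper_bounds` is pure and easy to verify; `plot_optimal_feature_distributions` consumes the CSVs written by either optimizer. A minimal sketch, assuming result CSVs already exist under `./pso_results`:

```python
from ml_tools.optimization_tools import (
    parse_lower_upper_bounds,
    plot_optimal_feature_distributions,
)

# Dict of feature -> (lower, upper) pairs, unpacked into two parallel lists.
lower, upper = parse_lower_upper_bounds({
    "temperature": (20.0, 80.0),
    "pressure": (1.0, 5.0),
})
assert lower == [20.0, 1.0]
assert upper == [80.0, 5.0]

# Reads every CSV in the results directory and writes one SVG per feature.
plot_optimal_feature_distributions(results_dir="./pso_results", save_dir="./plots")
```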
--- dragon_ml_toolbox-4.5.0/pyproject.toml
+++ dragon_ml_toolbox-5.1.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "4.5.0"
+version = "5.1.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -27,7 +27,7 @@ base = [
     "joblib"
 ]
 
-# Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support if pytorch models are used
+# Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support
 ML = [
     "numpy",
     "pandas",
@@ -46,7 +46,8 @@ ML = [
     "lightgbm",
     "shap",
     "tqdm",
-    "Pillow"
+    "Pillow",
+    "evotorch"
 ]
 
 # MICE and VIF - Requires a new virtual-env due to dependency version conflicts
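
With `evotorch` added to the `ML` extra, the new `ML_optimization` module installs through the same extras syntax the README already uses, e.g. `pip install "dragon-ml-toolbox[ML]"` (assuming the extra keeps the `ML` name defined in this `pyproject.toml`).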