dragon-ml-toolbox 1.4.5__tar.gz → 1.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {dragon_ml_toolbox-1.4.5/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.7}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/data_exploration.py +36 -5
  4. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/particle_swarm_optimization.py +50 -12
  5. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/utilities.py +39 -4
  6. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/LICENSE +0 -0
  8. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/README.md +0 -0
  10. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/VIF_factor.py +0 -0
  16. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/__init__.py +0 -0
  17. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/ensemble_learning.py +0 -0
  19. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/handle_excel.py +0 -0
  20. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/logger.py +0 -0
  21. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.5 → dragon_ml_toolbox-1.4.7}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.5
+Version: 1.4.7
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.5
+Version: 1.4.7
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/data_exploration.py
@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, Iterator
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re
 
 
 # Keep track of all available tools, show using `info()`
@@ -22,7 +23,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "match_and_filter_columns_by_regex"
 ]
 
 
@@ -245,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )
 
-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -255,6 +254,8 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()
 
     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
@@ -518,6 +519,36 @@ def clip_outliers_multi(
     return new_df
 
 
+def match_and_filter_columns_by_regex(
+    df: pd.DataFrame,
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
+    """
+    if escape_pattern:
+        pattern = re.escape(pattern)
+
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns
+
+
 def _is_notebook():
     return get_ipython() is not None
 
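For orientation, here is a minimal usage sketch of the new `match_and_filter_columns_by_regex` helper; the DataFrame and its column names are hypothetical, not part of the package:

import pandas as pd
from ml_tools.data_exploration import match_and_filter_columns_by_regex

# Hypothetical DataFrame with mixed column names
df = pd.DataFrame({
    "temp_min": [1.0, 2.0],
    "temp_max": [3.0, 4.0],
    "humidity": [0.4, 0.5],
})

# Keep only the columns whose names start with "temp"
filtered_df, matched = match_and_filter_columns_by_regex(df, r"^temp")
# Prints: 2 column(s) match the regex pattern '^temp'.
# matched == ["temp_min", "temp_max"]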
ml_tools/particle_swarm_optimization.py
@@ -8,11 +8,13 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object
+from copy import deepcopy
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension
 
 
 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]
 
@@ -29,12 +31,12 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-    binary_features : int, default=0
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
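Note that `binary_features` lost its `=0` default here, so 1.4.5-style calls that omitted it now raise a `TypeError`. A sketch of the updated call, with a hypothetical model path:

from ml_tools.particle_swarm_optimization import ObjectiveFunction

objective = ObjectiveFunction(
    trained_model_path="model.joblib",  # hypothetical path to a serialized model
    add_noise=True,
    task="maximization",
    binary_features=0,  # now explicit; pass 0 for purely continuous feature vectors
)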
@@ -96,6 +98,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+            - list of `ObjectiveFunction` instances.
+            - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
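A hedged usage sketch for the new loader; the directory is hypothetical and only needs to contain `.joblib` files that `ObjectiveFunction` can deserialize:

from ml_tools.particle_swarm_optimization import multiple_objective_functions_from_dir

# Hypothetical directory holding model_a.joblib, model_b.joblib, ...
objectives, names = multiple_objective_functions_from_dir(
    directory="models/",
    add_noise=False,
    task="maximization",
    binary_features=2,
)
for name, objective in zip(names, objectives):
    print(name, objective)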
@@ -131,9 +162,9 @@ def run_pso(lower_boundaries: list[float],
            target_name: Union[str, None]=None,
            feature_names: Union[list[str], None]=None,
            swarm_size: int=200,
-           max_iterations: int=1500,
+           max_iterations: int=1000,
            inequality_constrain_function=None,
-           post_hoc_analysis: Optional[int]=5,
+           post_hoc_analysis: Optional[int]=3,
            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
@@ -180,18 +211,25 @@
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     """
+    # set local deep copies to prevent in-place list modification
+    local_lower_boundaries = deepcopy(lower_boundaries)
+    local_upper_boundaries = deepcopy(upper_boundaries)
+
     # Append binary boundaries
     binary_number = objective_function.binary_features
     if auto_binary_boundaries and binary_number > 0:
-        lower_boundaries.extend([0] * binary_number)
-        upper_boundaries.extend([1] * binary_number)
+        local_lower_boundaries.extend([0] * binary_number)
+        local_upper_boundaries.extend([1] * binary_number)
+
+    # Set the total length of features
+    size_of_features = len(local_lower_boundaries)
 
-    lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
+    lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)
 
     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
-    names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
+    names = _set_feature_names(size=size_of_features, names=feature_names)
 
     # target name
     if target_name is None and objective_function.target_name is not None:
@@ -233,7 +271,7 @@ def run_pso(lower_boundaries: list[float],
         return best_features_named, best_target_named
     else:
         all_best_targets = list()
-        all_best_features = [[] for _ in range(len(lower_boundaries))]
+        all_best_features = [[] for _ in range(size_of_features)]
         for _ in range(post_hoc_analysis):
             best_features, best_target, *_ = _pso(**arguments)
             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
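The `deepcopy` change above is behavioral: in 1.4.5, `run_pso` with `auto_binary_boundaries=True` extended the caller's boundary lists in place, so a second call saw already-lengthened lists. A self-contained sketch of the difference, using made-up values:

from copy import deepcopy

lower = [0.0, 0.0]                 # caller-owned boundaries
binary_number = 2

# 1.4.5 behavior: extending the argument mutates the caller's list
lower.extend([0] * binary_number)
assert lower == [0.0, 0.0, 0, 0]   # silently grew

# 1.4.7 behavior: extend a local deep copy instead
lower = [0.0, 0.0]
local_lower = deepcopy(lower)
local_lower.extend([0] * binary_number)
assert lower == [0.0, 0.0]         # caller's list untouched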
ml_tools/utilities.py
@@ -13,6 +13,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
@@ -34,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
         directory (str): Path to the directory containing `.csv` files.
 
     Returns:
-        (dict[str, str]): Mapping {name, path}.
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()
 
@@ -48,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}
 
-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")
 
     return name_path_dict
 
 
+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns a mapping
+    of filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
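A minimal usage sketch for the new `list_files_by_extension` utility; the directory and its files are hypothetical:

from ml_tools.utilities import list_files_by_extension

# Hypothetical directory containing model_a.joblib and model_b.joblib
models = list_files_by_extension(directory="models/", extension=".joblib")
# Prints the "📂 'JOBLIB' files found:" listing, then returns e.g.
# {"model_a": "/abs/path/models/model_a.joblib", "model_b": "/abs/path/models/model_b.joblib"}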
@@ -404,8 +439,8 @@ def distribute_datasets_by_target(
     Yields
     ------
     Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
+        * Target name.
+        * Pandas DataFrame.
     """
     # Validate path
     if isinstance(df_or_path, str):
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "1.4.5"
+version = "1.4.7"
 description = "A collection of tools for data science and machine learning projects"
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }