dragon-ml-toolbox 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,21 +1,21 @@
1
- dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-2.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
- ml_tools/ETL_engineering.py,sha256=9Lg-anXhggtdzvRPgVVSiAUGu5sb-LAZDfLDFXJlHns,21328
1
+ dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-2.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
3
+ ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
4
4
  ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
5
- ml_tools/PSO_optimization.py,sha256=T-wnB94DcRWuRd2M3loDVT4POtIP0MOhs-VilAf1L4E,20974
5
+ ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
6
6
  ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
7
7
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
9
- ml_tools/data_exploration.py,sha256=CDUVRTHfww105IXDRpBQ81KZWx5HXSsA-FVsVYBzNw8,21298
9
+ ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
10
10
  ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
11
11
  ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
12
12
  ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
13
13
  ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
14
14
  ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
15
15
  ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
16
- ml_tools/utilities.py,sha256=A7Wm1ArpqFG80WKmnkYdtSzIRLvg5x-9nPNidZIbpPA,20671
16
+ ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
17
17
  ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
18
- dragon_ml_toolbox-2.2.0.dist-info/METADATA,sha256=oTLE1Q6BzsIwicQM7XCumt89XAjHZcV6CxDTfyteP_w,2974
19
- dragon_ml_toolbox-2.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- dragon_ml_toolbox-2.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
21
- dragon_ml_toolbox-2.2.0.dist-info/RECORD,,
18
+ dragon_ml_toolbox-2.3.0.dist-info/METADATA,sha256=4wivV_JKPd83xNzf6xzSfCwxiZgvYL5uW4yE6Da8tnU,2974
19
+ dragon_ml_toolbox-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ dragon_ml_toolbox-2.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
21
+ dragon_ml_toolbox-2.3.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -2,19 +2,120 @@ import polars as pl
2
2
  import re
3
3
  from typing import Literal, Union, Optional, Any, Callable, List, Dict
4
4
  from .utilities import _script_info
5
+ import pandas as pd
5
6
 
6
7
 
7
8
  __all__ = [
9
+ "ColumnCleaner",
10
+ "DataFrameCleaner"
8
11
  "TransformationRecipe",
9
12
  "DataProcessor",
10
13
  "KeywordDummifier",
11
14
  "NumberExtractor",
12
15
  "MultiNumberExtractor",
16
+ "RatioCalculator"
13
17
  "CategoryMapper",
18
+ "RegexMapper",
14
19
  "ValueBinner",
15
20
  "DateFeatureExtractor"
16
21
  ]
17
22
 
23
+ ########## EXTRACT and CLEAN ##########
24
+
25
+ class ColumnCleaner:
26
+ """
27
+ Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
28
+
29
+ Args:
30
+ rules (Dict[str, str]):
31
+ A dictionary where each key is a regular expression pattern and
32
+ each value is the standardized string to replace matches with.
33
+ """
34
+ def __init__(self, rules: Dict[str, str]):
35
+ if not isinstance(rules, dict):
36
+ raise TypeError("The 'rules' argument must be a dictionary.")
37
+
38
+ # Validate that all keys are valid regular expressions
39
+ for pattern in rules.keys():
40
+ try:
41
+ re.compile(pattern)
42
+ except re.error as e:
43
+ raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
44
+
45
+ self.rules = rules
46
+
47
+ def clean(self, series: pd.Series) -> pd.Series:
48
+ """
49
+ Applies the standardization rules to the provided Series (requires string data).
50
+
51
+ Non-matching values are kept as they are.
52
+
53
+ Args:
54
+ series (pd.Series): The pandas Series to clean.
55
+
56
+ Returns:
57
+ pd.Series: A new Series with the values cleaned and standardized.
58
+ """
59
+ return series.astype(str).replace(self.rules, regex=True)
60
+
61
+
62
+ class DataFrameCleaner:
63
+ """
64
+ Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
65
+
66
+ Args:
67
+ rules (Dict[str, Dict[str, str]]):
68
+ A nested dictionary where each top-level key is a column name,
69
+ and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
70
+ """
71
+ def __init__(self, rules: Dict[str, Dict[str, str]]):
72
+ if not isinstance(rules, dict):
73
+ raise TypeError("The 'rules' argument must be a nested dictionary.")
74
+
75
+ for col_name, col_rules in rules.items():
76
+ if not isinstance(col_rules, dict):
77
+ raise TypeError(
78
+ f"The value for column '{col_name}' must be a dictionary "
79
+ f"of rules, but got type {type(col_rules).__name__}."
80
+ )
81
+
82
+ self.rules = rules
83
+
84
+ def clean(self, df: pd.DataFrame) -> pd.DataFrame:
85
+ """
86
+ Applies all defined cleaning rules to the DataFrame.
87
+
88
+ Args:
89
+ df (pd.DataFrame): The pandas DataFrame to clean.
90
+
91
+ Returns:
92
+ pd.DataFrame: A new, cleaned DataFrame.
93
+ """
94
+ rule_columns = set(self.rules.keys())
95
+ df_columns = set(df.columns)
96
+
97
+ missing_columns = rule_columns - df_columns
98
+
99
+ if missing_columns:
100
+ # Report all missing columns in a single, clear error message
101
+ raise ValueError(
102
+ f"The following columns specified in the cleaning rules "
103
+ f"were not found in the DataFrame: {sorted(list(missing_columns))}"
104
+ )
105
+
106
+ # Start the process
107
+ df_cleaned = df.copy()
108
+
109
+ for column_name, column_rules in self.rules.items():
110
+ # Create and apply the specific cleaner for the column
111
+ cleaner = ColumnCleaner(rules=column_rules)
112
+ df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
113
+
114
+ return df_cleaned
115
+
116
+
117
+ ############ TRANSFORM ####################
118
+
18
119
  # Magic word for rename-only transformation
19
120
  _RENAME = "rename"
20
121
 
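The new EXTRACT and CLEAN classes above appear to target raw pandas data ahead of the Polars transform pipeline. A minimal usage sketch; the column names and regex rules are illustrative, not from the package:

import pandas as pd
from ml_tools.ETL_engineering import DataFrameCleaner

raw = pd.DataFrame({
    "pressure": ["10 bar", "12bar", "15"],
    "grade": ["Grade A", "grade-a", "B"],
})

cleaner = DataFrameCleaner(rules={
    "pressure": {r"\s*bar$": ""},              # strip a trailing unit
    "grade": {r"(?i)^grade[\s\-]*a$": "A"},    # collapse spelling variants
})
cleaned = cleaner.clean(raw)  # returns a new DataFrame; a single column can be cleaned with ColumnCleaner directly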
@@ -101,7 +202,7 @@ class DataProcessor:
101
202
  raise TypeError("The recipe must be an instance of TransformationRecipe.")
102
203
  if len(recipe) == 0:
103
204
  raise ValueError("The recipe cannot be empty.")
104
- self.recipe = recipe
205
+ self._recipe = recipe
105
206
 
106
207
  def transform(self, df: pl.DataFrame) -> pl.DataFrame:
107
208
  """
@@ -109,7 +210,7 @@ class DataProcessor:
109
210
  """
110
211
  processed_columns = []
111
212
  # Recipe object is iterable
112
- for step in self.recipe:
213
+ for step in self._recipe:
113
214
  input_col_name = step["input_col"]
114
215
  output_col_spec = step["output_col"]
115
216
  transform_action = step["transform"]
@@ -154,6 +255,49 @@ class DataProcessor:
154
255
  return pl.DataFrame()
155
256
 
156
257
  return pl.DataFrame(processed_columns)
258
+
259
+ def __str__(self) -> str:
260
+ """
261
+ Provides a detailed, human-readable string representation of the
262
+ entire processing pipeline.
263
+ """
264
+ header = "DataProcessor Pipeline"
265
+ divider = "-" * len(header)
266
+ num_steps = len(self._recipe)
267
+
268
+ lines = [
269
+ header,
270
+ divider,
271
+ f"Number of steps: {num_steps}\n"
272
+ ]
273
+
274
+ if num_steps == 0:
275
+ lines.append("No transformation steps defined.")
276
+ return "\n".join(lines)
277
+
278
+ for i, step in enumerate(self._recipe, 1):
279
+ transform_action = step["transform"]
280
+
281
+ # Get a clean name for the transformation action
282
+ if transform_action == _RENAME: # "rename"
283
+ transform_name = "Rename"
284
+ else:
285
+ # This works for both functions and class instances
286
+ transform_name = type(transform_action).__name__
287
+
288
+ lines.append(f"[{i}] Input: '{step['input_col']}'")
289
+ lines.append(f" - Transform: {transform_name}")
290
+ lines.append(f" - Output(s): {step['output_col']}")
291
+ if i < num_steps:
292
+ lines.append("") # Add a blank line between steps
293
+
294
+ return "\n".join(lines)
295
+
296
+ def inspect(self) -> None:
297
+ """
298
+ Prints the detailed string representation of the pipeline to the console.
299
+ """
300
+ print(self)
157
301
 
158
302
 
159
303
  class KeywordDummifier:
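The new __str__ and inspect methods only read the stored recipe, so they can be called on any processor built the usual way. A short sketch (the TransformationRecipe construction API is unchanged and not shown in this hunk):

from ml_tools.ETL_engineering import DataProcessor

processor = DataProcessor(recipe)  # `recipe` is a previously built TransformationRecipe
processor.inspect()                # prints the step-by-step pipeline summary
summary = str(processor)           # same text as a string, e.g. for logging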
@@ -293,8 +437,7 @@ class MultiNumberExtractor:
293
437
  """
294
438
  Extracts multiple numbers from a single polars string column into several new columns.
295
439
 
296
- This transformer is designed for one-to-many mappings, such as parsing
297
- ratios (100:30) or coordinates (10, 25) into separate columns.
440
+ This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.
298
441
 
299
442
  Args:
300
443
  num_outputs (int):
@@ -370,6 +513,59 @@ class MultiNumberExtractor:
370
513
  return pl.select(output_expressions)
371
514
 
372
515
 
516
+ class RatioCalculator:
517
+ """
518
+ A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
519
+
520
+ Args:
521
+ regex_pattern (str, optional):
522
+ The regex pattern to find the numerator and denominator. It MUST
523
+ contain exactly two capturing groups: the first for the
524
+ numerator and the second for the denominator. Defaults to a
525
+ pattern that handles common delimiters like ':' and '/'.
526
+ """
527
+ def __init__(
528
+ self,
529
+ regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
530
+ ):
531
+ # --- Validation ---
532
+ try:
533
+ if re.compile(regex_pattern).groups != 2:
534
+ raise ValueError(
535
+ "regex_pattern must contain exactly two "
536
+ "capturing groups '(...)'."
537
+ )
538
+ except re.error as e:
539
+ raise ValueError(f"Invalid regex pattern provided: {e}") from e
540
+
541
+ self.regex_pattern = regex_pattern
542
+
543
+ def __call__(self, column: pl.Series) -> pl.Series:
544
+ """
545
+ Applies the ratio calculation logic to the input column.
546
+
547
+ Args:
548
+ column (pl.Series): The input Polars Series of ratio strings.
549
+
550
+ Returns:
551
+ pl.Series: A new Series of floats containing the division result.
552
+ Returns null for invalid formats or division by zero.
553
+ """
554
+ # .extract_groups returns a struct with a field for each capture group
555
+ # e.g., {"group_1": "40", "group_2": "5"}
556
+ groups = column.str.extract_groups(self.regex_pattern)
557
+
558
+ # Extract numerator and denominator, casting to float
559
+ # strict=False ensures that non-matches become null
560
+ numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
561
+ denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
562
+
563
+ # Safely perform division, returning null if denominator is 0
564
+ return pl.when(denominator != 0).then(
565
+ numerator / denominator
566
+ ).otherwise(None)
567
+
568
+
373
569
  class CategoryMapper:
374
570
  """
375
571
  A transformer that maps string categories to specified numerical values using a dictionary.
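RatioCalculator is a callable transformer like the other pipeline steps. A hedged sketch of a direct call on a standalone Series (the values are illustrative; in normal use the instance is attached to a recipe step instead):

import polars as pl
from ml_tools.ETL_engineering import RatioCalculator

calc = RatioCalculator()                      # default pattern accepts ':' or '/' as the delimiter
mix = pl.Series("mix", ["40:5", "30/2", "no ratio"])
result = calc(mix)                            # numerator / denominator per row; non-matches and zero denominators become null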
@@ -407,7 +603,90 @@ class CategoryMapper:
407
603
  pl.Series: A new Series with categories mapped to numbers.
408
604
  """
409
605
  # Ensure the column is treated as a string for matching keys
410
- return column.cast(pl.Utf8).map_dict(self.mapping, default=self.default_value)
606
+ str_column = column.cast(pl.Utf8)
607
+
608
+ # Create a list of 'when/then' expressions, one for each mapping
609
+ mapping_expressions = [
610
+ pl.when(str_column == from_val).then(pl.lit(to_val))
611
+ for from_val, to_val in self.mapping.items()
612
+ ]
613
+
614
+ # Use coalesce to find the first non-null value.
615
+ # The default_value acts as the final fallback.
616
+ final_expr = pl.coalesce(
617
+ *mapping_expressions, # Unpack the list of expressions
618
+ pl.lit(self.default_value)
619
+ )
620
+
621
+ return pl.select(final_expr).to_series()
622
+
623
+
624
+ class RegexMapper:
625
+ """
626
+ A transformer that maps string categories to numerical values based on a
627
+ dictionary of regular expression patterns.
628
+
629
+ The class iterates through the mapping dictionary in order, and the first
630
+ pattern that matches a given string determines the output value. This
631
+ "first match wins" logic makes the order of the mapping important.
632
+
633
+ Args:
634
+ mapping (Dict[str, Union[int, float]]):
635
+ An ordered dictionary where keys are regex patterns and values are
636
+ the numbers to map to if the pattern is found.
637
+ unseen_value (Optional[Union[int, float]], optional):
638
+ The numerical value to use for strings that do not match any
639
+ of the regex patterns. If None (default), unseen values are
640
+ mapped to null.
641
+ """
642
+ def __init__(
643
+ self,
644
+ mapping: Dict[str, Union[int, float]],
645
+ unseen_value: Optional[Union[int, float]] = None,
646
+ ):
647
+ # --- Validation ---
648
+ if not isinstance(mapping, dict):
649
+ raise TypeError("The 'mapping' argument must be a dictionary.")
650
+
651
+ for pattern, value in mapping.items():
652
+ try:
653
+ re.compile(pattern)
654
+ except re.error as e:
655
+ raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
656
+ if not isinstance(value, (int, float)):
657
+ raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
658
+
659
+ self.mapping = mapping
660
+ self.unseen_value = unseen_value
661
+
662
+ def __call__(self, column: pl.Series) -> pl.Series:
663
+ """
664
+ Applies the regex mapping logic to the input column.
665
+
666
+ Args:
667
+ column (pl.Series): The input Polars Series of string data.
668
+
669
+ Returns:
670
+ pl.Series: A new Series with strings mapped to numbers based on
671
+ the first matching regex pattern.
672
+ """
673
+ # Ensure the column is treated as a string for matching
674
+ str_column = column.cast(pl.Utf8)
675
+
676
+ # Build the when/then/otherwise chain from the inside out.
677
+ # Start with the final fallback value for non-matches.
678
+ mapping_expr = pl.lit(self.unseen_value)
679
+
680
+ # Iterate through the mapping in reverse to construct the nested expression
681
+ for pattern, value in reversed(list(self.mapping.items())):
682
+ mapping_expr = (
683
+ pl.when(str_column.str.contains(pattern))
684
+ .then(pl.lit(value))
685
+ .otherwise(mapping_expr)
686
+ )
687
+
688
+ # Execute the complete expression chain and return the resulting Series
689
+ return pl.select(mapping_expr).to_series()
411
690
 
412
691
 
413
692
  class ValueBinner:
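Because RegexMapper applies "first match wins", more specific patterns should come first and broader ones last. A small sketch with illustrative patterns and values:

import polars as pl
from ml_tools.ETL_engineering import RegexMapper

mapper = RegexMapper(
    mapping={
        r"(?i)high": 3,
        r"(?i)medium": 2,
        r"(?i)low": 1,
    },
    unseen_value=0,            # strings matching no pattern map to 0 instead of null
)
levels = pl.Series("risk", ["High risk", "low", "unknown"])
mapped = mapper(levels)        # expected: 3, 1, 0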
ml_tools/PSO_optimization.py CHANGED
@@ -7,15 +7,27 @@ from sklearn.base import ClassifierMixin
7
7
  from typing import Literal, Union, Tuple, Dict, Optional
8
8
  import pandas as pd
9
9
  from copy import deepcopy
10
- from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
10
+ from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath, yield_dataframes_from_dir, sanitize_filename
11
11
  import torch
12
12
  from tqdm import trange
13
+ import logging
14
+ import matplotlib.pyplot as plt
15
+ import seaborn as sns
16
+ from collections import defaultdict
17
+
18
+ # Configure logger
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format="[%(asctime)s] [%(levelname)s] - %(message)s",
22
+ datefmt="%Y-%m-%d %H:%M:%S"
23
+ )
13
24
 
14
25
 
15
26
  __all__ = [
16
27
  "ObjectiveFunction",
17
28
  "multiple_objective_functions_from_dir",
18
- "run_pso"
29
+ "run_pso",
30
+ "plot_optimal_feature_distributions"
19
31
  ]
20
32
 
21
33
 
@@ -184,6 +196,52 @@ def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
184
196
  save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")
185
197
 
186
198
 
199
+ def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int):
200
+ """Helper for a single PSO run."""
201
+ pso_args.update({"seed": random_state})
202
+
203
+ best_features, best_target, *_ = _pso(**pso_args)
204
+
205
+ # Flip best_target if maximization was used
206
+ if objective_function.task == "maximization":
207
+ best_target = -best_target
208
+
209
+ # Threshold binary features
210
+ binary_number = objective_function.binary_features
211
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
212
+
213
+ # Name features and target
214
+ best_features_named = {name: value for name, value in zip(feature_names, best_features_threshold)}
215
+ best_target_named = {target_name: best_target}
216
+
217
+ return best_features_named, best_target_named
218
+
219
+
220
+ def _run_post_hoc_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, repetitions: int):
221
+ """Helper for post-hoc PSO analysis."""
222
+ all_best_targets = []
223
+ all_best_features = [[] for _ in range(len(feature_names))]
224
+
225
+ for _ in range(repetitions):
226
+ best_features, best_target, *_ = _pso(**pso_args)
227
+
228
+ if objective_function.task == "maximization":
229
+ best_target = -best_target
230
+
231
+ binary_number = objective_function.binary_features
232
+ best_features_threshold = threshold_binary_values(best_features, binary_number)
233
+
234
+ for i, best_feature in enumerate(best_features_threshold):
235
+ all_best_features[i].append(best_feature)
236
+ all_best_targets.append(best_target)
237
+
238
+ # Name features and target
239
+ all_best_features_named = {name: lst for name, lst in zip(feature_names, all_best_features)}
240
+ all_best_targets_named = {target_name: all_best_targets}
241
+
242
+ return all_best_features_named, all_best_targets_named
243
+
244
+
187
245
  def run_pso(lower_boundaries: list[float],
188
246
  upper_boundaries: list[float],
189
247
  objective_function: ObjectiveFunction,
@@ -236,6 +294,8 @@ def run_pso(lower_boundaries: list[float],
236
294
  -----
237
295
  - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
238
296
  """
297
+
298
+
239
299
  # Select device
240
300
  if torch.cuda.is_available():
241
301
  device = torch.device("cuda")
@@ -243,7 +303,8 @@ def run_pso(lower_boundaries: list[float],
243
303
  device = torch.device("mps")
244
304
  else:
245
305
  device = torch.device("cpu")
246
- print(f"[PSO] Using device: '{device}'")
306
+
307
+ logging.info(f"Using device: '{device}'")
247
308
 
248
309
  # set local deep copies to prevent in place list modification
249
310
  local_lower_boundaries = deepcopy(lower_boundaries)
@@ -271,7 +332,7 @@ def run_pso(lower_boundaries: list[float],
271
332
  if target_name is None:
272
333
  target_name = "Target"
273
334
 
274
- arguments = {
335
+ pso_arguments = {
275
336
  "func":objective_function,
276
337
  "lb": lower,
277
338
  "ub": upper,
@@ -281,59 +342,17 @@ def run_pso(lower_boundaries: list[float],
281
342
  "particle_output": False,
282
343
  }
283
344
 
345
+ # Dispatcher
346
+ if post_hoc_analysis is None or post_hoc_analysis <= 1:
347
+ features, target = _run_single_pso(objective_function, pso_arguments, names, target_name, random_state)
348
+ else:
349
+ features, target = _run_post_hoc_pso(objective_function, pso_arguments, names, target_name, post_hoc_analysis)
350
+
351
+ # --- Save Results ---
284
352
  save_results_path = make_fullpath(save_results_dir, make=True)
353
+ _save_results(features, target, save_dir=save_results_path, target_name=target_name)
285
354
 
286
- if post_hoc_analysis is None or post_hoc_analysis == 1:
287
- arguments.update({"seed": random_state})
288
-
289
- best_features, best_target, *_ = _pso(**arguments)
290
- # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
291
-
292
- # flip best_target if maximization was used
293
- if objective_function.task == "maximization":
294
- best_target = -best_target
295
-
296
- # threshold binary features
297
- best_features_threshold = threshold_binary_values(best_features, binary_number)
298
-
299
- # name features
300
- best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
301
- best_target_named = {target_name: best_target}
302
-
303
- # save results
304
- _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
305
-
306
- return best_features_named, best_target_named
307
- else:
308
- all_best_targets = list()
309
- all_best_features = [[] for _ in range(size_of_features)]
310
- for _ in range(post_hoc_analysis):
311
- best_features, best_target, *_ = _pso(**arguments)
312
- # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
313
-
314
- # flip best_target if maximization was used
315
- if objective_function.task == "maximization":
316
- best_target = -best_target
317
-
318
- # threshold binary features
319
- best_features_threshold = threshold_binary_values(best_features, binary_number)
320
-
321
- for i, best_feature in enumerate(best_features_threshold):
322
- all_best_features[i].append(best_feature)
323
- all_best_targets.append(best_target)
324
-
325
- # name features
326
- all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
327
- all_best_targets_named = {target_name: all_best_targets}
328
-
329
- # save results
330
- _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
331
-
332
- return all_best_features_named, all_best_targets_named # type: ignore
333
-
334
-
335
- def info():
336
- _script_info(__all__)
355
+ return features, target
337
356
 
338
357
 
339
358
  def _pso(func: ObjectiveFunction,
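The dispatcher keeps run_pso's public interface; only the single-run and post-hoc code paths moved into the helpers above. A hedged call sketch (boundaries, the directory name and the objective are placeholders; the ObjectiveFunction constructor is not shown in this diff):

from ml_tools.PSO_optimization import run_pso

features, target = run_pso(
    lower_boundaries=[0.0, 0.0, 0.0],
    upper_boundaries=[1.0, 10.0, 5.0],
    objective_function=objective,      # a previously built ObjectiveFunction
    save_results_dir="pso_results",
    post_hoc_analysis=20,              # None or <=1 runs once; >1 repeats and saves distributions
)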
@@ -342,7 +361,9 @@ def _pso(func: ObjectiveFunction,
342
361
  device: torch.device,
343
362
  swarmsize: int,
344
363
  maxiter: int,
345
- omega = 0.729, # Clerc and Kennedy’s constriction coefficient
364
+ omega_start = 0.9, # STARTING inertia weight
365
+ omega_end = 0.4, # ENDING inertia weight
366
+ # omega = 0.729, # Clerc and Kennedy’s constriction coefficient
346
367
  phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
347
368
  phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
348
369
  tolerance = 1e-8,
@@ -418,7 +439,7 @@ def _pso(func: ObjectiveFunction,
418
439
 
419
440
  # Initialize positions and velocities
420
441
  r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
421
- positions = lb_t + r * (ub_t - lb_t) # shape: (swarmsize, ndim)
442
+ positions = lb_t + r * (ub_t - lb_t)
422
443
  velocities = torch.zeros_like(positions, requires_grad=False)
423
444
 
424
445
  # Initialize best positions and scores
@@ -428,19 +449,17 @@ def _pso(func: ObjectiveFunction,
428
449
  global_best_score = float('inf')
429
450
  global_best_position = torch.zeros(ndim, device=device, requires_grad=False)
430
451
 
431
- # History (optional)
432
452
  if particle_output:
433
453
  history_positions = []
434
454
  history_scores = []
435
455
 
436
- # Main loop
437
456
  previous_best_score = float('inf')
438
- progress = trange(maxiter, desc="PSO", unit="iter", leave=True) #tqdm bar
457
+ progress = trange(maxiter, desc="PSO", unit="iter", leave=True)
439
458
  with torch.no_grad():
440
459
  for i in progress:
441
460
  # Evaluate objective for all particles
442
- positions_np = positions.detach().cpu().numpy() # shape: (swarmsize, n_features)
443
- scores_np = func(positions_np) # shape: (swarmsize,)
461
+ positions_np = positions.detach().cpu().numpy()
462
+ scores_np = func(positions_np)
444
463
  scores = torch.tensor(scores_np, device=device, dtype=torch.float32)
445
464
 
446
465
  # Update personal bests
@@ -454,17 +473,18 @@ def _pso(func: ObjectiveFunction,
454
473
  global_best_score = min_score.item()
455
474
  global_best_position = personal_best_positions[min_idx].clone()
456
475
 
457
- # Early stopping criteria
458
476
  if abs(previous_best_score - global_best_score) < tolerance:
459
477
  progress.set_description(f"PSO (early stop at iteration {i+1})")
460
478
  break
461
479
  previous_best_score = global_best_score
462
480
 
463
- # Optional: track history for debugging/visualization
464
481
  if particle_output:
465
482
  history_positions.append(positions.detach().cpu().numpy())
466
483
  history_scores.append(scores_np)
467
-
484
+
485
+ # Linearly decreasing inertia weight
486
+ omega = omega_start - (omega_start - omega_end) * (i / maxiter)
487
+
468
488
  # Velocity update
469
489
  rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
470
490
  rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
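The fixed constriction coefficient omega is replaced by a linear inertia schedule: at iteration i the weight is omega_start - (omega_start - omega_end) * (i / maxiter), so early iterations favour exploration and later ones exploitation. A standalone sketch of the schedule:

def inertia_schedule(i: int, maxiter: int, omega_start: float = 0.9, omega_end: float = 0.4) -> float:
    # Linearly decays the inertia weight; omega_end is only approached, since i stays below maxiter
    return omega_start - (omega_start - omega_end) * (i / maxiter)

# maxiter=100: i=0 -> 0.90, i=50 -> 0.65, i=99 -> 0.405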
@@ -476,11 +496,9 @@ def _pso(func: ObjectiveFunction,
476
496
  # Position update
477
497
  positions = positions + velocities
478
498
 
479
- # Clamp to search space bounds
480
499
  positions = torch.max(positions, lb_t)
481
500
  positions = torch.min(positions, ub_t)
482
501
 
483
- # Move to CPU and convert to NumPy
484
502
  best_position = global_best_position.detach().cpu().numpy()
485
503
  best_score = global_best_score
486
504
 
@@ -488,3 +506,91 @@ def _pso(func: ObjectiveFunction,
488
506
  return best_position, best_score, history_positions, history_scores
489
507
  else:
490
508
  return best_position, best_score
509
+
510
+
511
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
512
+ """
513
+ Analyzes optimization results and plots the distribution of optimal values for each feature.
514
+
515
+ This function can operate in two modes based on the `color_by_target` parameter:
516
+ 1. Aggregate: pools all values for a feature into a single group and plots one overall distribution (histogram + KDE).
517
+ 2. Color-coded: Plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
518
+
519
+ Parameters
520
+ ----------
521
+ results_dir : str or Path
522
+ The path to the directory containing the optimization result CSV files.
523
+ save_dir : str or Path
524
+ The directory where the output plots will be saved.
525
+ color_by_target : bool, optional
526
+ If True, generates comparative plots with distributions colored by their source target.
527
+ """
528
+ mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
529
+ logging.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
530
+
531
+ output_path = make_fullpath(save_dir, make=True)
532
+ all_files = list(yield_dataframes_from_dir(results_dir))
533
+
534
+ if not all_files:
535
+ logging.warning("No data found. No plots will be generated.")
536
+ return
537
+
538
+ # --- MODE 1: Color-coded plots by target ---
539
+ if color_by_target:
540
+ data_to_plot = []
541
+ for df, df_name in all_files:
542
+ # Assumes last col is target, rest are features
543
+ melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
544
+ # Sanitize target name for cleaner legend labels
545
+ melted_df['target'] = df_name.replace("Optimization_", "")
546
+ data_to_plot.append(melted_df)
547
+
548
+ long_df = pd.concat(data_to_plot, ignore_index=True)
549
+ features = long_df['feature'].unique()
550
+ logging.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
551
+
552
+ for feature_name in features:
553
+ plt.figure(figsize=(12, 7))
554
+ feature_df = long_df[long_df['feature'] == feature_name]
555
+
556
+ sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
557
+
558
+ plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
559
+ plt.xlabel("Feature Value", fontsize=12)
560
+ plt.ylabel("Density", fontsize=12)
561
+ plt.grid(axis='y', alpha=0.5, linestyle='--')
562
+ plt.legend(title='Target')
563
+
564
+ sanitized_feature_name = sanitize_filename(feature_name)
565
+ plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
566
+ plt.savefig(plot_filename, bbox_inches='tight')
567
+ plt.close()
568
+
569
+ # --- MODE 2: Aggregate plot ---
570
+ else:
571
+ feature_distributions = defaultdict(list)
572
+ for df, _ in all_files:
573
+ feature_columns = df.iloc[:, :-1]
574
+ for feature_name in feature_columns:
575
+ feature_distributions[feature_name].extend(df[feature_name].tolist())
576
+
577
+ logging.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
578
+ for feature_name, values in feature_distributions.items():
579
+ plt.figure(figsize=(12, 7))
580
+ sns.histplot(x=values, kde=True, bins='auto', stat="density")
581
+
582
+ plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
583
+ plt.xlabel("Feature Value", fontsize=12)
584
+ plt.ylabel("Density", fontsize=12)
585
+ plt.grid(axis='y', alpha=0.5, linestyle='--')
586
+
587
+ sanitized_feature_name = sanitize_filename(feature_name)
588
+ plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
589
+ plt.savefig(plot_filename, bbox_inches='tight')
590
+ plt.close()
591
+
592
+ logging.info(f"✅ All plots saved successfully to: {output_path}")
593
+
594
+
595
+ def info():
596
+ _script_info(__all__)
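A hedged usage sketch for the new plotting helper; it reads the CSV files written by run_pso/_save_results and writes one SVG per feature (paths are placeholders):

from ml_tools.PSO_optimization import plot_optimal_feature_distributions

plot_optimal_feature_distributions(
    results_dir="pso_results",   # directory with the Optimization_<target> result files written by run_pso
    save_dir="pso_plots",
    color_by_target=True,        # False produces a single aggregate histogram + KDE per feature
)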
ml_tools/data_exploration.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import pandas as pd
2
+ from pandas.api.types import is_numeric_dtype
2
3
  import numpy as np
3
4
  import matplotlib.pyplot as plt
4
5
  import seaborn as sns
@@ -24,7 +25,8 @@ __all__ = [
24
25
  "plot_value_distributions",
25
26
  "clip_outliers_single",
26
27
  "clip_outliers_multi",
27
- "match_and_filter_columns_by_regex"
28
+ "match_and_filter_columns_by_regex",
29
+ "standardize_percentages"
28
30
  ]
29
31
 
30
32
 
@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
575
577
  return filtered_df, matched_columns
576
578
 
577
579
 
580
+ def standardize_percentages(
581
+ df: pd.DataFrame,
582
+ columns: list[str],
583
+ treat_one_as_proportion: bool = True,
584
+ round_digits: int = 2
585
+ ) -> pd.DataFrame:
586
+ """
587
+ Standardizes numeric columns containing mixed-format percentages.
588
+
589
+ This function cleans columns where percentages might be entered as whole
590
+ numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
591
+ between 0 and 1 are proportions and multiplies them by 100.
592
+
593
+ Args:
594
+ df (pd.DataFrame): The input pandas DataFrame.
595
+ columns (list[str]): A list of column names to standardize.
596
+ treat_one_as_proportion (bool):
597
+ - If True (default): The value `1` is treated as a proportion and converted to `100`.
598
+ - If False: The value `1` is treated as `1%`.
599
+ round_digits (int): The number of decimal places to round the final result to.
600
+
601
+ Returns:
602
+ (pd.DataFrame):
603
+ A new DataFrame with the specified columns cleaned and standardized.
604
+ """
605
+ df_copy = df.copy()
606
+
607
+ if df_copy.empty:
608
+ return df_copy
609
+
610
+ # This helper function contains the core cleaning logic
611
+ def _clean_value(x: float) -> float:
612
+ """Applies the standardization rule to a single value."""
613
+ if pd.isna(x):
614
+ return x
615
+
616
+ # If treat_one_as_proportion is True, the range for proportions is [0, 1]
617
+ if treat_one_as_proportion and 0 <= x <= 1:
618
+ return x * 100
619
+ # If False, the range for proportions is [0, 1) (1 is excluded)
620
+ elif not treat_one_as_proportion and 0 <= x < 1:
621
+ return x * 100
622
+
623
+ # Otherwise, the value is assumed to be a correctly formatted percentage
624
+ return x
625
+
626
+ for col in columns:
627
+ # --- Robustness Checks ---
628
+ if col not in df_copy.columns:
629
+ print(f"Warning: Column '{col}' not found. Skipping.")
630
+ continue
631
+
632
+ if not is_numeric_dtype(df_copy[col]):
633
+ print(f"Warning: Column '{col}' is not numeric. Skipping.")
634
+ continue
635
+
636
+ # --- Applying the Logic ---
637
+ # Apply the cleaning function to every value in the column
638
+ df_copy[col] = df_copy[col].apply(_clean_value)
639
+
640
+ # Round the result
641
+ df_copy[col] = df_copy[col].round(round_digits)
642
+
643
+ return df_copy
644
+
645
+
578
646
  def _is_notebook():
579
647
  return get_ipython() is not None
580
648
 
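A small sketch of the new standardize_percentages helper in data_exploration (the column name and values are illustrative):

import pandas as pd
from ml_tools.data_exploration import standardize_percentages

df = pd.DataFrame({"yield_pct": [0.55, 55.0, 1.0, None]})
out = standardize_percentages(df, columns=["yield_pct"])
# with the defaults, proportions in [0, 1] become 55.0, 55.0, 100.0; NaN is left untouched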
ml_tools/utilities.py CHANGED
@@ -86,7 +86,6 @@ def make_fullpath(
86
86
  return resolved
87
87
 
88
88
 
89
-
90
89
  def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
91
90
  """
92
91
  Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.