dragon-ml-toolbox 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.4.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.2.1
+Version: 2.4.0
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -37,9 +37,11 @@ Requires-Dist: Pillow
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
+Provides-Extra: gui
+Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui"
 Dynamic: license-file
 
-# dragon-ml-tools
+# dragon-ml-toolbox
 
 A collection of Python utilities for data science and machine learning, structured as a modular package for easy reuse and installation.
 
@@ -57,7 +59,7 @@ A collection of Python utilities for data science and machine learning, structur
 Install the latest stable release from PyPI:
 
 ```bash
-pip install dragon-ml-tools
+pip install dragon-ml-toolbox
 ```
 
 ### Via GitHub (Editable)
@@ -77,16 +79,26 @@ Install from the conda-forge channel:
 ```bash
 conda install -c conda-forge dragon-ml-toolbox
 ```
-**Note:** This version is outdated or broken due to dependency incompatibilities.
+**Note:** This version is outdated or broken due to dependency incompatibilities. Use PyPI instead.
 
 ## Optional dependencies
 
-**PyTorch**, which provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
+### FreeSimpleGUI
+
+A wrapper library used to build GUIs. Requires the tkinter backend.
+
+```bash
+pip install dragon-ml-toolbox[gui]
+```
+
+### PyTorch
+
+Different builds are available depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
 
 Install the default CPU-only version with:
 
 ```bash
-pip install dragon-ml-tools[pytorch]
+pip install dragon-ml-toolbox[pytorch]
 ```
 
 To make use of GPU acceleration, use the official PyTorch installation instructions:
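For reference, an illustrative CUDA install command (not part of this README; the correct index URL depends on your CUDA version, see https://pytorch.org/get-started/locally/):

```bash
# Illustrative only: CUDA 12.1 wheels for Linux/Windows; adjust cu121 to match your setup
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
```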
@@ -108,6 +120,8 @@ from ml_tools.logger import custom_logger
 data_exploration
 datasetmaster
 ensemble_learning
+ETL_engineering
+GUI_tools
 handle_excel
 logger
 MICE_imputation
{dragon_ml_toolbox-2.2.1.dist-info → dragon_ml_toolbox-2.4.0.dist-info}/RECORD RENAMED
@@ -1,8 +1,9 @@
-dragon_ml_toolbox-2.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-2.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/ETL_engineering.py,sha256=meQwdMUmAGXmrOSF5K5MaIhztvAbwxPeKnPnv8TxBi0,23283
+dragon_ml_toolbox-2.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-2.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ml_tools/ETL_engineering.py,sha256=ns8HsLWZhByurvjtUUW10p7If1h1O5-btUfCRXxzkME,31568
+ml_tools/GUI_tools.py,sha256=sKLBWRhwGax3QSVICEduQiTbGhQdwvW0eeHPQMiyOF0,20150
 ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
-ml_tools/PSO_optimization.py,sha256=T-wnB94DcRWuRd2M3loDVT4POtIP0MOhs-VilAf1L4E,20974
+ml_tools/PSO_optimization.py,sha256=gi56mF-q6BApYwhAd9jix0xiYz595WTPcUh7afZsRJ4,25378
 ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
@@ -13,9 +14,9 @@ ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,1294
 ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
 ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
 ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=A7Wm1ArpqFG80WKmnkYdtSzIRLvg5x-9nPNidZIbpPA,20671
+ml_tools/utilities.py,sha256=T6AnNEQjUDnMAMSIJ8yZqToAVESIlEKK0bGBEm3sAUU,20670
 ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.2.1.dist-info/METADATA,sha256=1Xjem3tZp5rlaFrz5_lQKdtal_jUB9lKRUIlQqYseyE,2974
-dragon_ml_toolbox-2.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-2.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-2.2.1.dist-info/RECORD,,
+dragon_ml_toolbox-2.4.0.dist-info/METADATA,sha256=LewdCOSOEeCNVLrB37FD39hnESJ7lPt2voeO-nFG-es,3232
+dragon_ml_toolbox-2.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-2.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-2.4.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -2,19 +2,120 @@ import polars as pl
 import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict
 from .utilities import _script_info
+import pandas as pd
 
 
 __all__ = [
+    "ColumnCleaner",
+    "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
+    "RatioCalculator",
     "CategoryMapper",
+    "RegexMapper",
     "ValueBinner",
     "DateFeatureExtractor"
 ]
 
+########## EXTRACT and CLEAN ##########
+
+class ColumnCleaner:
+    """
+    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
+
+    Args:
+        rules (Dict[str, str]):
+            A dictionary where each key is a regular expression pattern and
+            each value is the standardized string to replace matches with.
+    """
+    def __init__(self, rules: Dict[str, str]):
+        if not isinstance(rules, dict):
+            raise TypeError("The 'rules' argument must be a dictionary.")
+
+        # Validate that all keys are valid regular expressions
+        for pattern in rules.keys():
+            try:
+                re.compile(pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+
+        self.rules = rules
+
+    def clean(self, series: pd.Series) -> pd.Series:
+        """
+        Applies the standardization rules to the provided Series (requires string data).
+
+        Non-matching values are kept as they are.
+
+        Args:
+            series (pd.Series): The pandas Series to clean.
+
+        Returns:
+            pd.Series: A new Series with the values cleaned and standardized.
+        """
+        return series.astype(str).replace(self.rules, regex=True)
+
+
+class DataFrameCleaner:
+    """
+    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
+
+    Args:
+        rules (Dict[str, Dict[str, str]]):
+            A nested dictionary where each top-level key is a column name,
+            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
+    """
+    def __init__(self, rules: Dict[str, Dict[str, str]]):
+        if not isinstance(rules, dict):
+            raise TypeError("The 'rules' argument must be a nested dictionary.")
+
+        for col_name, col_rules in rules.items():
+            if not isinstance(col_rules, dict):
+                raise TypeError(
+                    f"The value for column '{col_name}' must be a dictionary "
+                    f"of rules, but got type {type(col_rules).__name__}."
+                )
+
+        self.rules = rules
+
+    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Applies all defined cleaning rules to the DataFrame.
+
+        Args:
+            df (pd.DataFrame): The pandas DataFrame to clean.
+
+        Returns:
+            pd.DataFrame: A new, cleaned DataFrame.
+        """
+        rule_columns = set(self.rules.keys())
+        df_columns = set(df.columns)
+
+        missing_columns = rule_columns - df_columns
+
+        if missing_columns:
+            # Report all missing columns in a single, clear error message
+            raise ValueError(
+                f"The following columns specified in the cleaning rules "
+                f"were not found in the DataFrame: {sorted(list(missing_columns))}"
+            )
+
+        # Start the process
+        df_cleaned = df.copy()
+
+        for column_name, column_rules in self.rules.items():
+            # Create and apply the specific cleaner for the column
+            cleaner = ColumnCleaner(rules=column_rules)
+            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
+
+        return df_cleaned
+
+
+############ TRANSFORM ####################
+
 # Magic word for rename-only transformation
 _RENAME = "rename"
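The two cleaners compose: `DataFrameCleaner` simply builds one `ColumnCleaner` per configured column. A minimal usage sketch (the column name and rules are hypothetical; unmatched values pass through unchanged):

```python
import pandas as pd
from ml_tools.ETL_engineering import DataFrameCleaner

df = pd.DataFrame({"unit": ["mg/l", "MG/L", "milligrams per liter", "ppm"]})

cleaner = DataFrameCleaner(rules={
    "unit": {
        r"(?i)^mg/l$": "mg/L",                   # case-insensitive exact match
        r"(?i)^milligrams per liter$": "mg/L",
    }
})
print(cleaner.clean(df)["unit"].tolist())  # -> ['mg/L', 'mg/L', 'mg/L', 'ppm']
```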
@@ -336,8 +437,7 @@ class MultiNumberExtractor:
     """
     Extracts multiple numbers from a single polars string column into several new columns.
 
-    This transformer is designed for one-to-many mappings, such as parsing
-    ratios (100:30) or coordinates (10, 25) into separate columns.
+    This transformer is designed for one-to-many mappings, such as parsing coordinates (10, 25) into separate columns.
 
     Args:
         num_outputs (int):
@@ -413,6 +513,59 @@ class MultiNumberExtractor:
         return pl.select(output_expressions)
 
 
+class RatioCalculator:
+    """
+    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
+
+    Args:
+        regex_pattern (str, optional):
+            The regex pattern to find the numerator and denominator. It MUST
+            contain exactly two capturing groups: the first for the
+            numerator and the second for the denominator. Defaults to a
+            pattern that handles common delimiters like ':' and '/'.
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
+    ):
+        # --- Validation ---
+        try:
+            if re.compile(regex_pattern).groups != 2:
+                raise ValueError(
+                    "regex_pattern must contain exactly two "
+                    "capturing groups '(...)'."
+                )
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern provided: {e}") from e
+
+        self.regex_pattern = regex_pattern
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the ratio calculation logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of ratio strings.
+
+        Returns:
+            pl.Series: A new Series of floats containing the division result.
+                Returns null for invalid formats or division by zero.
+        """
+        # .extract_groups returns a struct with a field for each capture group,
+        # e.g., {"group_1": "40", "group_2": "5"}
+        groups = column.str.extract_groups(self.regex_pattern)
+
+        # Extract numerator and denominator, casting to float;
+        # strict=False ensures that non-matches become null
+        numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
+        denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
+
+        # Safely perform division, returning null if denominator is 0
+        return pl.when(denominator != 0).then(
+            numerator / denominator
+        ).otherwise(None)
+
+
 class CategoryMapper:
     """
     A transformer that maps string categories to specified numerical values using a dictionary.
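A quick sanity check of `RatioCalculator` (a sketch; the series values are hypothetical, and the result is materialized with `pl.select` since the transformer is normally invoked inside a `DataProcessor` pipeline):

```python
import polars as pl
from ml_tools.ETL_engineering import RatioCalculator

calc = RatioCalculator()
ratios = pl.Series("mix", ["100:30", "30/2", "no ratio", "5:0"])

# Materialize the when/then expression into a concrete Series
result = pl.select(calc(ratios)).to_series()
print(result.to_list())  # -> [3.33..., 15.0, None, None]
```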
@@ -468,6 +621,74 @@ class CategoryMapper:
         return pl.select(final_expr).to_series()
 
 
+class RegexMapper:
+    """
+    A transformer that maps string categories to numerical values based on a
+    dictionary of regular expression patterns.
+
+    The class iterates through the mapping dictionary in order, and the first
+    pattern that matches a given string determines the output value. This
+    "first match wins" logic makes the order of the mapping important.
+
+    Args:
+        mapping (Dict[str, Union[int, float]]):
+            An ordered dictionary where keys are regex patterns and values are
+            the numbers to map to if the pattern is found.
+        unseen_value (Optional[Union[int, float]], optional):
+            The numerical value to use for strings that do not match any
+            of the regex patterns. If None (default), unseen values are
+            mapped to null.
+    """
+    def __init__(
+        self,
+        mapping: Dict[str, Union[int, float]],
+        unseen_value: Optional[Union[int, float]] = None,
+    ):
+        # --- Validation ---
+        if not isinstance(mapping, dict):
+            raise TypeError("The 'mapping' argument must be a dictionary.")
+
+        for pattern, value in mapping.items():
+            try:
+                re.compile(pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
+
+        self.mapping = mapping
+        self.unseen_value = unseen_value
+
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the regex mapping logic to the input column.
+
+        Args:
+            column (pl.Series): The input Polars Series of string data.
+
+        Returns:
+            pl.Series: A new Series with strings mapped to numbers based on
+                the first matching regex pattern.
+        """
+        # Ensure the column is treated as a string for matching
+        str_column = column.cast(pl.Utf8)
+
+        # Build the when/then/otherwise chain from the inside out.
+        # Start with the final fallback value for non-matches.
+        mapping_expr = pl.lit(self.unseen_value)
+
+        # Iterate through the mapping in reverse to construct the nested expression
+        for pattern, value in reversed(list(self.mapping.items())):
+            mapping_expr = (
+                pl.when(str_column.str.contains(pattern))
+                .then(pl.lit(value))
+                .otherwise(mapping_expr)
+            )
+
+        # Execute the complete expression chain and return the resulting Series
+        return pl.select(mapping_expr).to_series()
+
+
 class ValueBinner:
     """
     A transformer that discretizes a continuous numerical column into a finite number of bins.
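`RegexMapper` in action (a sketch with hypothetical patterns; because the first matching pattern wins, the more specific pattern goes first):

```python
import polars as pl
from ml_tools.ETL_engineering import RegexMapper

mapper = RegexMapper(
    mapping={"very high": 3, "high": 2, "low": 1},  # order matters
    unseen_value=0,
)
risk = pl.Series("risk", ["very high", "high", "low-ish", "unknown"])
print(mapper(risk).to_list())  # -> [3, 2, 1, 0]
```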
ml_tools/GUI_tools.py ADDED
@@ -0,0 +1,496 @@
+import configparser
+from pathlib import Path
+from typing import Optional, Callable, Any, Dict, Tuple, List
+import traceback
+import FreeSimpleGUI as sg
+from functools import wraps
+from .utilities import _script_info
+import numpy as np
+
+
+__all__ = [
+    "PathManager",
+    "ConfigManager",
+    "GUIFactory",
+    "catch_exceptions",
+    "prepare_feature_vector",
+    "update_target_fields"
+]
+
+
+# --- Path Management ---
+class PathManager:
+    """
+    Manages paths for a Python application, supporting both development mode and bundled mode via Briefcase.
+    """
+    def __init__(self, anchor_file: str):
+        """
+        Initializes the PathManager. The package name is automatically inferred
+        from the parent directory of the anchor file.
+
+        Args:
+            anchor_file (str): The absolute path to a file within the project's
+                package, typically `__file__` from a module inside
+                that package (paths.py).
+
+        Note:
+            This inference assumes that the anchor file's parent directory
+            has the same name as the package (e.g., `.../src/my_app/paths.py`).
+            This is a standard and recommended project structure.
+        """
+        resolved_anchor_path = Path(anchor_file).resolve()
+        self.package_name = resolved_anchor_path.parent.name
+        self._is_bundled, self._resource_path_func = self._check_bundle_status()
+
+        if self._is_bundled:
+            # In a Briefcase bundle, resource_path gives an absolute path
+            # to the resource directory.
+            self.package_root = self._resource_path_func(self.package_name, "")
+        else:
+            # In development mode, the package root is the directory
+            # containing the anchor file.
+            self.package_root = resolved_anchor_path.parent
+
+    def _check_bundle_status(self) -> tuple[bool, Optional[Callable]]:
+        """Checks if the app is running in a bundled environment."""
+        try:
+            # This is the function Briefcase provides in a bundled app
+            from briefcase.platforms.base import resource_path
+            return True, resource_path
+        except ImportError:
+            return False, None
+
+    def get_path(self, relative_path: str | Path) -> Path:
+        """
+        Gets the absolute path for a given resource file or directory
+        relative to the package root.
+
+        Args:
+            relative_path (str | Path): The path relative to the package root (e.g., 'helpers/icon.png').
+
+        Returns:
+            Path: The absolute path to the resource.
+        """
+        if self._is_bundled:
+            # Briefcase's resource_path handles resolving the path within the app bundle
+            return self._resource_path_func(self.package_name, str(relative_path))  # type: ignore
+        else:
+            # In dev mode, join package root with the relative path.
+            return self.package_root / relative_path
+
+
+# --- Configuration Management ---
+class _SectionProxy:
+    """A helper class to represent a section of the .ini file as an object."""
+    def __init__(self, parser: configparser.ConfigParser, section_name: str):
+        for option, value in parser.items(section_name):
+            setattr(self, option.lower(), self._process_value(value))
+
+    def _process_value(self, value_str: str) -> Any:
+        """Automatically converts string values to appropriate types."""
+        # Handle None
+        if value_str is None or value_str.lower() == 'none':
+            return None
+        # Handle Booleans
+        if value_str.lower() in ['true', 'yes', 'on']:
+            return True
+        if value_str.lower() in ['false', 'no', 'off']:
+            return False
+        # Handle Integers
+        try:
+            return int(value_str)
+        except ValueError:
+            pass
+        # Handle Floats
+        try:
+            return float(value_str)
+        except ValueError:
+            pass
+        # Handle 'width,height' tuples
+        if ',' in value_str:
+            try:
+                return tuple(map(int, value_str.split(",")))
+            except (ValueError, TypeError):
+                pass
+        # Fallback to the original string
+        return value_str
+
+
+class ConfigManager:
+    """
+    Loads a .ini file and provides access to its values as object attributes.
+    Includes a method to generate a default configuration template.
+    """
+    def __init__(self, config_path: str | Path):
+        """
+        Initializes the ConfigManager and dynamically creates attributes
+        based on the .ini file's sections and options.
+        """
+        config_path = Path(config_path)
+        if not config_path.exists():
+            raise FileNotFoundError(f"Configuration file not found at: {config_path}")
+
+        parser = configparser.ConfigParser(comment_prefixes=('#', ';'), inline_comment_prefixes=('#', ';'))
+        parser.read(config_path)
+
+        for section in parser.sections():
+            setattr(self, section.lower(), _SectionProxy(parser, section))
+
+    @staticmethod
+    def generate_template(file_path: str | Path, force_overwrite: bool = False):
+        """
+        Generates a complete, commented .ini template file that works with the GUIFactory.
+
+        Args:
+            file_path (str | Path): The path where the .ini file will be saved.
+            force_overwrite (bool): If True, overwrites the file if it already exists.
+        """
+        path = Path(file_path)
+        if path.exists() and not force_overwrite:
+            print(f"Configuration file already exists at {path}. Aborting.")
+            return
+
+        config = configparser.ConfigParser()
+
+        config['General'] = {
+            '; The overall theme for the GUI. Find more at https://www.pysimplegui.org/en/latest/call%20reference/#themes-automatic-coloring-of-elements': '',
+            'theme': 'LightGreen6',
+            '; Default font for the application.': '',
+            'font_family': 'Helvetica',
+            '; Title of the main window.': '',
+            'window_title': 'My Application',
+            '; Can the user resize the window? (true/false)': '',
+            'resizable_window': 'false',
+            '; Optional minimum window size (width,height). Leave blank for no minimum.': '',
+            'min_size': '800,600',
+            '; Optional maximum window size (width,height). Leave blank for no maximum.': '',
+            'max_size': ''
+        }
+        config['Layout'] = {
+            '; Default size for continuous input boxes (width,height in characters).': '',
+            'input_size_cont': '16,1',
+            '; Default size for combo/binary boxes (width,height in characters).': '',
+            'input_size_binary': '14,1',
+            '; Default size for buttons (width,height in characters).': '',
+            'button_size': '15,2'
+        }
+        config['Fonts'] = {
+            '; Font settings. Style can be "bold", "italic", "underline", or a combination.': '',
+            'label_size': '11',
+            'label_style': 'bold',
+            'range_size': '9',
+            'range_style': '',
+            'button_size': '14',
+            'button_style': 'bold',
+            'frame_size': '14',
+            'frame_style': ''
+        }
+        config['Colors'] = {
+            '; Use standard hex codes (e.g., #FFFFFF) or color names (e.g., white).': '',
+            '; Color for the text inside a disabled target/output box.': '',
+            'target_text': '#0000D0',
+            '; Background color for a disabled target/output box.': '',
+            'target_background': '#E0E0E0',
+            '; Color for the text on a button.': '',
+            'button_text': '#FFFFFF',
+            '; Background color for a button.': '',
+            'button_background': '#3c8a7e',
+            '; Background color when the mouse is over a button.': '',
+            'button_background_hover': '#5499C7'
+        }
+        config['Meta'] = {
+            '; Optional application version, displayed in the window title.': '',
+            'version': '1.0.0'
+        }
+
+        with open(path, 'w') as configfile:
+            config.write(configfile)
+        print(f"Successfully generated config template at: '{path}'")
+
+
+# --- GUI Factory ---
+class GUIFactory:
+    """
+    Builds styled FreeSimpleGUI elements and layouts using a "building block"
+    approach, driven by a ConfigManager instance.
+    """
+    def __init__(self, config: ConfigManager):
+        """
+        Initializes the factory with a configuration object.
+        """
+        self.config = config
+        sg.theme(self.config.general.theme)
+        sg.set_options(font=(self.config.general.font_family, 12))
+
+    # --- Atomic Element Generators ---
+    def make_button(self, text: str, key: str, **kwargs) -> sg.Button:
+        """
+        Creates a single, styled action button.
+
+        Args:
+            text (str): The text displayed on the button.
+            key (str): The key for the button element.
+            **kwargs: Override default styles or add other sg.Button parameters
+                (e.g., `tooltip='Click me'`, `disabled=True`).
+        """
+        cfg = self.config
+        font = (cfg.fonts.font_family, cfg.fonts.button_size, cfg.fonts.button_style)
+
+        style_args = {
+            "size": cfg.layout.button_size,
+            "font": font,
+            "button_color": (cfg.colors.button_text, cfg.colors.button_background),
+            "mouseover_colors": (cfg.colors.button_text, cfg.colors.button_background_hover),
+            "border_width": 0,
+            **kwargs
+        }
+        return sg.Button(text.title(), key=key, **style_args)
+
+    def make_frame(self, title: str, layout: List[List[sg.Element]], **kwargs) -> sg.Frame:
+        """
+        Creates a styled frame around a given layout.
+
+        Args:
+            title (str): The title displayed on the frame's border.
+            layout (list): The layout to enclose within the frame.
+            **kwargs: Override default styles or add other sg.Frame parameters
+                (e.g., `title_color='red'`, `relief=sg.RELIEF_SUNKEN`).
+        """
+        cfg = self.config
+        font = (cfg.fonts.font_family, cfg.fonts.frame_size)
+
+        style_args = {
+            "font": font,
+            "expand_x": True,
+            "background_color": sg.theme_background_color(),
+            **kwargs
+        }
+        return sg.Frame(title, layout, **style_args)
+
+    # --- General-Purpose Layout Generators ---
+    def generate_continuous_layout(
+        self,
+        data_dict: Dict[str, Tuple[float, float]],
+        is_target: bool = False,
+        layout_mode: str = 'grid',
+        columns_per_row: int = 4
+    ) -> List[List[sg.Column]]:
+        """
+        Generates a layout for continuous features or targets.
+
+        Args:
+            data_dict (dict): Keys are feature names, values are (min, max) tuples.
+            is_target (bool): If True, creates disabled inputs for displaying results.
+            layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
+            columns_per_row (int): Number of feature columns per row when layout_mode is 'grid'.
+
+        Returns:
+            A list of lists of sg.Column elements, ready to be used in a window layout.
+        """
+        cfg = self.config
+        bg_color = sg.theme_background_color()
+        label_font = (cfg.fonts.font_family, cfg.fonts.label_size, cfg.fonts.label_style)
+
+        columns = []
+        for name, (val_min, val_max) in data_dict.items():
+            key = f"TARGET_{name}" if is_target else name
+            default_text = "" if is_target else str(val_max)
+
+            label = sg.Text(name, font=label_font, background_color=bg_color, key=f"_text_{name}")
+
+            input_style = {"size": cfg.layout.input_size_cont, "justification": "center"}
+            if is_target:
+                input_style["text_color"] = cfg.colors.target_text
+                input_style["disabled_readonly_background_color"] = cfg.colors.target_background
+
+            element = sg.Input(default_text, key=key, disabled=is_target, **input_style)
+
+            if is_target:
+                layout = [[label], [element]]
+            else:
+                range_font = (cfg.fonts.font_family, cfg.fonts.range_size)
+                range_text = sg.Text(f"Range: {int(val_min)}-{int(val_max)}", font=range_font, background_color=bg_color)
+                layout = [[label], [element], [range_text]]
+
+            layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)])
+            columns.append(sg.Column(layout, background_color=bg_color))
+
+        if layout_mode == 'row':
+            return [columns]  # A single row containing all columns
+
+        # Default to 'grid' layout
+        return [columns[i:i + columns_per_row] for i in range(0, len(columns), columns_per_row)]
+
+    def generate_combo_layout(
+        self,
+        data_dict: Dict[str, List[Any]],
+        layout_mode: str = 'grid',
+        columns_per_row: int = 4
+    ) -> List[List[sg.Column]]:
+        """
+        Generates a layout for categorical or binary features using Combo boxes.
+
+        Args:
+            data_dict (dict): Keys are feature names, values are lists of options.
+            layout_mode (str): 'grid' for a multi-row grid layout, or 'row' for a single horizontal row.
+            columns_per_row (int): Number of feature columns per row when layout_mode is 'grid'.
+
+        Returns:
+            A list of lists of sg.Column elements, ready to be used in a window layout.
+        """
+        cfg = self.config
+        bg_color = sg.theme_background_color()
+        label_font = (cfg.fonts.font_family, cfg.fonts.label_size, cfg.fonts.label_style)
+
+        columns = []
+        for name, values in data_dict.items():
+            label = sg.Text(name, font=label_font, background_color=bg_color, key=f"_text_{name}")
+            element = sg.Combo(
+                values, default_value=values[0], key=name,
+                size=cfg.layout.input_size_binary, readonly=True
+            )
+            layout = [[label], [element]]
+            layout.append([sg.Text(" ", font=(cfg.fonts.font_family, 2), background_color=bg_color)])
+            columns.append(sg.Column(layout, background_color=bg_color))
+
+        if layout_mode == 'row':
+            return [columns]  # A single row containing all columns
+
+        # Default to 'grid' layout
+        return [columns[i:i + columns_per_row] for i in range(0, len(columns), columns_per_row)]
+
+    # --- Window Creation ---
+    def create_window(self, title: str, layout: List[List[sg.Element]], **kwargs) -> sg.Window:
+        """
+        Creates and finalizes the main application window.
+
+        Args:
+            title (str): The title for the window.
+            layout (list): The final, assembled layout for the window.
+            **kwargs: Additional arguments to pass to the sg.Window constructor
+                (e.g., `location=(100, 100)`, `keep_on_top=True`).
+        """
+        cfg = self.config.general
+        version = getattr(self.config.meta, 'version', None)
+        full_title = f"{title} v{version}" if version else title
+
+        window_args = {
+            "resizable": cfg.resizable_window,
+            "finalize": True,
+            "background_color": sg.theme_background_color(),
+            **kwargs
+        }
+        window = sg.Window(full_title, layout, **window_args)
+
+        if cfg.min_size: window.TKroot.minsize(*cfg.min_size)
+        if cfg.max_size: window.TKroot.maxsize(*cfg.max_size)
+
+        return window
+
+
+# --- Exception Handling Decorator ---
+def catch_exceptions(show_popup: bool = True):
+    """
+    A decorator that wraps a function in a try-except block.
+    If an exception occurs, it's caught and displayed in a popup window.
+    """
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except Exception:
+                # Format the full traceback to give detailed error info
+                error_msg = traceback.format_exc()
+                if show_popup:
+                    sg.popup_error("An error occurred:", error_msg, title="Error")
+                else:
+                    # Fallback for non-GUI contexts or if popup is disabled
+                    print("--- An exception occurred ---")
+                    print(error_msg)
+                    print("-----------------------------")
+        return wrapper
+    return decorator
+
+
+# --- Inference Helpers ---
+def _default_categorical_processor(feature_name: str, chosen_value: Any) -> List[float]:
+    """
+    Default processor for binary 'True'/'False' strings.
+    Returns a list containing a single float.
+    """
+    return [1.0] if str(chosen_value) == 'True' else [0.0]
+
+
+def prepare_feature_vector(
+    values: Dict[str, Any],
+    feature_order: List[str],
+    continuous_features: List[str],
+    categorical_features: List[str],
+    categorical_processor: Optional[Callable[[str, Any], List[float]]] = None
+) -> np.ndarray:
+    """
+    Validates and converts GUI values into a numpy array for a model.
+    This function supports label encoding and one-hot encoding via the processor.
+
+    Args:
+        values (dict): The values dictionary from a `window.read()` call.
+        feature_order (list): A list of all feature names that have a GUI element.
+            For one-hot encoding, this should be the name of the
+            single GUI element (e.g., 'material_type'), not the
+            expanded feature names (e.g., 'material_is_steel').
+        continuous_features (list): A list of names for continuous features.
+        categorical_features (list): A list of names for categorical features.
+        categorical_processor (callable, optional): A function to process categorical
+            values. It should accept (feature_name, chosen_value) and return a
+            list of floats (e.g., [1.0] for label encoding, [0.0, 1.0, 0.0] for one-hot).
+            If None, a default 'True'/'False' processor is used.
+
+    Returns:
+        A 1D numpy array ready for model inference.
+    """
+    processed_values: List[float] = []
+
+    # Use the provided processor or the default one
+    processor = categorical_processor or _default_categorical_processor
+
+    # Create sets for faster lookups
+    cont_set = set(continuous_features)
+    cat_set = set(categorical_features)
+
+    for name in feature_order:
+        chosen_value = values.get(name)
+
+        if chosen_value is None or chosen_value == '':
+            raise ValueError(f"Feature '{name}' is missing a value.")
+
+        if name in cont_set:
+            try:
+                processed_values.append(float(chosen_value))
+            except (ValueError, TypeError):
+                raise ValueError(f"Invalid input for '{name}'. Please enter a valid number.")
+
+        elif name in cat_set:
+            # The processor returns a list of values (one for label, multiple for one-hot)
+            numeric_values = processor(name, chosen_value)
+            processed_values.extend(numeric_values)
+
+    return np.array(processed_values, dtype=np.float32)
+
+
+def update_target_fields(window: sg.Window, results_dict: Dict[str, Any]):
+    """
+    Updates the GUI's target fields with inference results.
+
+    Args:
+        window (sg.Window): The application's window object.
+        results_dict (dict): A dictionary where keys are target names (without the
+            'TARGET_' prefix) and values are the predicted results.
+    """
+    for target_name, result in results_dict.items():
+        # Format numbers to 2 decimal places, leave other types as-is
+        display_value = f"{result:.2f}" if isinstance(result, (int, float)) else result
+        window[f'TARGET_{target_name}'].update(display_value)
+
+
+def info():
+    _script_info(__all__)
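A minimal sketch of the inference helpers (no window needed; the feature names and the `values` dict, which would normally come from `window.read()`, are hypothetical):

```python
from ml_tools.GUI_tools import prepare_feature_vector

# Hypothetical GUI state: two continuous inputs and one binary combo
values = {"temperature": "21.5", "pressure": "1.2", "is_coated": "True"}

x = prepare_feature_vector(
    values=values,
    feature_order=["temperature", "pressure", "is_coated"],
    continuous_features=["temperature", "pressure"],
    categorical_features=["is_coated"],  # default processor maps 'True' -> 1.0
)
print(x)  # -> [21.5  1.2  1. ], dtype float32, ready for a model
```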
ml_tools/PSO_optimization.py CHANGED
@@ -7,15 +7,27 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
+from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath, yield_dataframes_from_dir, sanitize_filename
 import torch
 from tqdm import trange
+import logging
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import defaultdict
+
+# Configure logger
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] [%(levelname)s] - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S"
+)
 
 
 __all__ = [
     "ObjectiveFunction",
     "multiple_objective_functions_from_dir",
-    "run_pso"
+    "run_pso",
+    "plot_optimal_feature_distributions"
 ]
@@ -184,6 +196,52 @@ def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
     save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")
 
 
+def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int):
+    """Helper for a single PSO run."""
+    pso_args.update({"seed": random_state})
+
+    best_features, best_target, *_ = _pso(**pso_args)
+
+    # Flip best_target if maximization was used
+    if objective_function.task == "maximization":
+        best_target = -best_target
+
+    # Threshold binary features
+    binary_number = objective_function.binary_features
+    best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+    # Name features and target
+    best_features_named = {name: value for name, value in zip(feature_names, best_features_threshold)}
+    best_target_named = {target_name: best_target}
+
+    return best_features_named, best_target_named
+
+
+def _run_post_hoc_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, repetitions: int):
+    """Helper for post-hoc PSO analysis."""
+    all_best_targets = []
+    all_best_features = [[] for _ in range(len(feature_names))]
+
+    for _ in range(repetitions):
+        best_features, best_target, *_ = _pso(**pso_args)
+
+        if objective_function.task == "maximization":
+            best_target = -best_target
+
+        binary_number = objective_function.binary_features
+        best_features_threshold = threshold_binary_values(best_features, binary_number)
+
+        for i, best_feature in enumerate(best_features_threshold):
+            all_best_features[i].append(best_feature)
+        all_best_targets.append(best_target)
+
+    # Name features and target
+    all_best_features_named = {name: lst for name, lst in zip(feature_names, all_best_features)}
+    all_best_targets_named = {target_name: all_best_targets}
+
+    return all_best_features_named, all_best_targets_named
+
+
 def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
@@ -236,6 +294,8 @@ def run_pso(lower_boundaries: list[float],
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     """
+
+
     # Select device
     if torch.cuda.is_available():
        device = torch.device("cuda")
@@ -243,7 +303,8 @@ def run_pso(lower_boundaries: list[float],
         device = torch.device("mps")
     else:
         device = torch.device("cpu")
-    print(f"[PSO] Using device: '{device}'")
+
+    logging.info(f"Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
@@ -271,7 +332,7 @@ def run_pso(lower_boundaries: list[float],
     if target_name is None:
         target_name = "Target"
 
-    arguments = {
+    pso_arguments = {
         "func": objective_function,
         "lb": lower,
         "ub": upper,
@@ -281,59 +342,17 @@ def run_pso(lower_boundaries: list[float],
         "particle_output": False,
     }
 
+    # Dispatcher
+    if post_hoc_analysis is None or post_hoc_analysis <= 1:
+        features, target = _run_single_pso(objective_function, pso_arguments, names, target_name, random_state)
+    else:
+        features, target = _run_post_hoc_pso(objective_function, pso_arguments, names, target_name, post_hoc_analysis)
+
+    # --- Save Results ---
     save_results_path = make_fullpath(save_results_dir, make=True)
+    _save_results(features, target, save_dir=save_results_path, target_name=target_name)
 
-    if post_hoc_analysis is None or post_hoc_analysis == 1:
-        arguments.update({"seed": random_state})
-
-        best_features, best_target, *_ = _pso(**arguments)
-        # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
-
-        # flip best_target if maximization was used
-        if objective_function.task == "maximization":
-            best_target = -best_target
-
-        # threshold binary features
-        best_features_threshold = threshold_binary_values(best_features, binary_number)
-
-        # name features
-        best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
-        best_target_named = {target_name: best_target}
-
-        # save results
-        _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
-
-        return best_features_named, best_target_named
-    else:
-        all_best_targets = list()
-        all_best_features = [[] for _ in range(size_of_features)]
-        for _ in range(post_hoc_analysis):
-            best_features, best_target, *_ = _pso(**arguments)
-            # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
-
-            # flip best_target if maximization was used
-            if objective_function.task == "maximization":
-                best_target = -best_target
-
-            # threshold binary features
-            best_features_threshold = threshold_binary_values(best_features, binary_number)
-
-            for i, best_feature in enumerate(best_features_threshold):
-                all_best_features[i].append(best_feature)
-            all_best_targets.append(best_target)
-
-        # name features
-        all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
-        all_best_targets_named = {target_name: all_best_targets}
-
-        # save results
-        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
-
-        return all_best_features_named, all_best_targets_named  # type: ignore
-
-
-def info():
-    _script_info(__all__)
+    return features, target
 
 
 def _pso(func: ObjectiveFunction,
@@ -342,7 +361,9 @@ def _pso(func: ObjectiveFunction,
          device: torch.device,
          swarmsize: int,
          maxiter: int,
-         omega = 0.729, # Clerc and Kennedy’s constriction coefficient
+         omega_start = 0.9, # STARTING inertia weight
+         omega_end = 0.4, # ENDING inertia weight
+         # omega = 0.729, # Clerc and Kennedy’s constriction coefficient
          phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
          phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
          tolerance = 1e-8,
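This release replaces the fixed constriction coefficient with a time-varying inertia weight: as a later hunk shows, each iteration `i` recomputes `omega` on a linear schedule, so the swarm explores widely early on (omega near 0.9) and refines locally toward the end (omega near 0.4):

```python
# Schedule used inside the main loop: omega_start at i = 0, approaching omega_end at i = maxiter
omega = omega_start - (omega_start - omega_end) * (i / maxiter)
```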
@@ -418,7 +439,7 @@ def _pso(func: ObjectiveFunction,
 
     # Initialize positions and velocities
     r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
-    positions = lb_t + r * (ub_t - lb_t) # shape: (swarmsize, ndim)
+    positions = lb_t + r * (ub_t - lb_t)
     velocities = torch.zeros_like(positions, requires_grad=False)
 
     # Initialize best positions and scores
@@ -428,19 +449,17 @@ def _pso(func: ObjectiveFunction,
     global_best_score = float('inf')
     global_best_position = torch.zeros(ndim, device=device, requires_grad=False)
 
-    # History (optional)
     if particle_output:
         history_positions = []
         history_scores = []
 
-    # Main loop
     previous_best_score = float('inf')
-    progress = trange(maxiter, desc="PSO", unit="iter", leave=True) #tqdm bar
+    progress = trange(maxiter, desc="PSO", unit="iter", leave=True)
     with torch.no_grad():
         for i in progress:
             # Evaluate objective for all particles
-            positions_np = positions.detach().cpu().numpy() # shape: (swarmsize, n_features)
-            scores_np = func(positions_np) # shape: (swarmsize,)
+            positions_np = positions.detach().cpu().numpy()
+            scores_np = func(positions_np)
             scores = torch.tensor(scores_np, device=device, dtype=torch.float32)
 
             # Update personal bests
@@ -454,17 +473,18 @@ def _pso(func: ObjectiveFunction,
                 global_best_score = min_score.item()
                 global_best_position = personal_best_positions[min_idx].clone()
 
-            # Early stopping criteria
             if abs(previous_best_score - global_best_score) < tolerance:
                 progress.set_description(f"PSO (early stop at iteration {i+1})")
                 break
             previous_best_score = global_best_score
 
-            # Optional: track history for debugging/visualization
             if particle_output:
                 history_positions.append(positions.detach().cpu().numpy())
                 history_scores.append(scores_np)
-
+
+            # Linearly decreasing inertia weight
+            omega = omega_start - (omega_start - omega_end) * (i / maxiter)
+
             # Velocity update
             rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
             rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
@@ -476,11 +496,9 @@ def _pso(func: ObjectiveFunction,
             # Position update
             positions = positions + velocities
 
-            # Clamp to search space bounds
             positions = torch.max(positions, lb_t)
             positions = torch.min(positions, ub_t)
 
-    # Move to CPU and convert to NumPy
     best_position = global_best_position.detach().cpu().numpy()
     best_score = global_best_score
 
@@ -488,3 +506,91 @@ def _pso(func: ObjectiveFunction,
         return best_position, best_score, history_positions, history_scores
     else:
         return best_position, best_score
+
+
+def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path], color_by_target: bool = True):
+    """
+    Analyzes optimization results and plots the distribution of optimal values for each feature.
+
+    This function can operate in two modes based on the `color_by_target` parameter:
+
+    1. Aggregate: pools all values for a feature into a single group and plots one overall distribution (histogram + KDE).
+    2. Comparative (color-coded): plots a separate, color-coded Kernel Density Estimate (KDE) for each source target, allowing for direct comparison on a single chart.
+
+    Parameters
+    ----------
+    results_dir : str or Path
+        The path to the directory containing the optimization result CSV files.
+    save_dir : str or Path
+        The directory where the output plots will be saved.
+    color_by_target : bool, optional
+        If True, generates comparative plots with distributions colored by their source target.
+    """
+    mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
+    logging.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
+
+    output_path = make_fullpath(save_dir, make=True)
+    all_files = list(yield_dataframes_from_dir(results_dir))
+
+    if not all_files:
+        logging.warning("No data found. No plots will be generated.")
+        return
+
+    # --- MODE 1: Color-coded plots by target ---
+    if color_by_target:
+        data_to_plot = []
+        for df, df_name in all_files:
+            # Assumes last col is target, rest are features
+            melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+            # Sanitize target name for cleaner legend labels
+            melted_df['target'] = df_name.replace("Optimization_", "")
+            data_to_plot.append(melted_df)
+
+        long_df = pd.concat(data_to_plot, ignore_index=True)
+        features = long_df['feature'].unique()
+        logging.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+        for feature_name in features:
+            plt.figure(figsize=(12, 7))
+            feature_df = long_df[long_df['feature'] == feature_name]
+
+            sns.kdeplot(data=feature_df, x='value', hue='target', fill=True, alpha=0.1)
+
+            plt.title(f"Comparative Distribution for '{feature_name}'", fontsize=16)
+            plt.xlabel("Feature Value", fontsize=12)
+            plt.ylabel("Density", fontsize=12)
+            plt.grid(axis='y', alpha=0.5, linestyle='--')
+            plt.legend(title='Target')
+
+            sanitized_feature_name = sanitize_filename(feature_name)
+            plot_filename = output_path / f"Comparative_{sanitized_feature_name}.svg"
+            plt.savefig(plot_filename, bbox_inches='tight')
+            plt.close()
+
+    # --- MODE 2: Aggregate plot ---
+    else:
+        feature_distributions = defaultdict(list)
+        for df, _ in all_files:
+            feature_columns = df.iloc[:, :-1]
+            for feature_name in feature_columns:
+                feature_distributions[feature_name].extend(df[feature_name].tolist())
+
+        logging.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
+        for feature_name, values in feature_distributions.items():
+            plt.figure(figsize=(12, 7))
+            sns.histplot(x=values, kde=True, bins='auto', stat="density")
+
+            plt.title(f"Aggregate Distribution for '{feature_name}'", fontsize=16)
+            plt.xlabel("Feature Value", fontsize=12)
+            plt.ylabel("Density", fontsize=12)
+            plt.grid(axis='y', alpha=0.5, linestyle='--')
+
+            sanitized_feature_name = sanitize_filename(feature_name)
+            plot_filename = output_path / f"Aggregate_{sanitized_feature_name}.svg"
+            plt.savefig(plot_filename, bbox_inches='tight')
+            plt.close()
+
+    logging.info(f"✅ All plots saved successfully to: {output_path}")
+
+
+def info():
+    _script_info(__all__)
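A minimal sketch of the new post-hoc workflow, assuming `run_pso` has already written `Optimization_*.csv` files (the directory names here are hypothetical):

```python
from ml_tools.PSO_optimization import plot_optimal_feature_distributions

# Each CSV in results_dir holds one target's optimization runs:
# feature columns first, target column last (as the function assumes).
plot_optimal_feature_distributions(
    results_dir="pso_results",   # hypothetical directory of Optimization_*.csv files
    save_dir="pso_plots",        # one SVG per feature is written here
    color_by_target=True,        # overlay per-target KDEs; False pools all targets
)
```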
ml_tools/utilities.py CHANGED
@@ -86,7 +86,6 @@ def make_fullpath(
     return resolved
 
 
-
 def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
     """
     Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.