dragon-ml-toolbox 12.1.0__py3-none-any.whl → 12.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/RECORD +9 -9
- ml_tools/ML_optimization.py +73 -15
- ml_tools/data_exploration.py +107 -3
- ml_tools/optimization_tools.py +1 -1
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-12.1.0.dist-info → dragon_ml_toolbox-12.3.0.dist-info}/RECORD CHANGED

@@ -1,5 +1,5 @@
-dragon_ml_toolbox-12.
-dragon_ml_toolbox-12.
+dragon_ml_toolbox-12.3.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-12.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
 ml_tools/ETL_cleaning.py,sha256=PLRSR-VYnt1nNT9XrcWq40SE0VzHCw7DQ8v9czfSQsU,20366
 ml_tools/ETL_engineering.py,sha256=l0I6Og9o4s6EODdk0kZXjbbC-a3vVPYy1FopP2BkQSQ,54909
 ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
@@ -10,7 +10,7 @@ ml_tools/ML_evaluation.py,sha256=tLswOPgH4G1KExSMn0876YtNkbxPh-W3J4MYOjomMWA,162
 ml_tools/ML_evaluation_multi.py,sha256=6OZyQ4SM9ALh38mOABmiHgIQDWcovsD_iOo7Bg9YZCE,12516
 ml_tools/ML_inference.py,sha256=ymFvncFsU10PExq87xnEj541DKV5ck0nMuK8ToJHzVQ,23067
 ml_tools/ML_models.py,sha256=pSCV6KbmVnPZr49Kbyg7g25CYaWBWJr6IinBHKgVKGw,28042
-ml_tools/ML_optimization.py,sha256
+ml_tools/ML_optimization.py,sha256=-Rb7ffp-VS6Bv5U0Dw6nSTNp2bGu7BaBQi04mTmSdEE,22942
 ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
 ml_tools/ML_simple_optimization.py,sha256=X96zX6XPu3ggrcOapuG69jsiZJczJNihS1rcwi9OsBI,18159
 ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
@@ -24,18 +24,18 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/custom_logger.py,sha256=OZqG7FR_UE6byzY3RDmlj08a336ZU-4DzNBMPLr_d5c,5881
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=H-cHp6jL4u4Kl2L_fktcCdQWRdAzTC6kwFCrOHnzLNA,46549
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
 ml_tools/ensemble_learning.py,sha256=aTPeKthO4zRWBEaQJOUj8jEqVHiHjjOMXuiEWjI9NxM,21946
 ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
 ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=
+ml_tools/optimization_tools.py,sha256=ewYMAdSGlFxYALAGFXn-MsHpvW_Sbx6I-sKg9Kp6rB8,13533
 ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
 ml_tools/serde.py,sha256=k0qAwfMf13lVBQSgq5u9MSXEoo31iOA2-Ncm8XgMCMI,3974
 ml_tools/utilities.py,sha256=gef62GLK7ev5BWkkQekeJoVZqwf2mIuOlOfyCw6WdtE,13882
-dragon_ml_toolbox-12.
-dragon_ml_toolbox-12.
-dragon_ml_toolbox-12.
-dragon_ml_toolbox-12.
+dragon_ml_toolbox-12.3.0.dist-info/METADATA,sha256=999BzvvR1VfwwEETUJhMLJk6a3EtcPuITG_QXE2NP_c,6166
+dragon_ml_toolbox-12.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-12.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-12.3.0.dist-info/RECORD,,
ml_tools/ML_optimization.py CHANGED

@@ -24,6 +24,7 @@ from .math_utilities import discretize_categorical_values
 
 __all__ = [
     "MLOptimizer",
+    "FitnessEvaluator",
     "create_pytorch_problem",
     "run_optimization"
 ]
@@ -33,8 +34,8 @@ class MLOptimizer:
     """
     A wrapper class for setting up and running EvoTorch optimization tasks.
 
-    This class combines the functionality of `create_pytorch_problem
-    `run_optimization`
+    This class combines the functionality of `FitnessEvaluator`, `create_pytorch_problem`, and
+    `run_optimization` into a single, streamlined workflow.
 
     SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.
 
@@ -91,9 +92,16 @@ class MLOptimizer:
                 False if it starts at 1 (e.g., [1, 2, 3]).
             **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
         """
+        # Make a fitness function
+        self.evaluator = FitnessEvaluator(
+            inference_handler=inference_handler,
+            categorical_index_map=categorical_index_map,
+            discretize_start_at_zero=discretize_start_at_zero
+        )
+
         # Call the existing factory function to get the problem and searcher factory
         self.problem, self.searcher_factory = create_pytorch_problem(
-
+            evaluator=self.evaluator,
             bounds=bounds,
             task=task,
             algorithm=algorithm,
@@ -144,10 +152,67 @@ class MLOptimizer:
             categorical_mappings=self.categorical_mappings,
             discretize_start_at_zero=self.discretize_start_at_zero
         )
+
+
+class FitnessEvaluator:
+    """
+    A callable class that wraps the PyTorch model inference handler and performs
+    on-the-fly discretization for the EvoTorch fitness function.
+
+    This class is automatically instantiated by MLOptimizer and passed to
+    create_pytorch_problem, encapsulating the evaluation logic.
+    """
+    def __init__(self,
+                 inference_handler: PyTorchInferenceHandler,
+                 categorical_index_map: Optional[Dict[int, int]] = None,
+                 discretize_start_at_zero: bool = True):
+        """
+        Initializes the fitness evaluator.
+
+        Args:
+            inference_handler (PyTorchInferenceHandler):
+                An initialized inference handler containing the model.
+            categorical_index_map (Dict[int, int] | None):
+                Maps {column_index: cardinality} for discretization.
+            discretize_start_at_zero (bool):
+                True if discrete encoding starts at 0.
+        """
+        self.inference_handler = inference_handler
+        self.categorical_index_map = categorical_index_map
+        self.discretize_start_at_zero = discretize_start_at_zero
+
+        # Expose the device
+        self.device = self.inference_handler.device
+
+    def __call__(self, solution_tensor: torch.Tensor) -> torch.Tensor:
+        """
+        This is the fitness function EvoTorch will call.
+
+        It receives a batch of continuous solutions, discretizes the
+        categorical ones, and returns the model's predictions.
+        """
+        # Clone to avoid modifying the optimizer's internal state (SNES, CEM, GA)
+        processed_tensor = solution_tensor.clone()
+
+        if self.categorical_index_map:
+            for col_idx, cardinality in self.categorical_index_map.items():
+                # 1. Round (using torch.floor(x + 0.5) for "round half up" behavior)
+                rounded_col = torch.floor(processed_tensor[:, col_idx] + 0.5)
+
+                # 2. Determine clamping bounds
+                min_bound = 0 if self.discretize_start_at_zero else 1
+                max_bound = cardinality - 1 if self.discretize_start_at_zero else cardinality
+
+                # 3. Clamp the values and update the processed tensor
+                processed_tensor[:, col_idx] = torch.clamp(rounded_col, min_bound, max_bound)
+
+        # Use the *processed_tensor* for prediction
+        predictions = self.inference_handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
+        return predictions.flatten()
 
 
 def create_pytorch_problem(
-
+    evaluator: FitnessEvaluator,
     bounds: Tuple[List[float], List[float]],
     task: Literal["min", "max"],
     algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
@@ -162,7 +227,7 @@ def create_pytorch_problem(
     The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
 
     Args:
-
+        evaluator (FitnessEvaluator): A callable class that wraps the model inference and handles on-the-fly discretization.
        bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
            Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
        task (str): The optimization goal, either "minimize" or "maximize".
@@ -180,20 +245,13 @@ def create_pytorch_problem(
     upper_bounds = list(bounds[1])
 
     solution_length = len(lower_bounds)
-    device =
+    device = evaluator.device
 
-    # Define the fitness function that EvoTorch will call.
-    def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
-        # Directly use the continuous-valued tensor from the optimizer for prediction
-        predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
-        return predictions.flatten()
-
-
     # Create the Problem instance.
     if algorithm == "CEM" or algorithm == "SNES":
         problem = evotorch.Problem(
             objective_sense=task,
-            objective_func=
+            objective_func=evaluator,
             solution_length=solution_length,
             initial_bounds=(lower_bounds, upper_bounds),
             device=device,
@@ -219,7 +277,7 @@ def create_pytorch_problem(
     elif algorithm == "Genetic":
         problem = evotorch.Problem(
             objective_sense=task,
-            objective_func=
+            objective_func=evaluator,
             solution_length=solution_length,
             bounds=(lower_bounds, upper_bounds),
             device=device,
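The key behavioral change in this file is that `create_pytorch_problem` no longer builds its own inline `fitness_func`; it now receives a `FitnessEvaluator` whose `__call__` rounds and clamps the categorical columns before running inference, so SNES/CEM/Genetic keep searching in a continuous space while the model only ever sees valid category codes. The snippet below is a minimal standalone sketch of just that discretization step using plain `torch` on a dummy batch; the column index, cardinality, and values are illustrative and not taken from the package.

```python
import torch

# Assumed, illustrative setup: column 1 is categorical with 3 levels encoded 0..2.
categorical_index_map = {1: 3}
discretize_start_at_zero = True

solutions = torch.tensor([[0.40,  2.7, -1.2],
                          [0.90, -0.6,  0.3]])

processed = solutions.clone()  # leave the optimizer's tensor untouched
for col_idx, cardinality in categorical_index_map.items():
    rounded = torch.floor(processed[:, col_idx] + 0.5)          # round half up
    lo = 0 if discretize_start_at_zero else 1
    hi = cardinality - 1 if discretize_start_at_zero else cardinality
    processed[:, col_idx] = torch.clamp(rounded, lo, hi)        # clamp to valid codes

print(processed)
# tensor([[ 0.4000,  2.0000, -1.2000],
#         [ 0.9000,  0.0000,  0.3000]])
```

Continuous columns pass through unchanged; only the mapped categorical columns are rounded and clamped before the batch is handed to the inference handler.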
ml_tools/data_exploration.py CHANGED

@@ -3,7 +3,7 @@ from pandas.api.types import is_numeric_dtype
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from typing import Union, Literal, Dict, Tuple, List, Optional
+from typing import Union, Literal, Dict, Tuple, List, Optional, Any
 from pathlib import Path
 import re
 
@@ -33,7 +33,8 @@ __all__ = [
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "create_transformer_categorical_map",
-    "reconstruct_one_hot"
+    "reconstruct_one_hot",
+    "reconstruct_binary"
 ]
 
 
@@ -1081,7 +1082,110 @@ def reconstruct_one_hot(
     unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
     new_df.drop(columns=unique_cols_to_drop, inplace=True)
     _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
-
+
+    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
+
+    return new_df
+
+
+def reconstruct_binary(
+    df: pd.DataFrame,
+    reconstruction_map: Dict[str, Tuple[str, Any, Any]],
+    drop_original: bool = True,
+    verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Reconstructs new categorical columns from existing binary (0/1) columns.
+
+    Used to reverse a binary encoding by mapping 0 and 1 back to
+    descriptive categorical labels.
+
+    Args:
+        df (pd.DataFrame):
+            The input DataFrame.
+        reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
+            A dictionary defining the reconstructions.
+            Format:
+                { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
+            Example:
+                {
+                    "Sex": ("Sex_male", "Female", "Male"),
+                    "Smoker": ("Is_Smoker", "No", "Yes")
+                }
+        drop_original (bool):
+            If True, the original binary source columns (e.g., "Sex_male")
+            will be dropped from the returned DataFrame.
+        verbose (bool):
+            If True, prints the details of each reconstruction.
+
+    Returns:
+        pd.DataFrame:
+            A new DataFrame with the reconstructed categorical columns.
+
+    Raises:
+        TypeError: If `df` is not a pandas DataFrame.
+        ValueError: If `reconstruction_map` is not a dictionary or a
+            configuration is invalid (e.g., column name collision).
+
+    Notes:
+        - The function operates on a copy of the DataFrame.
+        - Rows with `NaN` in the source column will have `NaN` in the
+          new column.
+        - Values in the source column other than 0 or 1 (e.g., 2) will
+          result in `NaN` in the new column.
+    """
+    if not isinstance(df, pd.DataFrame):
+        _LOGGER.error("Input must be a pandas DataFrame.")
+        raise TypeError()
+
+    if not isinstance(reconstruction_map, dict):
+        _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
+        raise ValueError()
+
+    new_df = df.copy()
+    source_cols_to_drop: List[str] = []
+    reconstructed_count = 0
+
+    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")
+
+    for new_col_name, config in reconstruction_map.items():
+
+        # --- 1. Validation ---
+        if not (isinstance(config, tuple) and len(config) == 3):
+            _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple. Skipping.")
+            raise ValueError()
+
+        source_col, label_for_0, label_for_1 = config
+
+        if source_col not in new_df.columns:
+            _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found. Skipping.")
+            raise ValueError()
+
+        if new_col_name in new_df.columns and verbose:
+            _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")
+
+        if new_col_name == source_col:
+            _LOGGER.error(f"New column name '{new_col_name}' cannot be the same as source column '{source_col}'.")
+            raise ValueError()
+
+        # --- 2. Reconstruction ---
+        # .map() handles 0, 1, preserves NaNs, and converts any other value to NaN.
+        mapping_dict = {0: label_for_0, 1: label_for_1}
+        new_df[new_col_name] = new_df[source_col].map(mapping_dict)
+
+        # --- 3. Logging/Tracking ---
+        source_cols_to_drop.append(source_col)
+        reconstructed_count += 1
+        if verbose:
+            print(f" - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")
+
+    # --- 4. Cleanup ---
+    if drop_original and source_cols_to_drop:
+        # Use set() to avoid duplicates if the same source col was used
+        unique_cols_to_drop = list(set(source_cols_to_drop))
+        new_df.drop(columns=unique_cols_to_drop, inplace=True)
+        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")
+
 
     _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
 
     return new_df
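The core of the new `reconstruct_binary` helper is a per-column `Series.map` with `{0: label_for_0, 1: label_for_1}`, which keeps `NaN` as `NaN` and turns any other value into `NaN`. Below is a small standalone pandas sketch of that mapping behavior only; the column name and labels are made up for illustration and the full function additionally validates the config, logs, and optionally drops the source column.

```python
import numpy as np
import pandas as pd

# Toy frame: a binary-encoded column with a missing value and an out-of-range value.
df = pd.DataFrame({"Sex_male": [1, 0, np.nan, 2]})

# Equivalent of the entry {"Sex": ("Sex_male", "Female", "Male")}:
df["Sex"] = df["Sex_male"].map({0: "Female", 1: "Male"})

print(df)
#    Sex_male     Sex
# 0       1.0    Male
# 1       0.0  Female
# 2       NaN     NaN
# 3       2.0     NaN
```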
ml_tools/optimization_tools.py CHANGED

@@ -66,7 +66,7 @@ def create_optimization_bounds(
     # 1. Read header and determine feature names
     full_csv_path = make_fullpath(csv_path, enforce="file")
     try:
-        df_header = pd.read_csv(full_csv_path, nrows=0)
+        df_header = pd.read_csv(full_csv_path, nrows=0, encoding="utf-8")
     except Exception as e:
         _LOGGER.error(f"Failed to read header from CSV: {e}")
         raise
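The only change here is an explicit `encoding="utf-8"` on the header-only read that `create_optimization_bounds` uses to collect feature names. A minimal sketch of that pattern, with a placeholder CSV path:

```python
import pandas as pd

# Read only the header row (nrows=0) with an explicit encoding;
# "features.csv" is a placeholder path, not one used by the package.
df_header = pd.read_csv("features.csv", nrows=0, encoding="utf-8")
feature_names = df_header.columns.to_list()
print(feature_names)
```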
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|