dragon-ml-toolbox 12.8.0__py3-none-any.whl → 12.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 12.8.0
3
+ Version: 12.9.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- dragon_ml_toolbox-12.8.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-12.8.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
1
+ dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-12.9.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
3
3
  ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
4
4
  ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
5
5
  ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
@@ -24,7 +24,7 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
24
24
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
25
25
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
26
26
  ml_tools/custom_logger.py,sha256=xot-VeZFigKjcVxADgzvI54vZO_MqMMejo7JmDED8Xo,5892
27
- ml_tools/data_exploration.py,sha256=joaJPgXeweYMAn-xnMOzUIE8VvKvbEPenVjVHM21U4c,46914
27
+ ml_tools/data_exploration.py,sha256=9Bbppxi6WWSAotB1tCwwWPOEkx7Vs-yvCAhesVplIBY,50618
28
28
  ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
29
29
  ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
30
30
  ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
@@ -35,7 +35,7 @@ ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CR
35
35
  ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
36
36
  ml_tools/serde.py,sha256=UIshIesHRFmxr8F6B3LxGG8bYc1HHK-nlE3kENSZL18,5288
37
37
  ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
38
- dragon_ml_toolbox-12.8.0.dist-info/METADATA,sha256=zbA_0bdkX_96eSpkx7QGZelCTKrckDXUdvmHE4oCNMI,6166
39
- dragon_ml_toolbox-12.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- dragon_ml_toolbox-12.8.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
- dragon_ml_toolbox-12.8.0.dist-info/RECORD,,
38
+ dragon_ml_toolbox-12.9.1.dist-info/METADATA,sha256=oQWsgVpaYAb7-91f2DpCuMUNCmP1OuHmwzMCeSgVQU8,6166
39
+ dragon_ml_toolbox-12.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
+ dragon_ml_toolbox-12.9.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
41
+ dragon_ml_toolbox-12.9.1.dist-info/RECORD,,
@@ -346,6 +346,7 @@ def encode_categorical_features(
346
346
  df: pd.DataFrame,
347
347
  columns_to_encode: List[str],
348
348
  encode_nulls: bool,
349
+ null_label: str = "Other",
349
350
  split_resulting_dataset: bool = True,
350
351
  verbose: bool = True
351
352
  ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
@@ -359,13 +360,14 @@ def encode_categorical_features(
359
360
  Args:
360
361
  df (pd.DataFrame): The input DataFrame.
361
362
  columns_to_encode (List[str]): A list of column names to be encoded.
362
- encode_nulls (bool): If True, encodes Null values as a distinct category
363
- "Other" with a value of 0. Other categories start from 1.
364
- If False, Nulls are ignored and categories start from 0.
365
- Note: Use False when encoding binary values with missing entries.
366
- split_resulting_dataset (bool): If True, returns two separate DataFrames:
367
- one with non-categorical columns and one with the encoded columns.
368
- If False, returns a single DataFrame with all columns.
363
+ encode_nulls (bool):
364
+ - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
365
+ - If False, Nulls are ignored and categories start from 0.
366
+
367
+ null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
368
+ split_resulting_dataset (bool):
369
+ - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
370
+ - If False, returns a single DataFrame with all columns.
369
371
  verbose (bool): If True, prints encoding progress.
370
372
 
371
373
  Returns:
@@ -376,6 +378,9 @@ def encode_categorical_features(
376
378
  - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
377
379
 
378
380
  - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
381
+
382
+ ## **Note:**
383
+ Use `encode_nulls=False` when encoding binary values with missing entries; otherwise a malformed encoding will be returned silently.
379
384
  """
380
385
  df_encoded = df.copy()
381
386
 
@@ -401,8 +406,16 @@ def encode_categorical_features(
401
406
  mapped_series = df_encoded[col_name].astype(str).map(mapping)
402
407
  df_encoded[col_name] = mapped_series.fillna(0).astype(int)
403
408
 
409
+ # --- Validate nulls category ---
410
+ # Ensure the key for 0 doesn't collide with a real category.
411
+ if null_label in mapping.keys():
412
+ # COLLISION! null_label is a real category
413
+ original_label = null_label
414
+ null_label = "__NULL__" # fallback
415
+ _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
416
+
404
417
  # Create the complete user-facing map including "Other"
405
- user_mapping = {**mapping, "Other": 0}
418
+ user_mapping = {**mapping, null_label: 0}
406
419
  mappings[col_name] = user_mapping
407
420
  else:
408
421
  # ignore nulls
@@ -1009,9 +1022,11 @@ def create_transformer_categorical_map(
1009
1022
 
1010
1023
  def reconstruct_one_hot(
1011
1024
  df: pd.DataFrame,
1012
- base_feature_names: List[str],
1025
+ features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
1013
1026
  separator: str = '_',
1014
- drop_original: bool = True
1027
+ baseline_category_name: str = "Other",
1028
+ drop_original: bool = True,
1029
+ verbose: bool = True
1015
1030
  ) -> pd.DataFrame:
1016
1031
  """
1017
1032
  Reconstructs original categorical columns from a one-hot encoded DataFrame.
@@ -1023,12 +1038,26 @@ def reconstruct_one_hot(
1023
1038
  Args:
1024
1039
  df (pd.DataFrame):
1025
1040
  The input DataFrame with one-hot encoded columns.
1026
- base_features (List[str]):
1027
- A list of base feature names to reconstruct. For example, if you have
1028
- columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
1041
+ features_to_reconstruct (List[str | Tuple[str, str | None]]):
1042
+ A list defining the features to reconstruct. This list can contain:
1043
+
1044
+ - A string: (e.g., "Color")
1045
+ This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
1046
+ - A tuple: (e.g., ("Pet", "Dog"))
1047
+ This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
1048
+ - A tuple with None: (e.g., ("Size", None))
1049
+ This reconstructs 'Size' and maps all-zero rows to NaN.
1050
+ Example:
1051
+ [
1052
+ "Mood", # All-zeros -> "Other"
1053
+ ("Color", "Red"), # All-zeros -> "Red"
1054
+ ("Size", None) # All-zeros -> NaN
1055
+ ]
1029
1056
  separator (str):
1030
1057
  The character separating the base name from the categorical value in
1031
1058
  the column names (e.g., '_' in 'B_a').
1059
+ baseline_category_name (str):
1060
+ The baseline category name to use by default if it is not explicitly provided.
1032
1061
  drop_original (bool):
1033
1062
  If True, the original one-hot encoded columns will be dropped from
1034
1063
  the returned DataFrame.
@@ -1051,14 +1080,47 @@ def reconstruct_one_hot(
1051
1080
  if not isinstance(df, pd.DataFrame):
1052
1081
  _LOGGER.error("Input must be a pandas DataFrame.")
1053
1082
  raise TypeError()
1083
+
1084
+ if not isinstance(baseline_category_name, str):
1085
+ _LOGGER.error("The baseline_category must be a string.")
1086
+ raise TypeError()
1054
1087
 
1055
1088
  new_df = df.copy()
1056
1089
  all_ohe_cols_to_drop = []
1057
1090
  reconstructed_count = 0
1058
-
1059
- _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")
1060
-
1061
- for base_name in base_feature_names:
1091
+
1092
+ # --- 1. Parse and validate the reconstruction config ---
1093
+ # This normalizes the input into a clean {base_name: baseline_val} dict
1094
+ reconstruction_config: Dict[str, Optional[str]] = {}
1095
+ try:
1096
+ for item in features_to_reconstruct:
1097
+ if isinstance(item, str):
1098
+ # Case 1: "Color"
1099
+ base_name = item
1100
+ baseline_val = baseline_category_name
1101
+ elif isinstance(item, tuple) and len(item) == 2:
1102
+ # Case 2: ("Pet", "dog") or ("Size", None)
1103
+ base_name, baseline_val = item
1104
+ if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
1105
+ _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
1106
+ raise ValueError()
1107
+ else:
1108
+ _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
1109
+ raise ValueError()
1110
+
1111
+ if base_name in reconstruction_config and verbose:
1112
+ _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
1113
+
1114
+ reconstruction_config[base_name] = baseline_val
1115
+
1116
+ except Exception as e:
1117
+ _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
1118
+ raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
1119
+
1120
+ _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
1121
+
1122
+ # Main logic
1123
+ for base_name, baseline_category in reconstruction_config.items():
1062
1124
  # Regex to find all columns belonging to this base feature.
1063
1125
  pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
1064
1126
 
@@ -1070,24 +1132,34 @@ def reconstruct_one_hot(
1070
1132
  continue
1071
1133
 
1072
1134
  # For each row, find the column name with the maximum value (which is 1)
1073
- reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
1135
+ reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
1074
1136
 
1075
1137
  # Extract the categorical value (the suffix) from the column name
1076
1138
  # Use n=1 in split to handle cases where the category itself might contain the separator
1077
1139
  new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
1078
1140
 
1079
- # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
1080
- # In these cases, idxmax returns the first column name, but the sum of values is 0.
1081
- all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
1082
- new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1141
+ # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
1142
+ all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
1143
+
1144
+ if baseline_category is not None:
1145
+ # A baseline category was provided
1146
+ new_column_values.loc[all_zero_mask] = baseline_category
1147
+ else:
1148
+ # No baseline provided: assign NaN
1149
+ new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1150
+
1151
+ if verbose:
1152
+ print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
1083
1153
 
1084
1154
  # Assign the new reconstructed column to the DataFrame
1085
1155
  new_df[base_name] = new_column_values
1086
1156
 
1087
1157
  all_ohe_cols_to_drop.extend(ohe_cols)
1088
1158
  reconstructed_count += 1
1089
- print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1159
+ if verbose:
1160
+ print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1090
1161
 
1162
+ # Cleanup
1091
1163
  if drop_original and all_ohe_cols_to_drop:
1092
1164
  # Drop the original OHE columns, ensuring no duplicates in the drop list
1093
1165
  unique_cols_to_drop = list(set(all_ohe_cols_to_drop))