PyPI - dragon-ml-toolbox - Versions diffs - 10.11.1__py3-none-any.whl → 10.12.0__py3-none-any.whl - Mend

dragon-ml-toolbox 10.11.1__py3-none-any.whl → 10.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.11.1
+Version: 10.12.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-dragon_ml_toolbox-10.11.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-10.11.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+dragon_ml_toolbox-10.12.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-10.12.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
 ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
@@ -9,7 +9,7 @@ ml_tools/ML_datasetmaster.py,sha256=vqKZhCXsvN5yeRJdOKqMPh5OhY1xe6xlNjM3WoH5lys,
 ml_tools/ML_evaluation.py,sha256=6FB6S-aDDpFzQdrp3flBVECzEsHhMbQknYVGhHooEFs,16207
 ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
 ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=8UOMg9Qn8qtecUGfgnLRedX-lCWYwEs-C5RJ2m8mZM4,27544
+ml_tools/ML_models.py,sha256=JMFOuw4jtX5RtUFpkQWS8-dzDW0AwqYjbl67XRCVubA,27996
 ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
 ml_tools/ML_scaler.py,sha256=h2ymq5u953Lx60Qb38Y0mAWj85x9PbnP0xYNQ3pd8-w,7535
 ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
@@ -21,16 +21,16 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
-ml_tools/data_exploration.py,sha256=4McT2BR9muK4JVVTKUfvRyThe0m_o2vpy9RJ1f_1FeY,28692
+ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
 ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
 ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
 ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
-ml_tools/path_manager.py,sha256=CCZSlHpUiuaHsMAYcmMGZ9GvbHNbbrTqYFicgWz6pRs,17883
+ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
 ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
-dragon_ml_toolbox-10.11.1.dist-info/METADATA,sha256=x3e66l1-dXkoE6ldWAH77epdEMnqj6YAvSVKYDVFhHU,6969
-dragon_ml_toolbox-10.11.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-10.11.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-10.11.1.dist-info/RECORD,,
+dragon_ml_toolbox-10.12.0.dist-info/METADATA,sha256=dgxB7Ad4a5Zf1CPzLZFo5ny2Siotmsm2mWjQ8B7Nsa4,6969
+dragon_ml_toolbox-10.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.12.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.12.0.dist-info/RECORD,,

ml_tools/ML_models.py CHANGED Viewed

@@ -300,8 +300,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     sequence with a standard Transformer Encoder.
     """
     def __init__(self, *,
+                 in_features: int,
                  out_targets: int,
-                 numerical_indices: List[int],
                  categorical_map: Dict[int, int],
                  embedding_dim: int = 32,
                  num_heads: int = 8,
@@ -309,8 +309,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
                  dropout: float = 0.1):
         """
         Args:
+            in_features (int): The total number of columns in the input data (features).
             out_targets (int): Number of output targets (1 for regression).
-            numerical_indices (List[int]): Column indices for numerical features.
             categorical_map (Dict[int, int]): Maps categorical column index to its cardinality (number of unique categories).
             embedding_dim (int): The dimension for all feature embeddings. Must be divisible by num_heads.
             num_heads (int): The number of heads in the multi-head attention mechanism.
@@ -330,15 +330,25 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
         their cardinality (the number of unique categories) via the `categorical_map` parameter.
         **Ordinal & Binary Features** (e.g., 'Low/Medium/High', 'True/False'): Should be treated as **numerical**. Map them to numbers that
-        represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should be included in the
-        `numerical_indices` list.
+        represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should **NOT** be included in the
+        `categorical_map` parameter.
-        **Standard Numerical Features** (e.g., 'Age', 'Price'): Should be included in the `numerical_indices` list. It is highly recommended to
-        scale them before training.
+        **Standard Numerical and Continuous Features** (e.g., 'Age', 'Price'): It is highly recommended to scale them before training.
         """
         super().__init__()
+         # --- Validation ---
+        if categorical_map and max(categorical_map.keys()) >= in_features:
+            _LOGGER.error(f"A categorical index ({max(categorical_map.keys())}) is out of bounds for the provided input features ({in_features}).")
+            raise ValueError()
+        # --- Derive numerical indices ---
+        all_indices = set(range(in_features))
+        categorical_indices_set = set(categorical_map.keys())
+        numerical_indices = sorted(list(all_indices - categorical_indices_set))
         # --- Save configuration ---
+        self.in_features = in_features
         self.out_targets = out_targets
         self.numerical_indices = numerical_indices
         self.categorical_map = categorical_map
@@ -405,8 +415,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     def get_architecture_config(self) -> Dict[str, Any]:
         """Returns the full configuration of the model."""
         return {
+            'in_features': self.in_features,
             'out_targets': self.out_targets,
-            'numerical_indices': self.numerical_indices,
             'categorical_map': self.categorical_map,
             'embedding_dim': self.embedding_dim,
             'num_heads': self.num_heads,
@@ -416,11 +426,9 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     def __repr__(self) -> str:
         """Returns the developer-friendly string representation of the model."""
-        num_features = len(self.numerical_indices) + len(self.categorical_map)
         # Build the architecture string part-by-part
         parts = [
-            f"Tokenizer(features={num_features}, dim={self.embedding_dim})",
+            f"Tokenizer(features={self.in_features}, dim={self.embedding_dim})",
             "[CLS]",
             f"TransformerEncoder(layers={self.num_layers}, heads={self.num_heads})",
             f"PredictionHead(outputs={self.out_targets})"

ml_tools/data_exploration.py CHANGED Viewed

@@ -22,6 +22,7 @@ __all__ = [
     "drop_columns_with_missing_data",
     "drop_macro",
     "clean_column_names",
+    "encode_categorical_features",
     "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
@@ -29,7 +30,9 @@ __all__ = [
     "clip_outliers_single",
     "clip_outliers_multi",
     "match_and_filter_columns_by_regex",
-    "standardize_percentages"
+    "standardize_percentages",
+    "create_transformer_categorical_map",
+    "reconstruct_one_hot"
 ]
@@ -337,6 +340,90 @@ def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacemen
     return new_df
+def encode_categorical_features(
+    df: pd.DataFrame,
+    columns_to_encode: List[str],
+    encode_nulls: bool,
+    split_resulting_dataset: bool = True,
+    verbose: bool = True
+) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
+    """
+    Finds unique values in specified categorical columns, encodes them into integers,
+    and returns a dictionary containing the mappings for each column.
+    This function automates the label encoding process and generates a simple,
+    human-readable dictionary of the mappings.
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        columns_to_encode (List[str]): A list of column names to be encoded.
+        encode_nulls (bool): If True, encodes Null values as a distinct category
+            "Other" with a value of 0. Other categories start from 1.
+            If False, Nulls are ignored.
+        split_resulting_dataset (bool): If True, returns two separate DataFrames:
+            one with non-categorical columns and one with the encoded columns.
+            If False, returns a single DataFrame with all columns.
+        verbose (bool): If True, prints encoding progress.
+    Returns:
+        Tuple:
+        - Dict[str, Dict[str, int]]: A dictionary where each key is a column name and the value is its category-to-integer mapping.
+        - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
+        - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
+    """
+    df_encoded = df.copy()
+    # Validate columns
+    valid_columns = [col for col in columns_to_encode if col in df_encoded.columns]
+    missing_columns = set(columns_to_encode) - set(valid_columns)
+    if missing_columns:
+        _LOGGER.warning(f"Columns not found and will be skipped: {list(missing_columns)}")
+    mappings: Dict[str, Dict[str, int]] = {}
+    _LOGGER.info(f"Encoding {len(valid_columns)} categorical column(s).")
+    for col_name in valid_columns:
+        has_nulls = df_encoded[col_name].isnull().any()
+        if encode_nulls and has_nulls:
+            # Handle nulls: "Other" -> 0, other categories -> 1, 2, 3...
+            categories = sorted([str(cat) for cat in df_encoded[col_name].dropna().unique()])
+            # Start mapping from 1 for non-null values
+            mapping = {category: i + 1 for i, category in enumerate(categories)}
+            # Apply mapping and fill remaining NaNs with 0
+            mapped_series = df_encoded[col_name].astype(str).map(mapping)
+            df_encoded[col_name] = mapped_series.fillna(0).astype(int)
+            # Create the complete user-facing map including "Other"
+            user_mapping = {**mapping, "Other": 0}
+            mappings[col_name] = user_mapping
+        else:
+            # ignore nulls
+            categories = sorted([str(cat) for cat in df_encoded[col_name].dropna().unique()])
+            mapping = {category: i for i, category in enumerate(categories)}
+            df_encoded[col_name] = df_encoded[col_name].astype(str).map(mapping)
+            mappings[col_name] = mapping
+        if verbose:
+            cardinality = len(mappings[col_name])
+            print(f"  - Encoded '{col_name}' with {cardinality} unique values.")
+    # Handle the dataset splitting logic
+    if split_resulting_dataset:
+        df_categorical = df_encoded[valid_columns].to_frame()
+        df_non_categorical = df.drop(columns=valid_columns)
+        return mappings, df_non_categorical, df_categorical
+    else:
+        return mappings, df_encoded, None
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
     """
     Splits a DataFrame's columns into features and targets.
@@ -766,6 +853,141 @@ def standardize_percentages(
     return df_copy
+def create_transformer_categorical_map(
+    df: pd.DataFrame,
+    mappings: Dict[str, Dict[str, int]],
+    verbose: bool = True
+) -> Dict[int, int]:
+    """
+    Creates the `categorical_map` required by a `TabularTransformer` model.
+    This function should be called late in the preprocessing pipeline, after all
+    column additions, deletions, or reordering have occurred. It uses the final
+    DataFrame's column order to map the correct column index to its cardinality.
+    Args:
+        df (pd.DataFrame): The final, processed DataFrame.
+        mappings (Dict[str, Dict[str, int]]): The mappings dictionary generated by
+          `encode_categorical_features`, containing the category-to-integer
+          mapping for each categorical column.
+        verbose (bool): If True, prints mapping progress.
+    Returns:
+        (Dict[int, int]): The final `categorical_map` for the transformer,
+          mapping each column's current index to its cardinality (e.g., {0: 3}).
+    """
+    transformer_map = {}
+    categorical_column_names = mappings.keys()
+    _LOGGER.info("Creating categorical map for TabularTransformer.")
+    for col_name in categorical_column_names:
+        if col_name in df.columns:
+            col_idx = df.columns.get_loc(col_name)
+            # Get cardinality directly from the length of the mapping dictionary
+            cardinality = len(mappings[col_name])
+            transformer_map[col_idx] = cardinality
+            if verbose:
+                print(f"  - Mapping column '{col_name}' at index {col_idx} with cardinality {cardinality}.")
+        else:
+            _LOGGER.warning(f"Categorical column '{col_name}' not found in the final DataFrame. Skipping.")
+    return transformer_map
+def reconstruct_one_hot(
+    df: pd.DataFrame,
+    base_feature_names: List[str],
+    separator: str = '_',
+    drop_original: bool = True
+) -> pd.DataFrame:
+    """
+    Reconstructs original categorical columns from a one-hot encoded DataFrame.
+    This function identifies groups of one-hot encoded columns based on a common
+    prefix (base feature name) and a separator. It then collapses each group
+    into a single column containing the categorical value.
+    Args:
+        df (pd.DataFrame):
+            The input DataFrame with one-hot encoded columns.
+        base_features (List[str]):
+            A list of base feature names to reconstruct. For example, if you have
+            columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
+        separator (str):
+            The character separating the base name from the categorical value in
+            the column names (e.g., '_' in 'B_a').
+        drop_original (bool):
+            If True, the original one-hot encoded columns will be dropped from
+            the returned DataFrame.
+    Returns:
+        pd.DataFrame:
+            A new DataFrame with the specified one-hot encoded features
+            reconstructed into single categorical columns.
+    <br>
+    ## Note:
+    This function is designed to be robust, but users should be aware of two key edge cases:
+    1.  **Ambiguous Base Feature Prefixes**: If `base_feature_names` list contains names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. The function will match columns greedily. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).
+    2.  **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
+    """
+    if not isinstance(df, pd.DataFrame):
+        _LOGGER.error("Input must be a pandas DataFrame.")
+        raise TypeError()
+    new_df = df.copy()
+    all_ohe_cols_to_drop = []
+    reconstructed_count = 0
+    _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")
+    for base_name in base_feature_names:
+        # Regex to find all columns belonging to this base feature.
+        pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
+        # Find matching columns
+        ohe_cols = [col for col in df.columns if re.match(pattern, col)]
+        if not ohe_cols:
+            _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
+            continue
+        # For each row, find the column name with the maximum value (which is 1)
+        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
+        # Extract the categorical value (the suffix) from the column name
+        # Use n=1 in split to handle cases where the category itself might contain the separator
+        new_column_values = reconstructed_series.str.split(separator, n=1).str[1]
+        # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
+        # In these cases, idxmax returns the first column name, but the sum of values is 0.
+        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0
+        new_column_values.loc[all_zero_mask] = np.nan
+        # Assign the new reconstructed column to the DataFrame
+        new_df[base_name] = new_column_values
+        all_ohe_cols_to_drop.extend(ohe_cols)
+        reconstructed_count += 1
+        print(f"  - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
+    if drop_original and all_ohe_cols_to_drop:
+        # Drop the original OHE columns, ensuring no duplicates in the drop list
+        unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
+        new_df.drop(columns=unique_cols_to_drop, inplace=True)
+        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
+    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
+    return new_df
 def _validate_columns(df: pd.DataFrame, columns: list[str]):
     valid_columns = [column for column in columns if column in df.columns]
     return valid_columns

ml_tools/path_manager.py CHANGED Viewed

@@ -248,26 +248,33 @@ class PathManager:
             _LOGGER.error(f"'{type(self).__name__}' object has no attribute or path key '{sanitized_name}'")
             raise AttributeError()
-    def __setattr__(self, name: str, value: Union[str, Path]):
+    def __setattr__(self, name: str, value: Union[str, Path, bool, dict, str, int, tuple]):
         """Allows attribute-style setting of paths, e.g., PM.data = 'path/to/data'."""
-        # Check for internal attributes
+        # Check for internal attributes, which are set directly on the object.
         if name.startswith('_'):
-            if hasattr(self, '_initialized') and self._initialized:
-                self._check_underscore_key(name)
-                return
-            else:
-                # During initialization, allow private attributes to be set.
-                super().__setattr__(name, value)
+            # This check prevents setting new private attributes after __init__ is done.
+            is_initialized = self.__dict__.get('_initialized', False)
+            if is_initialized:
+                _LOGGER.error(f"Cannot set private attribute '{name}' after initialization.")
+                raise AttributeError()
+            super().__setattr__(name, value)
             return
-        # Block overwriting of existing methods/attributes
+        # Sanitize the key for the public path.
         sanitized_name = self._sanitize_key(name)
         self._check_underscore_key(sanitized_name)
-        if hasattr(self, sanitized_name):
+        # Prevent overwriting existing methods (e.g., PM.status = 'foo').
+        # This check looks at the class, not the instance therefore won't trigger __getattr__.
+        if hasattr(self.__class__, sanitized_name):
             _LOGGER.error(f"Cannot overwrite existing attribute or method '{sanitized_name}' ({name}).")
             raise AttributeError()
+        if not isinstance(value, (str, Path)):
+            _LOGGER.error(f"Cannot assign type '{type(value).__name__}' to a path. Must be str or Path.")
+            raise TypeError
-        # If all checks pass, treat it as a public path.
+        # If all checks pass, treat it as a public path and store it in the _paths dictionary.
         self._paths[sanitized_name] = Path(value)

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md RENAMED Viewed

File without changes

{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

dragon-ml-toolbox 10.11.1__py3-none-any.whl → 10.12.0__py3-none-any.whl

dragon-ml-toolbox 10.11.1__py3-none-any.whl → 10.12.0__py3-none-any.whl