dragon-ml-toolbox 8.0.0__py3-none-any.whl → 8.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +47 -10
- ml_tools/data_exploration.py +80 -2
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/RECORD
CHANGED

```diff
@@ -1,6 +1,6 @@
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-ml_tools/ETL_engineering.py,sha256=
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
 ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
@@ -21,7 +21,7 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
 ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
 ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
 ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
+dragon_ml_toolbox-8.2.0.dist-info/METADATA,sha256=C1rjTnTNSj6VI2khy7Xl1VjQ__MP6-b43x9RIQCHY3E,6778
+dragon_ml_toolbox-8.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-8.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-8.2.0.dist-info/RECORD,,
```
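Each RECORD entry follows the wheel format `path,sha256=<digest>,<size>`: the digest is the file's SHA-256 hash, urlsafe-base64-encoded with trailing `=` padding stripped, and the size is in bytes. A minimal sketch of reproducing an entry locally (the `record_entry` helper is illustrative, not part of this package):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD-style line: path, urlsafe-base64 SHA-256
    digest (trailing '=' padding stripped), and file size in bytes."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    digest_str = digest.rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest_str},{len(data)}"

# e.g. record_entry("ml_tools/ETL_engineering.py") should reproduce
# "ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911"
```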
ml_tools/ETL_engineering.py
CHANGED
```diff
@@ -3,7 +3,6 @@ import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import warnings
 
 
 __all__ = [
@@ -13,6 +12,7 @@ __all__ = [
     "DataProcessor",
     "BinaryTransformer",
     "MultiBinaryDummifier",
+    "AutoDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -277,16 +277,32 @@ class DataProcessor:
                 processed_columns.append(result.alias(output_col_spec))
 
             elif isinstance(result, pl.DataFrame):
-
-
-
-
-
-
-
+                # 1. Handle list-based renaming
+                if isinstance(output_col_spec, list):
+                    if len(result.columns) != len(output_col_spec):
+                        raise ValueError(
+                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
+                            f"but recipe specifies {len(output_col_spec)} output names."
+                        )
+
+                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                    processed_columns.extend(renamed_df.get_columns())
+
+                # 2. Handle a string prefix for AutoDummifier
+                elif isinstance(output_col_spec, str):
+                    prefix = output_col_spec
+                    # Replace the original name part with the desired prefix.
+                    new_names = {
+                        col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
+                    }
+                    renamed_df = result.rename(new_names)
+                    processed_columns.extend(renamed_df.get_columns())
 
-
-
+                else:
+                    raise TypeError(
+                        f"Function for '{input_col_name}' returned a DataFrame, "
+                        f"so 'output_col' must be a list of names or a string prefix."
+                    )
 
             else:
                 raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
```
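The new `DataProcessor` branch accepts two ways to name a multi-column result: an explicit list of output names (positional rename) or a single string used as a prefix that replaces the input column's stem. A standalone Polars sketch of both renaming rules (the `result` values here are illustrative, not taken from the package):

```python
import polars as pl

# A multi-column result such as a one-to-many transformer might
# return for an input column named "color"
result = pl.Series("color", ["blue", "red", "blue"]).to_dummies()

# 1) List spec: positional rename; lengths must match
renamed = result.rename(dict(zip(result.columns, ["is_blue", "is_red"])))

# 2) String spec: swap the "color" stem for the prefix, keeping each suffix
prefix = "colour"
renamed_by_prefix = result.rename(
    {col: f"{prefix}{col[len('color'):]}" for col in result.columns}
)

print(renamed.columns)            # ['is_blue', 'is_red']
print(renamed_by_prefix.columns)  # ['colour_blue', 'colour_red']
```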
```diff
@@ -413,6 +429,27 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)
 
 
+class AutoDummifier:
+    """
+    A transformer that performs one-hot encoding on a categorical column,
+    automatically detecting the unique categories from the data.
+    """
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+                Column names are auto-generated by Polars as
+                '{original_col_name}_{category_value}'.
+        """
+        # Ensure the column is treated as a string before creating dummies
+        return column.cast(pl.Utf8).to_dummies()
+
+
 class MultiBinaryDummifier:
     """
     A one-to-many transformer that creates multiple binary columns from a single
```
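Usage reduces to calling an instance on a Series; categories are discovered from the data. A short sketch (assumes the package is installed so `AutoDummifier` is importable; column order may vary):

```python
import polars as pl
from ml_tools.ETL_engineering import AutoDummifier

colors = pl.Series("color", ["blue", "red", "blue", "green"])
dummies = AutoDummifier()(colors)

# One 0/1 indicator column per detected category, named
# '{original_col_name}_{category_value}'
print(dummies.columns)  # e.g. ['color_blue', 'color_green', 'color_red']
```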
ml_tools/data_exploration.py
CHANGED
```diff
@@ -5,10 +5,12 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from typing import Union, Literal, Dict, Tuple, List, Optional
 from pathlib import Path
+import re
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import
+from .utilities import save_dataframe
 
 
 # Keep track of all available tools, show using `info()`
@@ -18,6 +20,7 @@ __all__ = [
     "drop_rows_with_missing_data",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "drop_macro",
     "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
@@ -155,7 +158,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
 
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
-
+    Returns a table of columns with missing values, showing both the count and
     percentage of missing entries per column.
 
    Parameters:
```
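The restored docstring describes a count-plus-percentage report of missing values. A hypothetical pandas re-implementation of that summary, to make the contract concrete (not the package's exact code):

```python
import pandas as pd

def null_summary(df: pd.DataFrame, round_digits: int = 2) -> pd.DataFrame:
    # One row per column that has missing entries: absolute count
    # and percentage of the column that is null
    counts = df.isnull().sum()
    table = pd.DataFrame({
        "null_count": counts,
        "null_percent": (counts / len(df) * 100).round(round_digits),
    })
    return table[table["null_count"] > 0]
```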
```diff
@@ -221,6 +224,81 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     return df
 
 
+def drop_macro(df: pd.DataFrame,
+               log_directory: Union[str,Path],
+               targets: list[str],
+               skip_targets: bool=False,
+               threshold: float=0.7) -> pd.DataFrame:
+    """
+    Iteratively removes rows and columns with excessive missing data.
+
+    This function performs a comprehensive cleaning cycle on a DataFrame. It
+    repeatedly drops columns with constant values, followed by rows and columns that exceed
+    a specified threshold of missing values. The process continues until the
+    DataFrame's dimensions stabilize, ensuring that the interdependency between
+    row and column deletions is handled.
+
+    Initial and final missing data reports are saved to the specified log directory.
+
+    Args:
+        df (pd.DataFrame): The input pandas DataFrame to be cleaned.
+        log_directory (Union[str, Path]): Path to the directory where the
+            'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
+            will be saved.
+        targets (list[str]): A list of column names to be treated as target
+            variables. This list guides the row-dropping logic.
+        skip_targets (bool, optional): If True, the columns listed in `targets`
+            will be exempt from being dropped, even if they exceed the missing
+            data threshold.
+        threshold (float, optional): The proportion of missing data required to drop
+            a row or column. For example, 0.7 means a row/column will be
+            dropped if 70% or more of its data is missing.
+
+    Returns:
+        pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
+    """
+    # make a deep copy to work with
+    df_clean = df.copy()
+
+    # Log initial state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_start")
+
+    # Clean cycles for rows and columns
+    master = True
+    while master:
+        # track rows and columns
+        initial_rows, initial_columns = df_clean.shape
+
+        # drop constant columns
+        df_clean = drop_constant_columns(df=df_clean)
+
+        # clean rows
+        df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
+
+        # clean columns
+        if skip_targets:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
+        else:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
+
+        # cleaned?
+        remaining_rows, remaining_columns = df_clean.shape
+        if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
+            master = False
+
+    # log final state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_final")
+
+    # return cleaned dataframe
+    return df_clean
+
+
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
     """
     Splits a DataFrame's columns into features and targets.
```
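A hedged usage sketch for the new `drop_macro` (toy data; assumes the package is installed and that `save_dataframe` can write into a `logs/` directory):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_macro

df = pd.DataFrame({
    "target":     [1.0, 0.0, 1.0, np.nan, 1.0],
    "mostly_nan": [np.nan, np.nan, np.nan, np.nan, 2.0],  # 80% missing -> dropped
    "constant":   [5, 5, 5, 5, 5],                        # constant -> dropped
    "feature":    [0.1, 0.2, 0.3, 0.4, 0.5],
})

clean = drop_macro(df, log_directory="logs", targets=["target"],
                   skip_targets=True, threshold=0.7)
print(clean.columns.tolist())  # e.g. ['target', 'feature']
```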
Files without changes: WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt