dragon-ml-toolbox 3.2.1__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (30)
  1. {dragon_ml_toolbox-3.2.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.4.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ETL_engineering.py +64 -15
  4. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/VIF_factor.py +2 -2
  5. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/data_exploration.py +42 -0
  6. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/utilities.py +52 -6
  7. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/pyproject.toml +1 -1
  8. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/LICENSE +0 -0
  9. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/LICENSE-THIRD-PARTY.md +0 -0
  10. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/README.md +0 -0
  11. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  12. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  13. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  14. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/GUI_tools.py +0 -0
  16. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/MICE_imputation.py +0 -0
  17. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_callbacks.py +0 -0
  18. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_trainer.py +0 -0
  20. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_tutorial.py +0 -0
  21. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/PSO_optimization.py +0 -0
  22. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/RNN_forecast.py +0 -0
  23. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/__init__.py +0 -0
  24. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  25. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/_pytorch_models.py +0 -0
  26. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/datasetmaster.py +0 -0
  27. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ensemble_learning.py +0 -0
  28. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/handle_excel.py +0 -0
  29. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/logger.py +0 -0
  30. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.2.1
+Version: 3.4.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.2.1
+Version: 3.4.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ETL_engineering.py

@@ -12,6 +12,7 @@ __all__ = [
     "TransformationRecipe",
     "DataProcessor",
     "BinaryTransformer",
+    "MultiBinaryDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -400,12 +401,72 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)
 
 
+class MultiBinaryDummifier:
+    """
+    A one-to-many transformer that creates multiple binary columns from a single
+    text column based on a list of keywords.
+
+    For each keyword provided, this transformer generates a corresponding column
+    with a value of 1 if the keyword is present in the input string, and 0 otherwise.
+    It is designed to be used within the DataProcessor pipeline.
+
+    Args:
+        keywords (List[str]):
+            A list of strings, where each string is a keyword to search for. A separate
+            binary column will be created for each keyword.
+        case_insensitive (bool):
+            If True, keyword matching ignores case. Defaults to True.
+    """
+    def __init__(self, keywords: List[str], case_insensitive: bool = True):
+        if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
+            raise TypeError("The 'keywords' argument must be a list of strings.")
+        if not keywords:
+            raise ValueError("The 'keywords' list cannot be empty.")
+
+        self.keywords = keywords
+        self.case_insensitive = case_insensitive
+
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the dummification logic.
+
+        Args:
+            column (pl.Series): The input Polars Series to transform.
+
+        Returns:
+            pl.DataFrame: A DataFrame where each column corresponds to a keyword.
+        """
+        # Ensure the input is treated as a string, preserving nulls
+        str_column = column.cast(pl.Utf8)
+
+        output_expressions = []
+        for i, keyword in enumerate(self.keywords):
+            # Escape keyword to treat it as a literal, not a regex pattern
+            base_pattern = re.escape(keyword)
+
+            # Add case-insensitivity flag `(?i)` if needed
+            pattern = f"(?i){base_pattern}" if self.case_insensitive else base_pattern
+
+            # Create the binary expression
+            expr = (
+                pl.when(str_column.is_null())
+                .then(None)  # Propagate nulls from original column
+                .when(str_column.str.contains(pattern))
+                .then(pl.lit(1, dtype=pl.UInt8))
+                .otherwise(pl.lit(0, dtype=pl.UInt8))
+                .alias(f"col_{i}")  # Generic name for DataProcessor
+            )
+            output_expressions.append(expr)
+
+        return pl.select(output_expressions)
+
+
 class KeywordDummifier:
     """
     A configurable transformer that creates one-hot encoded columns based on
     keyword matching in a Polars Series.
 
-    Instantiate this class with keyword configurations. The instance can be used as a 'transform' callable compatible with the `TransformationRecipe`.
+    Operates on a "first match wins" principle.
 
     Args:
         group_names (List[str]):
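
A minimal usage sketch of the new MultiBinaryDummifier, assuming it is imported from ml_tools.ETL_engineering and called directly on a Polars Series (input values and keywords are illustrative; inside a DataProcessor pipeline the generic col_0/col_1 names are presumably renamed downstream by the recipe):

import polars as pl
from ml_tools.ETL_engineering import MultiBinaryDummifier

colors = pl.Series("colors", ["red and blue", "green", None, "BLUE"])
dummifier = MultiBinaryDummifier(keywords=["red", "blue"], case_insensitive=True)

# One UInt8 column per keyword: 1 if the keyword is present, 0 otherwise,
# with nulls propagated from the input column.
dummies = dummifier(colors)
print(dummies)  # columns "col_0" (red) and "col_1" (blue)
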
@@ -417,17 +478,14 @@ class KeywordDummifier:
             `group_name` at the same index and contains the keywords to search for.
         case_insensitive (bool):
             If True, keyword matching ignores case.
-        drop_empty (bool):
-            If True, columns that contain no positive matches (all zeros) will be dropped from the final output.
     """
-    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True, drop_empty: bool = True):
+    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
         if len(group_names) != len(group_keywords):
             raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
 
         self.group_names = group_names
         self.group_keywords = group_keywords
         self.case_insensitive = case_insensitive
-        self.drop_empty = drop_empty
 
     def __call__(self, column: pl.Series) -> pl.DataFrame:
         """
@@ -474,16 +532,7 @@ class KeywordDummifier:
             # If a group had no matches, create a column of zeros
             final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
 
-        # First, create a full DataFrame with all potential columns
-        result_df = pl.DataFrame(final_columns)
-
-        # If drop_empty is True, filter out all-zero columns
-        if self.drop_empty:
-            # A column is kept if its sum is greater than 0
-            cols_to_keep = [col for col in result_df if col.sum() > 0]
-            return result_df.select(cols_to_keep)
-
-        return result_df
+        return pl.DataFrame(final_columns)
 
 
 class NumberExtractor:
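
Note that KeywordDummifier no longer drops all-zero groups itself; per the docstring of the new drop_zero_only_columns (in the data_exploration.py hunks further down), empty dummy columns can be cleaned up afterwards. A hedged sketch, assuming the Polars output is converted to pandas first (to_pandas() requires pyarrow; group names and keywords are illustrative):

import polars as pl
from ml_tools.ETL_engineering import KeywordDummifier
from ml_tools.data_exploration import drop_zero_only_columns

dummifier = KeywordDummifier(
    group_names=["is_metal", "is_polymer"],
    group_keywords=[["steel", "iron"], ["nylon"]],
)
one_hot = dummifier(pl.Series("material", ["steel rod", "cast iron", "rubber"]))

# Groups with no positive matches now survive as all-zero columns;
# drop them on the pandas side after conversion.
cleaned = drop_zero_only_columns(one_hot.to_pandas())
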
ml_tools/VIF_factor.py

@@ -35,7 +35,7 @@ def compute_vif(
     Args:
         df (pd.DataFrame): The input DataFrame.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
@@ -194,7 +194,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
         output_plot_directory (str | Path): Save plots to this directory.
         output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         fontsize (int): Base fontsize to scale title and labels on hte plot.
 
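
The corrected docstrings make the precedence explicit: ignore_columns is skipped whenever use_columns is provided. A rough call sketch, assuming the remaining compute_vif parameters keep their defaults (the DataFrame and column names are illustrative):

import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                   "x2": [2.0, 4.1, 5.9, 8.2],
                   "sample_id": [1, 2, 3, 4]})

# Whitelist the columns to test; ignore_columns would be skipped here.
compute_vif(df, use_columns=["x1", "x2"])

# Or exclude a few columns and test every other numeric column.
compute_vif(df, ignore_columns=["sample_id"])
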
ml_tools/data_exploration.py

@@ -15,6 +15,7 @@ import re
 # Keep track of all available tools, show using `info()`
 __all__ = [
     "summarize_dataframe",
+    "drop_zero_only_columns",
     "drop_rows_with_missing_data",
     "split_features_targets",
     "show_null_columns",
@@ -61,6 +62,47 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
 
 
+def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
+    """
+    Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.
+
+    This utility is useful for cleaning data after dummification steps that may result in empty columns.
+
+    Args:
+        df (pd.DataFrame):
+            The pandas DataFrame to clean.
+
+    Returns:
+        pd.DataFrame:
+            A new DataFrame with the empty columns removed.
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("Input must be a pandas DataFrame.")
+
+    original_columns = set(df.columns)
+
+    cols_to_keep = []
+    for col_name in df.columns:
+        column = df[col_name]
+
+        # Keep any column that is not numeric by default
+        if not is_numeric_dtype(column):
+            cols_to_keep.append(col_name)
+            continue
+
+        # For numeric columns, check if there's at least one non-zero value.
+        if (column != 0).any():
+            cols_to_keep.append(col_name)
+
+    dropped_columns = original_columns - set(cols_to_keep)
+    if dropped_columns and verbose:
+        print(f"Dropped {len(dropped_columns)} columns:")
+        for dropped_column in dropped_columns:
+            print(f"  {dropped_column}")
+
+    return df[cols_to_keep]
+
+
 def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
     Drops rows from the DataFrame using a two-stage strategy:
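
A small sketch of the new helper on a toy frame (column names are illustrative): numeric columns with no non-zero value are dropped, while non-numeric columns are always kept.

import pandas as pd
from ml_tools.data_exploration import drop_zero_only_columns

df = pd.DataFrame({
    "dose": [0.5, 1.0, 0.0],    # kept: contains non-zero values
    "empty_dummy": [0, 0, 0],   # dropped: zeros only
    "label": ["a", "b", "c"],   # kept: non-numeric
})

cleaned = drop_zero_only_columns(df)  # prints the dropped names when verbose=True
print(list(cleaned.columns))          # expected: ['dose', 'label']
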
ml_tools/utilities.py

@@ -24,7 +24,8 @@ __all__ = [
     "threshold_binary_values_batch",
     "serialize_object",
     "deserialize_object",
-    "distribute_datasets_by_target"
+    "distribute_datasets_by_target",
+    "train_dataset_orchestrator"
 ]
 
 
@@ -497,7 +498,7 @@ def threshold_binary_values_batch(
     return np.hstack([cont_part, bin_part])
 
 
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
@@ -505,9 +506,6 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         obj (Any) : The Python object to serialize.
         save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
-
-    Returns:
-        (Path | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
         save_path = make_fullpath(save_dir, make=True)
@@ -526,7 +524,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
     else:
         if verbose:
             print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
-        return full_path
+        return None
 
 
 def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
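
Since serialize_object no longer returns the saved Path, callers that kept the old return value should now build the path from save_dir and filename themselves. A brief sketch of the 3.4.0 behavior (the object and directory name are illustrative):

from ml_tools.utilities import serialize_object

stats = {"mean": 0.0, "std": 1.0}
result = serialize_object(stats, save_dir="artifacts", filename="scaler_stats")
assert result is None  # the full file path is no longer returned
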
@@ -597,6 +595,54 @@ def distribute_datasets_by_target(
         yield target, subset
 
 
+def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
+                               target_columns: list[str],
+                               save_dir: Union[str,Path],
+                               safe_mode: bool=False):
+    """
+    Orchestrates the creation of single-target datasets from multiple directories each with a variable number of CSV datasets.
+
+    This function iterates through a list of directories, finds all CSV files,
+    and splits each dataframe based on the provided target columns. Each resulting
+    single-target dataframe is then saved to a specified directory.
+
+    Parameters
+    ----------
+    list_of_dirs : list[str | Path]
+        A list of directory paths where the source CSV files are located.
+    target_columns : list[str]
+        A list of column names to be used as targets for splitting the datasets.
+    save_dir : str | Path
+        The directory where the newly created single-target datasets will be saved.
+    safe_mode : bool
+        If True, prefixes the saved filename with the source directory name to prevent overwriting files with the same name from different sources.
+    """
+    all_dir_paths: list[Path] = list()
+    for dir in list_of_dirs:
+        dir_path = make_fullpath(dir)
+        if not dir_path.is_dir():
+            raise IOError(f"'{dir}' is not a directory.")
+        all_dir_paths.append(dir_path)
+
+    # main loop
+    total_saved = 0
+    for df_dir in all_dir_paths:
+        for df_name, df_path in list_csv_paths(df_dir).items():
+            try:
+                for target_name, df in distribute_datasets_by_target(df_or_path=df_path, target_columns=target_columns, verbose=False):
+                    if safe_mode:
+                        filename = df_dir.name + '_' + target_name + '_' + df_name
+                    else:
+                        filename = target_name + '_' + df_name
+                    save_dataframe(df=df, save_dir=save_dir, filename=filename)
+                    total_saved += 1
+            except Exception as e:
+                print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
+                continue
+
+    print(f"{total_saved} single-target datasets were created.")
+
+
 class LogKeys:
     """
     Used for ML scripts only
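
A minimal sketch of the new orchestrator, assuming two source directories of CSV files and two target columns (all paths and column names are illustrative):

from ml_tools.utilities import train_dataset_orchestrator

train_dataset_orchestrator(
    list_of_dirs=["data/batch_A", "data/batch_B"],
    target_columns=["target_strength", "target_density"],
    save_dir="data/single_target",
    safe_mode=True,  # prefix filenames with the source directory name to avoid collisions
)
# Prints how many single-target datasets were written to save_dir.
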
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.2.1"
+version = "3.4.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }