dragon-ml-toolbox 3.3.0__tar.gz → 3.5.0__tar.gz

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



Files changed (30)
  1. {dragon_ml_toolbox-3.3.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.5.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ETL_engineering.py +88 -76
  4. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/VIF_factor.py +2 -2
  5. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/utilities.py +52 -6
  6. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/LICENSE +0 -0
  8. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/README.md +0 -0
  10. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_evaluation.py +0 -0
  18. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_trainer.py +0 -0
  19. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ML_tutorial.py +0 -0
  20. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/PSO_optimization.py +0 -0
  21. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/RNN_forecast.py +0 -0
  22. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/__init__.py +0 -0
  23. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  24. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/_pytorch_models.py +0 -0
  25. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/data_exploration.py +0 -0
  26. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/datasetmaster.py +0 -0
  27. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/ensemble_learning.py +0 -0
  28. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/handle_excel.py +0 -0
  29. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/ml_tools/logger.py +0 -0
  30. {dragon_ml_toolbox-3.3.0 → dragon_ml_toolbox-3.5.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.3.0
+Version: 3.5.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.3.0
+Version: 3.5.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -2,7 +2,6 @@ import polars as pl
 import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from .utilities import _script_info
-import pandas as pd
 from .logger import _LOGGER
 
 
@@ -24,124 +23,137 @@ __all__ = [
 ]
 
 ########## EXTRACT and CLEAN ##########
-
 class ColumnCleaner:
     """
-    Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
-    Supports sub-string replacements and case-insensitivity.
+    A configuration object that defines cleaning rules for a single Polars DataFrame column.
+
+    This class holds a dictionary of regex-to-replacement rules, the target column name,
+    and the case-sensitivity setting. It is intended to be used with the DataFrameCleaner.
 
     Notes:
-        - Write separate, specific rules for each case. Don't combine patterns with an "OR".
-        - Define rules from most specific to more general to create a fallback system.
-        - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
-
+        - Define rules from most specific to more general to create a fallback system.
+        - Beware of chain replacements (rules matching strings that have already been
+          changed by a previous rule in the same cleaner).
+
     Args:
+        column_name (str):
+            The name of the column to be cleaned.
         rules (Dict[str, str]):
            A dictionary of regex patterns to replacement strings. Can use
-            backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
+            backreferences (e.g., r'$1 $2') for captured groups. Note that Polars
+            uses a '$' prefix for backreferences.
        case_insensitive (bool):
-            If True, regex matching ignores case.
+            If True (default), regex matching ignores case.
+
+    ## Usage Example
+
+    ```python
+    phone_rules = {
+        # Matches (123) 456-7890 and reformats to 123-456-7890
+        r'\((\d{3})\)\s*(\d{3})-(\d{4})': r'$1-$2-$3'
+    }
+
+    phone_cleaner = ColumnCleaner(column_name='phone_number', rules=phone_rules)
+
+    # This object would then be passed to a DataFrameCleaner.
+    ```
     """
-    def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
+    def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
+        if not isinstance(column_name, str) or not column_name:
+            raise TypeError("The 'column_name' must be a non-empty string.")
         if not isinstance(rules, dict):
             raise TypeError("The 'rules' argument must be a dictionary.")
 
-        # Validate regex patterns
+        # Validate each regex pattern for correctness
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
             except re.error as e:
                 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
 
+        self.column_name = column_name
         self.rules = rules
         self.case_insensitive = case_insensitive
 
-    def clean(self, series: pd.Series) -> pd.Series:
-        """
-        Applies the standardization rules sequentially to the provided Series.
-
-        Args:
-            series (pd.Series): The pandas Series to clean.
-
-        Returns:
-            pd.Series: A new Series with the regex replacements applied.
-        """
-        cleaned_series = series.astype(str)
-
-        # Set the regex flags based on the case_insensitive setting
-        flags = re.IGNORECASE if self.case_insensitive else 0
-
-        # Sequentially apply each regex rule
-        for pattern, replacement in self.rules.items():
-            cleaned_series = cleaned_series.str.replace(
-                pattern,
-                replacement,
-                regex=True,
-                flags=flags
-            )
-
-        return cleaned_series
-
 
 class DataFrameCleaner:
     """
-    Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
-
-    Chosen case-sensitivity is applied to all columns.
-
-    Notes:
-        - Write separate, specific rules for each case. Don't combine patterns with an "OR".
-        - Define rules from most specific to more general to create a fallback system.
-        - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
+    Orchestrates cleaning multiple columns in a Polars DataFrame.
+
+    This class takes a list of ColumnCleaner objects and applies their defined
+    rules to the corresponding columns of a DataFrame using high-performance
+    Polars expressions.
 
     Args:
-        rules (Dict[str, Dict[str, str]]):
-            A nested dictionary where each top-level key is a column name,
-            and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
+        cleaners (List[ColumnCleaner]):
+            A list of ColumnCleaner configuration objects.
+
+    Raises:
+        TypeError: If 'cleaners' is not a list or contains non-ColumnCleaner objects.
+        ValueError: If multiple ColumnCleaner objects target the same column.
     """
-    def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
-        if not isinstance(rules, dict):
-            raise TypeError("The 'rules' argument must be a nested dictionary.")
-
-        for col_name, col_rules in rules.items():
-            if not isinstance(col_rules, dict):
+    def __init__(self, cleaners: List[ColumnCleaner]):
+        if not isinstance(cleaners, list):
+            raise TypeError("The 'cleaners' argument must be a list of ColumnCleaner objects.")
+
+        seen_columns = set()
+        for cleaner in cleaners:
+            if not isinstance(cleaner, ColumnCleaner):
                 raise TypeError(
-                    f"The value for column '{col_name}' must be a dictionary "
-                    f"of rules, but got type {type(col_rules).__name__}."
+                    f"All items in 'cleaners' list must be ColumnCleaner objects, "
+                    f"but found an object of type {type(cleaner).__name__}."
                 )
-
-        self.rules = rules
-        self.case_insensitive = case_insensitive
+            if cleaner.column_name in seen_columns:
+                raise ValueError(
+                    f"Duplicate ColumnCleaner found for column '{cleaner.column_name}'. "
+                    "Each column should only have one cleaner."
+                )
+            seen_columns.add(cleaner.column_name)
 
-    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        self.cleaners = cleaners
+
+    def clean(self, df: pl.DataFrame) -> pl.DataFrame:
         """
-        Applies all defined cleaning rules to the DataFrame.
+        Applies all defined cleaning rules to the Polars DataFrame.
 
         Args:
-            df (pd.DataFrame): The pandas DataFrame to clean.
+            df (pl.DataFrame): The Polars DataFrame to clean.
 
         Returns:
-            pd.DataFrame: A new, cleaned DataFrame.
+            pl.DataFrame: A new, cleaned Polars DataFrame.
+
+        Raises:
+            ValueError: If any columns specified in the cleaners are not found
+                in the input DataFrame.
         """
-        rule_columns = set(self.rules.keys())
+        rule_columns = {c.column_name for c in self.cleaners}
         df_columns = set(df.columns)
-
         missing_columns = rule_columns - df_columns
-
+
         if missing_columns:
-            # Report all missing columns in a single, clear error message
             raise ValueError(
-                f"The following columns specified in the cleaning rules "
+                f"The following columns specified in cleaning rules "
                f"were not found in the DataFrame: {sorted(list(missing_columns))}"
            )
+
+        df_cleaned = df.clone()
 
-        # Start the process
-        df_cleaned = df.copy()
-
-        for column_name, column_rules in self.rules.items():
-            # Create and apply the specific cleaner for the column
-            cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
-            df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
+        # Build and apply a series of expressions for each column
+        for cleaner in self.cleaners:
+            col_name = cleaner.column_name
+
+            # Start with the column, cast to String for replacement operations
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # Sequentially chain 'replace_all' expressions for each rule
+            for pattern, replacement in cleaner.rules.items():
+                final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
+                col_expr = col_expr.str.replace_all(final_pattern, replacement)
+
+            # Execute the expression chain for the column
+            df_cleaned = df_cleaned.with_columns(col_expr.alias(col_name))
+
+        print(f"Cleaned {len(self.cleaners)} columns.")
 
         return df_cleaned
@@ -35,7 +35,7 @@ def compute_vif(
     Args:
         df (pd.DataFrame): The input DataFrame.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
@@ -194,7 +194,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
         output_plot_directory (str | Path): Save plots to this directory.
         output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         fontsize (int): Base fontsize to scale title and labels on hte plot.

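The corrected wording in both docstrings pins down the parameter precedence: `ignore_columns` is skipped whenever `use_columns` is supplied. A hedged sketch of that behavior follows; the full `compute_vif` signature is not visible in this diff, so the call assumes only the parameters listed in the docstring above, and the data is synthetic.

```python
import numpy as np
import pandas as pd
from ml_tools.VIF_factor import compute_vif

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["x1", "x2", "x3"])

# `use_columns` wins: VIF is computed on x1 and x2 only, and the
# overlapping `ignore_columns` entry is skipped, per the fixed docstring.
compute_vif(df, use_columns=["x1", "x2"], ignore_columns=["x2"])
```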
@@ -24,7 +24,8 @@ __all__ = [
     "threshold_binary_values_batch",
     "serialize_object",
     "deserialize_object",
-    "distribute_datasets_by_target"
+    "distribute_datasets_by_target",
+    "train_dataset_orchestrator"
 ]
 
 
@@ -497,7 +498,7 @@
     return np.hstack([cont_part, bin_part])
 
 
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
@@ -505,9 +506,6 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         obj (Any) : The Python object to serialize.
         save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
-
-    Returns:
-        (Path | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
         save_path = make_fullpath(save_dir, make=True)
@@ -526,7 +524,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
     else:
         if verbose:
             print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
-        return full_path
+        return None
 
 
 def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
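This is a small breaking change: `serialize_object` no longer returns the saved `Path`, so callers that captured it must now construct the path themselves. A before/after sketch follows; the `.joblib` suffix is an assumption, since the exact extension appended is not visible in this hunk.

```python
from pathlib import Path
from ml_tools.utilities import serialize_object, deserialize_object

params = {"n_estimators": 300, "max_depth": 8}

# 3.3.0: saved_path = serialize_object(params, "artifacts", "params")
# 3.5.0: the call returns None, so the save location must be derived manually.
serialize_object(params, save_dir="artifacts", filename="params")

# Assumed joblib extension; adjust to whatever suffix the library appends.
restored = deserialize_object(Path("artifacts") / "params.joblib")
```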
@@ -597,6 +595,54 @@ def distribute_datasets_by_target(
         yield target, subset
 
 
+def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
+                               target_columns: list[str],
+                               save_dir: Union[str,Path],
+                               safe_mode: bool=False):
+    """
+    Orchestrates the creation of single-target datasets from multiple directories each with a variable number of CSV datasets.
+
+    This function iterates through a list of directories, finds all CSV files,
+    and splits each dataframe based on the provided target columns. Each resulting
+    single-target dataframe is then saved to a specified directory.
+
+    Parameters
+    ----------
+    list_of_dirs : list[str | Path]
+        A list of directory paths where the source CSV files are located.
+    target_columns : list[str]
+        A list of column names to be used as targets for splitting the datasets.
+    save_dir : str | Path
+        The directory where the newly created single-target datasets will be saved.
+    safe_mode : bool
+        If True, prefixes the saved filename with the source directory name to prevent overwriting files with the same name from different sources.
+    """
+    all_dir_paths: list[Path] = list()
+    for dir in list_of_dirs:
+        dir_path = make_fullpath(dir)
+        if not dir_path.is_dir():
+            raise IOError(f"'{dir}' is not a directory.")
+        all_dir_paths.append(dir_path)
+
+    # main loop
+    total_saved = 0
+    for df_dir in all_dir_paths:
+        for df_name, df_path in list_csv_paths(df_dir).items():
+            try:
+                for target_name, df in distribute_datasets_by_target(df_or_path=df_path, target_columns=target_columns, verbose=False):
+                    if safe_mode:
+                        filename = df_dir.name + '_' + target_name + '_' + df_name
+                    else:
+                        filename = target_name + '_' + df_name
+                    save_dataframe(df=df, save_dir=save_dir, filename=filename)
+                    total_saved += 1
+            except Exception as e:
+                print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
+                continue
+
+    print(f"{total_saved} single-target datasets were created.")
+
+
 class LogKeys:
     """
     Used for ML scripts only
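A short usage sketch of the new `train_dataset_orchestrator`, grounded in the docstring above; the directory layout and target column names are hypothetical.

```python
from ml_tools.utilities import train_dataset_orchestrator

# Scans every CSV in each source directory, splits each one per target
# column, and writes the resulting single-target datasets to save_dir.
train_dataset_orchestrator(
    list_of_dirs=["data/lab_A", "data/lab_B"],
    target_columns=["target_1", "target_2"],
    save_dir="data/single_target",
    safe_mode=True,  # prefix outputs with the source dir name to avoid collisions
)
```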
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.3.0"
+version = "3.5.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }