PyPI - dragon-ml-toolbox - Versions diffs - 3.0.0__tar.gz → 3.2.0__tar.gz - Mend

dragon-ml-toolbox 3.0.0.tar.gz → 3.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (30)

{dragon_ml_toolbox-3.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.0.0
+Version: 3.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.0.0
+Version: 3.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/ETL_engineering.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import polars as pl
 import re
-from typing import Literal, Union, Optional, Any, Callable, List, Dict
+from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from .utilities import _script_info
 import pandas as pd
 from .logger import _LOGGER
@@ -11,6 +11,7 @@ __all__ = [
     "DataFrameCleaner",
     "TransformationRecipe",
     "DataProcessor",
+    "BinaryTransformer",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -25,18 +26,26 @@ __all__ = [
 class ColumnCleaner:
     """
-    Cleans and standardizes a single pandas Series based on a dictionary of regex-to-value replacement rules.
+    Cleans and standardizes a pandas Series by applying regex-to-replacement rules.
+    Supports sub-string replacements and case-insensitivity.
+    Notes:
+    - Write separate, specific rules for each case. Don't combine patterns with an "OR".
+    - Define rules from most specific to more general to create a fallback system.
+    - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
     Args:
         rules (Dict[str, str]):
-            A dictionary where each key is a regular expression pattern and
-            each value is the standardized string to replace matches with.
+            A dictionary of regex patterns to replacement strings. Can use
+            backreferences in the replacement statement (e.g., r'\\1 \\2 \\3 \\4 \\5') for captured groups.
+        case_insensitive (bool):
+            If True, regex matching ignores case.
     """
-    def __init__(self, rules: Dict[str, str]):
+    def __init__(self, rules: Dict[str, str], case_insensitive: bool = True):
         if not isinstance(rules, dict):
             raise TypeError("The 'rules' argument must be a dictionary.")
-        # Validate that all keys are valid regular expressions
+        # Validate regex patterns
         for pattern in rules.keys():
             try:
                 re.compile(pattern)
@@ -44,32 +53,52 @@ class ColumnCleaner:
                 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
         self.rules = rules
+        self.case_insensitive = case_insensitive
     def clean(self, series: pd.Series) -> pd.Series:
         """
-        Applies the standardization rules to the provided Series (requires string data).
+        Applies the standardization rules sequentially to the provided Series.
-        Non-matching values are kept as they are.
         Args:
             series (pd.Series): The pandas Series to clean.
         Returns:
-            pd.Series: A new Series with the values cleaned and standardized.
+            pd.Series: A new Series with the regex replacements applied.
         """
-        return series.astype(str).replace(self.rules, regex=True)
+        cleaned_series = series.astype(str)
+        # Set the regex flags based on the case_insensitive setting
+        flags = re.IGNORECASE if self.case_insensitive else 0
+        # Sequentially apply each regex rule
+        for pattern, replacement in self.rules.items():
+            cleaned_series = cleaned_series.str.replace(
+                pattern,
+                replacement,
+                regex=True,
+                flags=flags
+            )
+        return cleaned_series
 class DataFrameCleaner:
     """
     Orchestrates the cleaning of multiple columns in a pandas DataFrame using a nested dictionary of rules and `ColumnCleaner` objects.
+    Chosen case-sensitivity is applied to all columns.
+    Notes:
+    - Write separate, specific rules for each case. Don't combine patterns with an "OR".
+    - Define rules from most specific to more general to create a fallback system.
+    - Beware of chain replacements (rules matching strings that have already been changed by a previous rule).
     Args:
         rules (Dict[str, Dict[str, str]]):
             A nested dictionary where each top-level key is a column name,
             and its value is a dictionary of regex rules for that column, as expected by `ColumnCleaner`.
     """
-    def __init__(self, rules: Dict[str, Dict[str, str]]):
+    def __init__(self, rules: Dict[str, Dict[str, str]], case_insensitive: bool = True):
         if not isinstance(rules, dict):
             raise TypeError("The 'rules' argument must be a nested dictionary.")
@@ -81,6 +110,7 @@ class DataFrameCleaner:
                 )
         self.rules = rules
+        self.case_insensitive = case_insensitive
     def clean(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -109,13 +139,13 @@ class DataFrameCleaner:
         for column_name, column_rules in self.rules.items():
             # Create and apply the specific cleaner for the column
-            cleaner = ColumnCleaner(rules=column_rules)
+            cleaner = ColumnCleaner(rules=column_rules, case_insensitive=self.case_insensitive)
             df_cleaned[column_name] = cleaner.clean(df_cleaned[column_name])
         return df_cleaned
-############ TRANSFORM ####################
+############ TRANSFORM MAIN ####################
 # Magic word for rename-only transformation
 _RENAME = "rename"
@@ -300,6 +330,75 @@ class DataProcessor:
         """
         print(self)
+############ TRANSFORMERS ####################
+class BinaryTransformer:
+    """
+    A transformer that maps string values to a binary 1 or 0 based on keyword matching.
+    Must supply a list of keywords for either the 'true' case (1) or the 'false' case (0), but not both.
+    Args:
+        true_keywords (List[str] | None):
+            If a string contains any of these keywords, the output is 1, otherwise 0.
+        false_keywords (List[str] | None):
+            If a string contains any of these keywords, the output is 0, otherwise 1.
+    """
+    def __init__(
+        self,
+        true_keywords: Optional[List[str]] = None,
+        false_keywords: Optional[List[str]] = None,
+        case_insensitive: bool = True,
+    ):
+        # --- Validation: Enforce one and only one option ---
+        if true_keywords is not None and false_keywords is not None:
+            raise ValueError(
+                "Provide either 'true_keywords' or 'false_keywords', but not both."
+            )
+        if true_keywords is None and false_keywords is None:
+            raise ValueError(
+                "You must provide either 'true_keywords' or 'false_keywords'."
+            )
+        # --- Configuration ---
+        self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords # type: ignore
+        if not self.keywords:
+            raise ValueError("Keyword list cannot be empty.")
+        self.mode: str = "true_mode" if true_keywords is not None else "false_mode"
+        # --- Create the regex string pattern ---
+        # Escape keywords to treat them as literals
+        base_pattern = "|".join(re.escape(k) for k in self.keywords)
+        # For polars, add case-insensitivity flag `(?i)` to the pattern string itself
+        if case_insensitive:
+            self.pattern = f"(?i){base_pattern}"
+        else:
+            self.pattern = base_pattern
+    def __call__(self, column: pl.Series) -> pl.Series:
+        """
+        Applies the binary mapping logic to the input column.
+        Args:
+            column (pl.Series): The input Polars Series of string data.
+        Returns:
+            pl.Series: A new Series of type UInt8 containing 1s and 0s.
+        """
+        # Create a boolean Series: True if any keyword is found, else False
+        contains_keyword = column.str.contains(self.pattern)
+        # Apply logic and cast directly to integer type
+        if self.mode == "true_mode":
+            # True -> 1, False -> 0
+            return contains_keyword.cast(pl.UInt8)
+        else: # false_mode
+            # We want the inverse: True -> 0, False -> 1
+            return (~contains_keyword).cast(pl.UInt8)
 class KeywordDummifier:
     """
@@ -316,13 +415,16 @@ class KeywordDummifier:
         group_keywords (List[List[str]]):
             A list of lists of strings. Each inner list corresponds to a
             `group_name` at the same index and contains the keywords to search for.
+        case_insensitive (bool):
+            If True, keyword matching ignores case.
     """
-    def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
+    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
         if len(group_names) != len(group_keywords):
             raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
         self.group_names = group_names
         self.group_keywords = group_keywords
+        self.case_insensitive = case_insensitive
     def __call__(self, column: pl.Series) -> pl.DataFrame:
         """
@@ -336,9 +438,18 @@ class KeywordDummifier:
         """
         column = column.cast(pl.Utf8)
-        categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
+        categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
         for name, keywords in zip(self.group_names, self.group_keywords):
-            pattern = "|".join(re.escape(k) for k in keywords)
+            # Create the base regex pattern by escaping and joining keywords
+            base_pattern = "|".join(re.escape(k) for k in keywords)
+            # Add the case-insensitive flag `(?i)` to the pattern string
+            if self.case_insensitive:
+                pattern = f"(?i){base_pattern}"
+            else:
+                pattern = base_pattern
             categorize_expr = categorize_expr.when(
                 column.str.contains(pattern)
             ).then(pl.lit(name))
@@ -357,6 +468,7 @@ class KeywordDummifier:
                     df_with_dummies.get_column(dummy_col_name).alias(name)
                 )
             else:
+                # If a group had no matches, create a column of zeros
                 final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
         return pl.DataFrame(final_columns)
@@ -632,33 +744,42 @@ class RegexMapper:
     "first match wins" logic makes the order of the mapping important.
     Args:
-        mapping (Dict[str, Union[int, float]]):
+        mapping (Dict[str, [int | float]]):
             An ordered dictionary where keys are regex patterns and values are
             the numbers to map to if the pattern is found.
-        unseen_value (Optional[Union[int, float]], optional):
+        unseen_value (int | float | None):
             The numerical value to use for strings that do not match any
-            of the regex patterns. If None (default), unseen values are
-            mapped to null.
+            of the regex patterns. If None, unseen values are mapped to null.
+        case_insensitive (bool):
+            If True , the regex matching for all patterns will ignore case.
     """
     def __init__(
         self,
         mapping: Dict[str, Union[int, float]],
         unseen_value: Optional[Union[int, float]] = None,
+        case_insensitive: bool = True,
     ):
         # --- Validation ---
         if not isinstance(mapping, dict):
             raise TypeError("The 'mapping' argument must be a dictionary.")
+        self.unseen_value = unseen_value
+        # --- Process and validate patterns ---
+        # Process patterns here to be more efficient, avoiding reprocessing on every __call__.
+        self.processed_mapping: List[Tuple[str, Union[int, float]]] = []
         for pattern, value in mapping.items():
+            final_pattern = f"(?i){pattern}" if case_insensitive else pattern
+            # Validate the final pattern that will actually be used by Polars
             try:
-                re.compile(pattern)
+                re.compile(final_pattern)
             except re.error as e:
-                raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
+                raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
             if not isinstance(value, (int, float)):
                 raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
-        self.mapping = mapping
-        self.unseen_value = unseen_value
+            self.processed_mapping.append((final_pattern, value))
     def __call__(self, column: pl.Series) -> pl.Series:
         """
@@ -671,22 +792,20 @@ class RegexMapper:
             pl.Series: A new Series with strings mapped to numbers based on
                        the first matching regex pattern.
         """
-        # Ensure the column is treated as a string for matching
-        str_column = column.cast(pl.Utf8)
+        # pl.String is the modern alias for pl.Utf8
+        str_column = column.cast(pl.String)
-        # Build the when/then/otherwise chain from the inside out.
-        # Start with the final fallback value for non-matches.
+        # Start with the fallback value for non-matches.
         mapping_expr = pl.lit(self.unseen_value)
-        # Iterate through the mapping in reverse to construct the nested expression
-        for pattern, value in reversed(list(self.mapping.items())):
+        # Iterate through the processed mapping in reverse to construct the nested expression
+        for pattern, value in reversed(self.processed_mapping):
             mapping_expr = (
                 pl.when(str_column.str.contains(pattern))
                 .then(pl.lit(value))
                 .otherwise(mapping_expr)
             )
-        # Execute the complete expression chain and return the resulting Series
         return pl.select(mapping_expr).to_series()

{dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/ml_tools/data_exploration.py RENAMED Viewed

@@ -587,14 +587,14 @@ def standardize_percentages(
     Standardizes numeric columns containing mixed-format percentages.
     This function cleans columns where percentages might be entered as whole
-    numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
+    numbers (55) and as proportions (0.55). It assumes values
     between 0 and 1 are proportions and multiplies them by 100.
     Args:
         df (pd.Dataframe): The input pandas DataFrame.
         columns (list[str]): A list of column names to standardize.
         treat_one_as_proportion (bool):
-            - If True (default): The value `1` is treated as a proportion and converted to `100`.
+            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
             - If False: The value `1` is treated as `1%`.
         round_digits (int): The number of decimal places to round the final result to.

{dragon_ml_toolbox-3.0.0 → dragon_ml_toolbox-3.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.0.0"
+version = "3.2.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }