PyPI - dragon-ml-toolbox - Versions diffs - 2.2.0__tar.gz → 2.2.1__tar.gz - Mend

dragon-ml-toolbox 2.2.0 (tar.gz) → 2.2.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (26)

{dragon_ml_toolbox-2.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.2.0
+Version: 2.2.1
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.2.0
+Version: 2.2.1
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/ETL_engineering.py RENAMED Viewed

@@ -101,7 +101,7 @@ class DataProcessor:
             raise TypeError("The recipe must be an instance of TransformationRecipe.")
         if len(recipe) == 0:
             raise ValueError("The recipe cannot be empty.")
-        self.recipe = recipe
+        self._recipe = recipe
     def transform(self, df: pl.DataFrame) -> pl.DataFrame:
         """
@@ -109,7 +109,7 @@ class DataProcessor:
         """
         processed_columns = []
         # Recipe object is iterable
-        for step in self.recipe:
+        for step in self._recipe:
             input_col_name = step["input_col"]
             output_col_spec = step["output_col"]
             transform_action = step["transform"]
@@ -154,6 +154,49 @@ class DataProcessor:
             return pl.DataFrame()
         return pl.DataFrame(processed_columns)
+    def __str__(self) -> str:
+        """
+        Provides a detailed, human-readable string representation of the
+        entire processing pipeline.
+        """
+        header = "DataProcessor Pipeline"
+        divider = "-" * len(header)
+        num_steps = len(self._recipe)
+        lines = [
+            header,
+            divider,
+            f"Number of steps: {num_steps}\n"
+        ]
+        if num_steps == 0:
+            lines.append("No transformation steps defined.")
+            return "\n".join(lines)
+        for i, step in enumerate(self._recipe, 1):
+            transform_action = step["transform"]
+            # Get a clean name for the transformation action
+            if transform_action == _RENAME: # "rename"
+                transform_name = "Rename"
+            else:
+                # This works for both functions and class instances
+                transform_name = type(transform_action).__name__
+            lines.append(f"[{i}] Input: '{step['input_col']}'")
+            lines.append(f"    - Transform: {transform_name}")
+            lines.append(f"    - Output(s): {step['output_col']}")
+            if i < num_steps:
+                lines.append("") # Add a blank line between steps
+        return "\n".join(lines)
+    def inspect(self) -> None:
+        """
+        Prints the detailed string representation of the pipeline to the console.
+        """
+        print(self)
 class KeywordDummifier:
@@ -407,7 +450,22 @@ class CategoryMapper:
             pl.Series: A new Series with categories mapped to numbers.
         """
         # Ensure the column is treated as a string for matching keys
-        return column.cast(pl.Utf8).map_dict(self.mapping, default=self.default_value)
+        str_column = column.cast(pl.Utf8)
+        # Create a list of 'when/then' expressions, one for each mapping
+        mapping_expressions = [
+            pl.when(str_column == from_val).then(pl.lit(to_val))
+            for from_val, to_val in self.mapping.items()
+        ]
+        # Use coalesce to find the first non-null value.
+        # The default_value acts as the final fallback.
+        final_expr = pl.coalesce(
+            *mapping_expressions, # Unpack the list of expressions
+            pl.lit(self.default_value)
+        )
+        return pl.select(final_expr).to_series()
 class ValueBinner:

{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/ml_tools/data_exploration.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.api.types import is_numeric_dtype
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -24,7 +25,8 @@ __all__ = [
     "plot_value_distributions",
     "clip_outliers_single",
     "clip_outliers_multi",
-    "match_and_filter_columns_by_regex"
+    "match_and_filter_columns_by_regex",
+    "standardize_percentages"
 ]
@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
     return filtered_df, matched_columns
+def standardize_percentages(
+    df: pd.DataFrame,
+    columns: list[str],
+    treat_one_as_proportion: bool = True,
+    round_digits: int = 2
+) -> pd.DataFrame:
+    """
+    Standardizes numeric columns containing mixed-format percentages.
+    This function cleans columns where percentages might be entered as whole
+    numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
+    between 0 and 1 are proportions and multiplies them by 100.
+    Args:
+        df (pd.Dataframe): The input pandas DataFrame.
+        columns (list[str]): A list of column names to standardize.
+        treat_one_as_proportion (bool):
+            - If True (default): The value `1` is treated as a proportion and converted to `100`.
+            - If False: The value `1` is treated as `1%`.
+        round_digits (int): The number of decimal places to round the final result to.
+    Returns:
+        (pd.Dataframe):
+        A new DataFrame with the specified columns cleaned and standardized.
+    """
+    df_copy = df.copy()
+    if df_copy.empty:
+        return df_copy
+    # This helper function contains the core cleaning logic
+    def _clean_value(x: float) -> float:
+        """Applies the standardization rule to a single value."""
+        if pd.isna(x):
+            return x
+        # If treat_one_as_proportion is True, the range for proportions is [0, 1]
+        if treat_one_as_proportion and 0 <= x <= 1:
+            return x * 100
+        # If False, the range for proportions is [0, 1) (1 is excluded)
+        elif not treat_one_as_proportion and 0 <= x < 1:
+            return x * 100
+        # Otherwise, the value is assumed to be a correctly formatted percentage
+        return x
+    for col in columns:
+        # --- Robustness Checks ---
+        if col not in df_copy.columns:
+            print(f"Warning: Column '{col}' not found. Skipping.")
+            continue
+        if not is_numeric_dtype(df_copy[col]):
+            print(f"Warning: Column '{col}' is not numeric. Skipping.")
+            continue
+        # --- Applying the Logic ---
+        # Apply the cleaning function to every value in the column
+        df_copy[col] = df_copy[col].apply(_clean_value)
+        # Round the result
+        df_copy[col] = df_copy[col].round(round_digits)
+    return df_copy
 def _is_notebook():
     return get_ipython() is not None

{dragon_ml_toolbox-2.2.0 → dragon_ml_toolbox-2.2.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "2.2.0"
+version = "2.2.1"
 description = "A collection of tools for data science and machine learning projects"
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }