PyPI - dragon-ml-toolbox - Versions diffs - 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl - Mend

dragon-ml-toolbox 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (8) hide show

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.2.1
+Version: 4.3.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
-dragon_ml_toolbox-4.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-4.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_engineering.py,sha256=rlu0bUekdKREcTR0x1jn_TSEqhxgfq3QU71hy6ZyaD8,39503
+dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=P7HN_e3vfmrOqDDK-IenyRSFQPr0N3V9e2gN75QFVWs,39372
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
 ml_tools/ML_callbacks.py,sha256=0a-Rbr0Xp_B1FNopOKBBmuJ4MqazS5JgDiT7wx1dHvE,13161
@@ -16,7 +16,7 @@ ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_pytorch_models.py,sha256=ewPPsTHgmRPzMMWwObZOdH1vxm2Ij2VWZP38NC6zSH4,10135
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=a3ywSCQT7j5ypR-usnKh2l861d_aVJ93ZRVqxrHsBBw,4112
-ml_tools/data_exploration.py,sha256=rJhvxUqVbEuB_7HG-PfLH3vaA7hrZEtbVHg9QO9VS4A,22837
+ml_tools/data_exploration.py,sha256=T4nO9YSDGvrpom7JELtoQTyg7XTEmvQz-jG0KKxqTRk,23467
 ml_tools/datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
 ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
 ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
@@ -24,7 +24,7 @@ ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,1300
 ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
-dragon_ml_toolbox-4.2.1.dist-info/METADATA,sha256=mzW1BLOxrCKZAoZgqzYRcNhHpO4fTNxDGvUwuF5wG88,6572
-dragon_ml_toolbox-4.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-4.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-4.2.1.dist-info/RECORD,,
+dragon_ml_toolbox-4.3.0.dist-info/METADATA,sha256=7aZO_5P8SDx4tPFTtb3MTAaRgf_vbcOEURaxpT3MGK8,6572
+dragon_ml_toolbox-4.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-4.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-4.3.0.dist-info/RECORD,,

ml_tools/ETL_engineering.py CHANGED Viewed

@@ -3,6 +3,7 @@ import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from ._script_info import _script_info
 from ._logger import _LOGGER
+import warnings
 __all__ = [
@@ -50,7 +51,7 @@ class ColumnCleaner:
     ```python
     id_rules = {
         # Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
-        r'ID[- ](\d+)': r'ID:$1'
+        r'ID[- ](\\d+)': r'ID:$1'
     }
     id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
@@ -700,26 +701,28 @@ class MultiNumberExtractor:
 class RatioCalculator:
     """
-    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and computes the result of the division.
-    Args:
-        regex_pattern (str, optional):
-            The regex pattern to find the numerator and denominator. It MUST
-            contain exactly two capturing groups: the first for the
-            numerator and the second for the denominator. Defaults to a
-            pattern that handles common delimiters like ':' and '/'.
+    A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
+    computes the result of the division. It gracefully handles strings that
+    do not match the pattern by returning null.
     """
     def __init__(
         self,
-        regex_pattern: str = r"(\d+\.?\d*)\s*[:/]\s*(\d+\.?\d*)"
+        # Default pattern includes the full-width colon '：'
+        regex_pattern: str = r"(\d+\.?\d*)\s*[:：/]\s*(\d+\.?\d*)"
     ):
-        # --- Validation ---
+        # --- Robust Validation ---
         try:
-            if re.compile(regex_pattern).groups != 2:
+            compiled_pattern = re.compile(regex_pattern)
+            if compiled_pattern.groups != 2:
                 raise ValueError(
-                    "regex_pattern must contain exactly two "
+                    "RatioCalculator regex_pattern must contain exactly two "
                     "capturing groups '(...)'."
                 )
+            if compiled_pattern.groupindex:
+                raise ValueError(
+                    "RatioCalculator must be initialized with unnamed capturing groups "
+                    "(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
+                )
         except re.error as e:
             raise ValueError(f"Invalid regex pattern provided: {e}") from e
@@ -728,27 +731,20 @@ class RatioCalculator:
     def __call__(self, column: pl.Series) -> pl.Series:
         """
         Applies the ratio calculation logic to the input column.
-        Args:
-            column (pl.Series): The input Polars Series of ratio strings.
-        Returns:
-            pl.Series: A new Series of floats containing the division result.
-                       Returns null for invalid formats or division by zero.
+        This version uses .str.extract() for maximum stability.
         """
-        # .extract_groups returns a struct with a field for each capture group
-        # e.g., {"group_1": "40", "group_2": "5"}
-        groups = column.str.extract_groups(self.regex_pattern)
+        # Extract numerator (group 1) and denominator (group 2) separately.
+        numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
+        denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
-        # Extract numerator and denominator, casting to float
-        # strict=False ensures that non-matches become null
-        numerator = groups.struct.field("group_1").cast(pl.Float64, strict=False)
-        denominator = groups.struct.field("group_2").cast(pl.Float64, strict=False)
+        # Calculate the ratio, handling division by zero.
+        final_expr = pl.when(denominator_expr != 0).then(
+            numerator_expr / denominator_expr
+        ).otherwise(
+            None # Handles both null denominators and division by zero
+        )
-        # Safely perform division, returning null if denominator is 0
-        final_expr = pl.when(denominator != 0).then(numerator / denominator).otherwise(None)
-        return pl.select(final_expr).to_series()
+        return pl.select(final_expr.round(4)).to_series()
 class CategoryMapper:

ml_tools/data_exploration.py CHANGED Viewed

@@ -15,9 +15,9 @@ __all__ = [
     "summarize_dataframe",
     "drop_constant_columns",
     "drop_rows_with_missing_data",
-    "split_features_targets",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
     "plot_value_distributions",
@@ -125,7 +125,9 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     # Stage 1: Drop rows with all target columns missing
     if targets is not None:
-        target_na = df_clean[targets].isnull().all(axis=1)
+        # validate targets
+        valid_targets = [target for target in targets if target in df_clean.columns]
+        target_na = df_clean[valid_targets].isnull().all(axis=1)
         if target_na.any():
             print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
             df_clean = df_clean[~target_na]
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     return df_clean
-def split_features_targets(df: pd.DataFrame, targets: list[str]):
-    """
-    Splits a DataFrame's columns into features and targets.
-    Args:
-        df (pd.DataFrame): Pandas DataFrame containing the dataset.
-        targets (list[str]): List of column names to be treated as target variables.
-    Returns:
-        tuple: A tuple containing:
-            - pd.DataFrame: Features dataframe.
-            - pd.DataFrame: Targets dataframe.
-    Prints:
-        - Shape of the original dataframe.
-        - Shape of the features dataframe.
-        - Shape of the targets dataframe.
-    """
-    df_targets = df[targets]
-    df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
-    return df_features, df_targets
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
     Displays a table of columns with missing values, showing both the count and
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
         show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
+        skip_columns (list[str] | None): If given, these columns won't be included in the drop process.
     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
     """
-    missing_fraction = df.isnull().mean()
+    # If skip_columns is provided, create a list of columns to check.
+    # Otherwise, check all columns.
+    cols_to_check = df.columns
+    if skip_columns:
+        # Use set difference for efficient exclusion
+        cols_to_check = df.columns.difference(skip_columns)
+    # Calculate the missing fraction only on the columns to be checked
+    missing_fraction = df[cols_to_check].isnull().mean()
     cols_to_drop = missing_fraction[missing_fraction > threshold].index
     if len(cols_to_drop) > 0:
@@ -231,6 +220,30 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
         return df
+def split_features_targets(df: pd.DataFrame, targets: list[str]):
+    """
+    Splits a DataFrame's columns into features and targets.
+    Args:
+        df (pd.DataFrame): Pandas DataFrame containing the dataset.
+        targets (list[str]): List of column names to be treated as target variables.
+    Returns:
+        tuple: A tuple containing:
+            - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.
+    Prints:
+        - Shape of the original dataframe.
+        - Shape of the features dataframe.
+        - Shape of the targets dataframe.
+    """
+    df_targets = df[targets]
+    df_features = df.drop(columns=targets)
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md RENAMED Viewed

File without changes

{dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

dragon-ml-toolbox 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl