polarfrost 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polarfrost/__init__.py +63 -6
- polarfrost/clustering.py +57 -5
- polarfrost/mondrian.py +309 -69
- polarfrost-0.2.0.dist-info/METADATA +579 -0
- polarfrost-0.2.0.dist-info/RECORD +9 -0
- polarfrost-0.1.0.dist-info/METADATA +0 -86
- polarfrost-0.1.0.dist-info/RECORD +0 -9
- {polarfrost-0.1.0.dist-info → polarfrost-0.2.0.dist-info}/WHEEL +0 -0
- {polarfrost-0.1.0.dist-info → polarfrost-0.2.0.dist-info}/top_level.txt +0 -0
polarfrost/mondrian.py
CHANGED
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
     from pyspark.sql import DataFrame as SparkDataFrame
     from pyspark.sql.types import StructType
 
+
 # ------------------------- POLARS VERSION -------------------------
 def mondrian_k_anonymity_polars(
     df: "pl.DataFrame | pl.LazyFrame",
@@ -24,29 +25,40 @@ def mondrian_k_anonymity_polars(
     """
     if categorical is None:
         categorical = []
-
+
+    # Input validation
+    if not isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+        raise ValueError("Input must be a Polars DataFrame or LazyFrame")
+
     # Convert to LazyFrame if not already
     if isinstance(df, pl.DataFrame):
         df = df.lazy()
-
-
-
+
+    # Check for empty DataFrame by collecting a sample
+    if df.select(pl.len()).collect().item(0, 0) == 0:
+        raise ValueError("Input DataFrame cannot be empty")
+
+    # Validate k is a positive integer
+    if not isinstance(k, (int, str)) or (isinstance(k, str) and not k.isdigit()) or int(k) < 1:
+        raise ValueError("k must be a positive integer")
+    k = int(k)  # Convert to int if it's a string of digits
+
     # Initialize partitions with the full dataset
     partitions = [df]
     result = []
-
+
     # Process partitions until none left
     while partitions:
         part = partitions.pop()
-
+
         # Get partition size (lazy evaluation)
         n_rows = part.select(pl.len()).collect().item(0, 0)
-
+
         # If partition is too small to split, add to results
         if n_rows < 2 * k:
            result.append(part)
            continue
-
+
         # Compute spans for each quasi-identifier
         spans: Dict[str, Any] = {}
         for col in quasi_identifiers:
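Note: the validation added above accepts k either as an int or as a string of digits, and rejects empty frames up front. A minimal sketch of what that means for callers (illustrative only; parameter names are taken from the docstrings in this diff, and the import path assumes the module location polarfrost/mondrian.py shown above):

    import polars as pl
    from polarfrost.mondrian import mondrian_k_anonymity_polars

    df = pl.DataFrame({"age": [30, 31, 41, 42], "diagnosis": ["A", "B", "A", "B"]})

    # k passed as a digit string is coerced to int by the new check;
    # k=0, a non-numeric string, or an empty frame now raises ValueError.
    anon = mondrian_k_anonymity_polars(df, ["age"], "diagnosis", k="2", categorical=[])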
@@ -56,26 +68,39 @@ def mondrian_k_anonymity_polars(
                 spans[col] = n_unique
             else:
                 # For numerical, use range as span
-                stats = part.select([
-                    pl.col(col).min().alias("min"),
-                    pl.col(col).max().alias("max")
-                ]).collect()
+                stats = part.select(
+                    [pl.col(col).min().alias("min"), pl.col(col).max().alias("max")]
+                ).collect()
                 col_min = stats[0, "min"]
                 col_max = stats[0, "max"]
-
-
+
+                # Handle string comparison by converting to float if possible
+                if col_min is not None and col_max is not None:
+                    try:
+                        # Try to convert to float for comparison
+                        min_val = float(col_min) if not isinstance(col_min, (int, float)) else col_min
+                        max_val = float(col_max) if not isinstance(col_max, (int, float)) else col_max
+                        spans[col] = max_val - min_val
+                    except (ValueError, TypeError):
+                        # If conversion fails, use string length difference
+                        spans[col] = abs(len(str(col_max)) - len(str(col_min)))
+                else:
+                    spans[col] = 0
+
         # Find the attribute with maximum span
         split_col = max(spans, key=spans.get)  # type: ignore
-
+
         # If no split possible, add to results
         if spans[split_col] == 0:
             result.append(part)
             continue
-
+
         # Split the partition
         if split_col in categorical:
             # For categorical, split on unique values
-            uniq_vals = part.select(pl.col(split_col).unique()).collect().to_series().to_list()
+            uniq_vals = (
+                part.select(pl.col(split_col).unique()).collect().to_series().to_list()
+            )
             mid = len(uniq_vals) // 2
             left_vals = set(uniq_vals[:mid])
             right_vals = set(uniq_vals[mid:])
@@ -86,41 +111,57 @@ def mondrian_k_anonymity_polars(
             median = part.select(pl.col(split_col).median()).collect().item()
             left = part.filter(pl.col(split_col) <= median)
             right = part.filter(pl.col(split_col) > median)
-
+
         # Check if both partitions satisfy k-anonymity
         left_n = left.select(pl.len()).collect().item(0, 0)
         right_n = right.select(pl.len()).collect().item(0, 0)
-
+
         if left_n >= k and right_n >= k:
             # Both partitions are valid, continue splitting
             partitions.extend([left, right])
         else:
             # At least one partition is too small, keep as is
             result.append(part)
-
+
     # Aggregate each partition
     agg_rows = []
     for part in result:
         # Collect only the columns we need
         part_df = part.select(quasi_identifiers + [sensitive_column]).collect()
         row = {}
-
+
         # Generalize quasi-identifiers
         for col in quasi_identifiers:
             if col in categorical:
                 # For categorical, use set of unique values
-
+                unique_vals = part_df[col].unique()
+                row[col] = ",".join(sorted(str(v) for v in unique_vals))
             else:
                 # For numerical, use range
-
-
+                min_val = part_df[col].min()
+                max_val = part_df[col].max()
+
+                # Ensure we have valid numeric values
+                if min_val is None or max_val is None:
+                    row[col] = "*"  # Handle null values
+                else:
+                    # Convert to string, handling bytes and other types
+                    min_str = min_val.decode("utf-8") if isinstance(min_val, bytes) else str(min_val)
+                    max_str = max_val.decode("utf-8") if isinstance(max_val, bytes) else str(max_val)
+
+                    # Store as string range
+                    row[col] = f"{min_str}-{max_str}"
+
         # Add sensitive values and count
-
-        row[
+        sensitive_vals = part_df[sensitive_column].unique()
+        row[sensitive_column] = ",".join(sorted(str(v) for v in sensitive_vals))
+        # Store count as integer
+        row["count"] = int(part_df.height)
         agg_rows.append(row)
-
+
     return pl.DataFrame(agg_rows)
 
+
 # ------------------------- PYSPARK VERSION -------------------------
 def mondrian_k_anonymity_spark(
     df: "SparkDataFrame",
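Note: the aggregation loop above reduces every finished partition to a single row: numeric quasi-identifiers become a "min-max" string, categorical ones (and the sensitive column) a sorted, comma-joined set of values, plus a count column. A standalone sketch of that per-partition step on made-up data (not the library API, just the same expressions):

    import polars as pl

    part_df = pl.DataFrame({
        "age": [34, 37, 41],
        "city": ["Oslo", "Bergen", "Oslo"],
        "diagnosis": ["A", "B", "A"],
    })

    row = {
        "age": f"{part_df['age'].min()}-{part_df['age'].max()}",                       # "34-41"
        "city": ",".join(sorted(str(v) for v in part_df["city"].unique())),            # "Bergen,Oslo"
        "diagnosis": ",".join(sorted(str(v) for v in part_df["diagnosis"].unique())),  # "A,B"
        "count": int(part_df.height),                                                  # 3
    }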
@@ -132,26 +173,50 @@ def mondrian_k_anonymity_spark(
 ) -> "SparkDataFrame":
     """
     Perform Mondrian k-anonymity using PySpark for distributed processing.
+
+    Args:
+        df: Input PySpark DataFrame
+        quasi_identifiers: List of column names that are quasi-identifiers
+        sensitive_column: Name of the sensitive column
+        k: Anonymity parameter (minimum group size), must be a positive integer
+        categorical: List of categorical column names
+        schema: Schema for the output DataFrame
+
+    Returns:
+        Anonymized DataFrame with generalized quasi-identifiers
     """
     import pandas as pd
     from pyspark.sql.functions import pandas_udf, PandasUDFType
 
+    # Validate k parameter first
+    if not isinstance(k, int) or k <= 0:
+        raise ValueError("k must be a positive integer")
+
+    # Validate schema
+    if schema is None:
+        raise ValueError("Schema must be provided for PySpark UDF")
+
+    # Check for empty DataFrame
+    if df.rdd.isEmpty():
+        raise ValueError("Input DataFrame cannot be empty")
+
     if categorical is None:
         categorical = []
-
-
+
+    # Define the UDF with proper type hints
+    @pandas_udf(returnType=schema, functionType=PandasUDFType.GROUPED_MAP)
     def mondrian_partition(pdf: pd.DataFrame) -> pd.DataFrame:
         partitions = [pdf]
         result = []
-
+
         while partitions:
             part = partitions.pop()
-
+
             # If partition is too small to split, add to results
             if len(part) < 2 * k:
                 result.append(part)
                 continue
-
+
             # Compute spans for each quasi-identifier
             spans = {}
             for col in quasi_identifiers:
@@ -160,31 +225,42 @@ def mondrian_k_anonymity_spark(
                 else:
                     col_min = part[col].min()
                     col_max = part[col].max()
-                    spans[col] =
-
+                    spans[col] = (
+                        col_max - col_min
+                        if pd.notnull(col_max) and pd.notnull(col_min)
+                        else 0
+                    )
+
             # Find the attribute with maximum span
-            split_col = max(spans, key=
-
+            split_col = max(spans.items(), key=lambda x: x[1])[0]  # type: ignore
+
             # If no split possible, add to results
-            if spans
+            if spans.get(split_col, 0) <= 0:
                 result.append(part)
                 continue
-
-            # Split the
+
+            # Split on the chosen column
             if split_col in categorical:
-                # For categorical, split on
-
-
-
-
-
-
+                # For categorical, split on median value
+                value_counts = part[split_col].value_counts()
+                if len(value_counts) > 0:
+                    split_val = value_counts.index[len(value_counts) // 2]
+                    mask = part[split_col] == split_val
+                    left = part[mask]
+                    right = part[~mask]
+                else:
+                    result.append(part)
+                    continue
             else:
                 # For numerical, split on median
-
-
-
-
+                median_val = part[split_col].median()
+                if pd.notna(median_val):
+                    left = part[part[split_col] <= median_val]
+                    right = part[part[split_col] > median_val]
+                else:
+                    result.append(part)
+                    continue
+
             # Check if both partitions satisfy k-anonymity
             if len(left) >= k and len(right) >= k:
                 # Both partitions are valid, continue splitting
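Note: the new categorical branch above splits a pandas partition by picking the value in the middle of the value_counts index and separating matching rows from the rest. A small pandas-only sketch with made-up data:

    import pandas as pd

    part = pd.DataFrame({"city": ["Oslo", "Oslo", "Bergen", "Tromsø", "Bergen"]})

    value_counts = part["city"].value_counts()
    split_val = value_counts.index[len(value_counts) // 2]  # middle of the frequency-ordered index
    mask = part["city"] == split_val
    left, right = part[mask], part[~mask]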
@@ -192,33 +268,37 @@ def mondrian_k_anonymity_spark(
             else:
                 # At least one partition is too small, keep as is
                 result.append(part)
-
+
         # Aggregate the results
         agg_rows = []
         for part in result:
             row = {}
-
+
             # Generalize quasi-identifiers
             for col in quasi_identifiers:
                 if col in categorical:
                     # For categorical, use set of unique values
-                    row[col] =
+                    row[col] = ",".join(sorted(map(str, part[col].unique())))
                 else:
                     # For numerical, use range
                     row[col] = f"{part[col].min()}-{part[col].max()}"
-
+
             # Add sensitive values and count
-            row[sensitive_column] =
-
+            row[sensitive_column] = ",".join(
+                sorted(str(v) for v in part[sensitive_column].unique())
+            )
+            # Store count as integer
+            row["count"] = int(len(part))
             agg_rows.append(row)
-
+
         return pd.DataFrame(agg_rows)
-
-    # Apply the
-
-
-
-
+
+    # Apply the UDF with explicit schema
+    result_df = df.groupBy().applyInPandas(
+        mondrian_partition, schema=schema  # type: ignore
+    )
+    return result_df
+
 
 # ------------------------- DISPATCHER -------------------------
 def mondrian_k_anonymity(
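Note: because 0.2.0 raises when schema is None and applies the UDF via groupBy().applyInPandas, callers now have to describe the output shape themselves: one string field per quasi-identifier, one for the sensitive column, and an integer count field. A hedged sketch of what such a schema could look like (column names are placeholders, not part of the package):

    from pyspark.sql.types import StructType, StructField, StringType, LongType

    output_schema = StructType([
        StructField("age", StringType(), True),
        StructField("city", StringType(), True),
        StructField("diagnosis", StringType(), True),
        StructField("count", LongType(), True),
    ])

    # anonymized = mondrian_k_anonymity_spark(
    #     spark_df, ["age", "city"], "diagnosis", k=3,
    #     categorical=["city"], schema=output_schema,
    # )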
@@ -231,7 +311,7 @@ def mondrian_k_anonymity(
 ) -> Union[pl.DataFrame, "SparkDataFrame"]:
     """
     Dispatcher: Use Polars or PySpark Mondrian k-anonymity depending on input type.
-
+
     Args:
         df: Input DataFrame (Polars or PySpark)
         quasi_identifiers: List of column names that are quasi-identifiers
@@ -239,18 +319,178 @@ def mondrian_k_anonymity(
         k: Anonymity parameter (minimum group size)
         categorical: List of categorical column names
         schema: Schema for PySpark output (required for PySpark)
-
+
     Returns:
         Anonymized DataFrame with generalized quasi-identifiers
     """
     try:
         from pyspark.sql import DataFrame as SparkDataFrame
+
         if isinstance(df, SparkDataFrame):
-            return mondrian_k_anonymity_spark(
+            return mondrian_k_anonymity_spark(
+                df, quasi_identifiers, sensitive_column, k, categorical, schema
+            )
     except ImportError:
         pass
-
+
     if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
-        return mondrian_k_anonymity_polars(
+        return mondrian_k_anonymity_polars(
+            df, quasi_identifiers, sensitive_column, k, categorical
+        )
+
+    raise ValueError(
+        "Input df must be a polars.DataFrame, polars.LazyFrame, or pyspark.sql.DataFrame"
+    )
+
+
+def _generalize_partition(
+    partition: pl.DataFrame,
+    quasi_identifiers: List[str],
+    categorical: List[str],
+    mask_value: str = "masked"
+) -> pl.DataFrame:
+    """Generalize a partition by applying Mondrian-style generalization."""
+    result = partition.clone()
+
+    for col in quasi_identifiers:
+        is_cat = col in categorical
+        if is_cat:
+            # For categoricals, use mask if multiple values exist
+            if result[col].n_unique() > 1:
+                result = result.with_columns(pl.lit(mask_value).alias(col))
+        else:
+            # For numerical, create a range
+            min_val = result[col].min()
+            max_val = result[col].max()
+            if min_val == max_val:
+                result = result.with_columns(pl.lit(min_val).alias(col))
+            else:
+                result = result.with_columns(pl.lit(f"[{min_val}-{max_val}]").alias(col))
+
+    return result
+
+
+def mondrian_k_anonymity_alt(
+    df: pl.LazyFrame,
+    quasi_identifiers: List[str],
+    sensitive_column: str,
+    k: int,
+    categorical: Optional[List[str]] = None,
+    mask_value: str = "masked",
+    group_columns: Optional[List[str]] = None,
+) -> pl.LazyFrame:
+    """
+    Alternative Mondrian k-anonymity that preserves the original row count.
+
+    Args:
+        df: Input LazyFrame
+        quasi_identifiers: List of column names that are quasi-identifiers
+        sensitive_column: Name of the sensitive column
+        k: Anonymity parameter (minimum group size)
+        categorical: List of categorical column names
+        mask_value: Value to use for masking small groups
+        group_columns: Additional columns to use for grouping but keep unchanged
+
+    Returns:
+        Anonymized LazyFrame with same row count as input
+    """
+    if not isinstance(df, pl.LazyFrame):
+        raise ValueError("Input must be a Polars LazyFrame")
+
+    # Get schema to preserve column order
+    schema = df.schema
+    all_columns = list(schema.keys())
+
+    # Initialize parameters
+    categorical = categorical or []
+    group_columns = group_columns or []
+
+    # Validate inputs
+    if k < 1:
+        raise ValueError("k must be a positive integer")
+
+    # Check if all specified columns exist
+    for col in set(quasi_identifiers + [sensitive_column] + group_columns + categorical):
+        if col not in schema:
+            raise ValueError(f"Column '{col}' not found in DataFrame")
+
+    # Ensure no overlap between group_columns and QIs
+    if any(col in quasi_identifiers for col in group_columns):
+        raise ValueError("group_columns cannot overlap with quasi_identifiers")
+
+    # Collect the data once
+    df_collected = df.collect()
+
+    # Process each group
+    if group_columns:
+        # Get unique group combinations
+        groups = df_collected.select(group_columns).unique()
+
+        results = []
+
+        for group in groups.rows(named=True):
+            # Filter current group
+            condition = pl.lit(True)
+            for col, val in group.items():
+                condition = condition & (pl.col(col) == val)
+
+            group_df = df_collected.filter(condition)
+            group_size = len(group_df)
+
+            if group_size < k:
+                # Mask QIs and sensitive column for small groups
+                masked_cols = {}
+                # Mask all QIs
+                for col in quasi_identifiers:
+                    if col in categorical:
+                        masked_cols[col] = pl.lit(mask_value)
+                # Always mask the sensitive column for small groups
+                masked_cols[sensitive_column] = pl.lit(mask_value)
+
+                if masked_cols:
+                    group_df = group_df.with_columns(**masked_cols)
+
+                results.append(group_df)
+            else:
+                # Apply generalization to QIs
+                if quasi_identifiers:
+                    group_df = _generalize_partition(
+                        group_df,
+                        quasi_identifiers,
+                        categorical,
+                        mask_value
+                    )
+                results.append(group_df)
+
+        # Combine results
+        result_df = pl.concat(results)
+    else:
+        # Process entire dataset as one group
+        if len(df_collected) < k:
+            # Mask all QIs and sensitive column
+            masked_cols = {}
+            # Mask all QIs
+            for col in quasi_identifiers:
+                if col in categorical:
+                    masked_cols[col] = pl.lit(mask_value)
+            # Always mask the sensitive column for small groups
+            masked_cols[sensitive_column] = pl.lit(mask_value)
+
+            if masked_cols:
+                result_df = df_collected.with_columns(**masked_cols)
+            else:
+                result_df = df_collected
+        else:
+            # Apply generalization to QIs
+            if quasi_identifiers:
+                result_df = _generalize_partition(
+                    df_collected,
+                    quasi_identifiers,
+                    categorical,
+                    mask_value
+                )
+            else:
+                result_df = df_collected
 
-
+    # Ensure original column order and return as LazyFrame
+    return result_df.select(all_columns).lazy()
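Note: mondrian_k_anonymity_alt is new in 0.2.0 and, unlike the aggregating variants above, keeps one output row per input row, masking or generalizing values in place. A minimal usage sketch based on the signature added here (illustrative data; assumes the function is imported from polarfrost.mondrian as defined in this file):

    import polars as pl
    from polarfrost.mondrian import mondrian_k_anonymity_alt

    lf = pl.DataFrame({
        "age": [34, 37, 41, 44],
        "city": ["Oslo", "Oslo", "Bergen", "Bergen"],
        "diagnosis": ["A", "B", "A", "B"],
    }).lazy()

    anon = mondrian_k_anonymity_alt(
        lf,
        quasi_identifiers=["age", "city"],
        sensitive_column="diagnosis",
        k=2,
        categorical=["city"],
    ).collect()
    # all four rows are kept; with this data "age" becomes "[34-44]" and "city" is masked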