polarfrost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polarfrost/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ PolarFrost: Fast k-anonymity implementation using Polars and PySpark.
+
+ This package provides efficient implementations of k-anonymity algorithms,
+ including the Mondrian algorithm, with support for both local (Polars)
+ and distributed (PySpark) processing.
+ """
+
+ __version__ = "0.1.0"
+
+ # Import main functions
+ try:
+     from .mondrian import (
+         mondrian_k_anonymity,
+         mondrian_k_anonymity_polars,
+         mondrian_k_anonymity_spark
+     )
+     __all__ = [
+         'mondrian_k_anonymity',
+         'mondrian_k_anonymity_polars',
+         'mondrian_k_anonymity_spark'
+     ]
+ except ImportError as e:
+     import warnings
+     warnings.warn(f"Could not import mondrian: {e}")
+     __all__ = []
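A note on the import guard above: if `.mondrian` cannot be imported (for example, because an optional dependency is missing), the package still loads, emits a warning, and exposes an empty `__all__`. A minimal sketch of how a caller might probe for this — hypothetical consumer code, not part of the wheel:

```python
import warnings

# Capture the warning the package emits when .mondrian cannot be imported
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import polarfrost

if "mondrian_k_anonymity" in polarfrost.__all__:
    print(f"polarfrost {polarfrost.__version__}: Mondrian backend available")
else:
    print("Mondrian backend unavailable:", [str(w.message) for w in caught])
```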
polarfrost/clustering.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Clustering-based k-anonymity implementation using Polars.
+ """
+ from typing import List, Optional, Union
+ import polars as pl
+
+ def clustering_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     method: str = "fcbg",
+ ) -> pl.DataFrame:
+     """
+     Perform clustering-based k-anonymity using Polars.
+
+     Args:
+         df: Input DataFrame or LazyFrame
+         quasi_identifiers: List of column names to use for clustering
+         sensitive_column: Column containing sensitive information
+         k: Minimum group size for k-anonymity
+         categorical: List of categorical column names
+         method: Clustering method ('fcbg', 'rsc', or 'random')
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     raise NotImplementedError("Clustering k-anonymity will be implemented soon")
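`clustering_k_anonymity` is only a stub in 0.1.0: every call raises `NotImplementedError`. A short sketch of the intended call shape, with illustrative column names and data:

```python
import polars as pl
from polarfrost.clustering import clustering_k_anonymity

df = pl.DataFrame({
    "age": [25, 31, 38, 44],
    "gender": ["F", "M", "F", "M"],
    "income": [52000, 61000, 58000, 67000],
})

try:
    clustering_k_anonymity(
        df,
        quasi_identifiers=["age", "gender"],
        sensitive_column="income",
        k=2,
        categorical=["gender"],
        method="fcbg",  # per the docstring: 'fcbg', 'rsc', or 'random'
    )
except NotImplementedError as exc:
    print(exc)  # "Clustering k-anonymity will be implemented soon"
```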
polarfrost/mondrian.py ADDED
@@ -0,0 +1,256 @@
+ """
+ Efficient Mondrian k-Anonymity implementation using Polars and PySpark.
+ Compatible with local (Polars) and Databricks/Spark (PySpark) environments.
+ """
+
+ from typing import List, Optional, Union, Dict, Any, TYPE_CHECKING
+ import polars as pl
+
+ if TYPE_CHECKING:
+     from pyspark.sql import DataFrame as SparkDataFrame
+     from pyspark.sql.types import StructType
+
+ # ------------------------- POLARS VERSION -------------------------
+ def mondrian_k_anonymity_polars(
+     df: "pl.DataFrame | pl.LazyFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+ ) -> pl.DataFrame:
+     """
+     Perform Mondrian k-anonymity using Polars LazyFrame for local processing.
+     Accepts either DataFrame or LazyFrame as input.
+     """
+     if categorical is None:
+         categorical = []
+
+     # Convert to LazyFrame if not already
+     if isinstance(df, pl.DataFrame):
+         df = df.lazy()
+     elif not isinstance(df, pl.LazyFrame):
+         raise ValueError("Input must be a Polars DataFrame or LazyFrame")
+
+     # Initialize partitions with the full dataset
+     partitions = [df]
+     result = []
+
+     # Process partitions until none left
+     while partitions:
+         part = partitions.pop()
+
+         # Get partition size (lazy evaluation)
+         n_rows = part.select(pl.len()).collect().item(0, 0)
+
+         # If partition is too small to split, add to results
+         if n_rows < 2 * k:
+             result.append(part)
+             continue
+
+         # Compute spans for each quasi-identifier
+         spans: Dict[str, Any] = {}
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use number of unique values as span
+                 n_unique = part.select(pl.col(col).n_unique()).collect().item()
+                 spans[col] = n_unique
+             else:
+                 # For numerical, use range as span
+                 stats = part.select([
+                     pl.col(col).min().alias("min"),
+                     pl.col(col).max().alias("max")
+                 ]).collect()
+                 col_min = stats[0, "min"]
+                 col_max = stats[0, "max"]
+                 spans[col] = col_max - col_min if col_max is not None and col_min is not None else 0
+
+         # Find the attribute with maximum span
+         split_col = max(spans, key=spans.get)  # type: ignore
+
+         # If no split possible, add to results
+         if spans[split_col] == 0:
+             result.append(part)
+             continue
+
+         # Split the partition
+         if split_col in categorical:
+             # For categorical, split on unique values
+             uniq_vals = part.select(pl.col(split_col).unique()).collect().to_series().to_list()
+             mid = len(uniq_vals) // 2
+             left_vals = set(uniq_vals[:mid])
+             right_vals = set(uniq_vals[mid:])
+             left = part.filter(pl.col(split_col).is_in(left_vals))
+             right = part.filter(pl.col(split_col).is_in(right_vals))
+         else:
+             # For numerical, split on median
+             median = part.select(pl.col(split_col).median()).collect().item()
+             left = part.filter(pl.col(split_col) <= median)
+             right = part.filter(pl.col(split_col) > median)
+
+         # Check if both partitions satisfy k-anonymity
+         left_n = left.select(pl.len()).collect().item(0, 0)
+         right_n = right.select(pl.len()).collect().item(0, 0)
+
+         if left_n >= k and right_n >= k:
+             # Both partitions are valid, continue splitting
+             partitions.extend([left, right])
+         else:
+             # At least one partition is too small, keep as is
+             result.append(part)
+
+     # Aggregate each partition
+     agg_rows = []
+     for part in result:
+         # Collect only the columns we need
+         part_df = part.select(quasi_identifiers + [sensitive_column]).collect()
+         row = {}
+
+         # Generalize quasi-identifiers
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use set of unique values
+                 row[col] = ','.join(sorted(map(str, part_df[col].unique())))
+             else:
+                 # For numerical, use range
+                 row[col] = f"{part_df[col].min()}-{part_df[col].max()}"
+
+         # Add sensitive values and count
+         row[sensitive_column] = ','.join(sorted(map(str, part_df[sensitive_column].unique())))
+         row['count'] = part_df.height
+         agg_rows.append(row)
+
+     return pl.DataFrame(agg_rows)
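A minimal local sketch of the Polars path above, on a tiny in-memory frame. The data is illustrative; the exact generalizations depend on the median and unique-value splits:

```python
import polars as pl
from polarfrost.mondrian import mondrian_k_anonymity_polars

lf = pl.DataFrame({
    "age":     [23, 27, 31, 35, 41, 45, 52, 58],
    "zipcode": ["12345", "12345", "12346", "12346",
                "54321", "54321", "54322", "54322"],
    "income":  [40, 42, 51, 49, 63, 66, 71, 75],
}).lazy()  # LazyFrame input is accepted; collection happens inside

anon = mondrian_k_anonymity_polars(
    lf,
    quasi_identifiers=["age", "zipcode"],
    sensitive_column="income",
    k=2,
    categorical=["zipcode"],
)

# Each row is one partition of >= k records: numeric QIs become "min-max"
# strings, categorical QIs a comma-joined value set, plus a 'count' column
print(anon)
```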
+
+ # ------------------------- PYSPARK VERSION -------------------------
+ def mondrian_k_anonymity_spark(
+     df: "SparkDataFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> "SparkDataFrame":
+     """
+     Perform Mondrian k-anonymity using PySpark for distributed processing.
+     """
+     import pandas as pd
+
+     if categorical is None:
+         categorical = []
+
+     # applyInPandas requires an explicit output schema
+     if schema is None:
+         raise ValueError("A schema is required for the PySpark implementation")
+
+     def mondrian_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+         partitions = [pdf]
+         result = []
+
+         while partitions:
+             part = partitions.pop()
+
+             # If partition is too small to split, add to results
+             if len(part) < 2 * k:
+                 result.append(part)
+                 continue
+
+             # Compute spans for each quasi-identifier
+             spans = {}
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     spans[col] = part[col].nunique()
+                 else:
+                     col_min = part[col].min()
+                     col_max = part[col].max()
+                     spans[col] = col_max - col_min if pd.notnull(col_max) and pd.notnull(col_min) else 0
+
+             # Find the attribute with maximum span
+             split_col = max(spans, key=spans.get)
+
+             # If no split possible, add to results
+             if spans[split_col] == 0:
+                 result.append(part)
+                 continue
+
+             # Split the partition
+             if split_col in categorical:
+                 # For categorical, split on unique values
+                 uniq_vals = part[split_col].unique()
+                 mid = len(uniq_vals) // 2
+                 left_vals = set(uniq_vals[:mid])
+                 right_vals = set(uniq_vals[mid:])
+                 left = part[part[split_col].isin(left_vals)]
+                 right = part[part[split_col].isin(right_vals)]
+             else:
+                 # For numerical, split on median
+                 median = part[split_col].median()
+                 left = part[part[split_col] <= median]
+                 right = part[part[split_col] > median]
+
+             # Check if both partitions satisfy k-anonymity
+             if len(left) >= k and len(right) >= k:
+                 # Both partitions are valid, continue splitting
+                 partitions.extend([left, right])
+             else:
+                 # At least one partition is too small, keep as is
+                 result.append(part)
+
+         # Aggregate the results
+         agg_rows = []
+         for part in result:
+             row = {}
+
+             # Generalize quasi-identifiers
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     # For categorical, use set of unique values
+                     row[col] = ','.join(sorted(map(str, part[col].unique())))
+                 else:
+                     # For numerical, use range
+                     row[col] = f"{part[col].min()}-{part[col].max()}"
+
+             # Add sensitive values and count
+             row[sensitive_column] = ','.join(sorted(map(str, part[sensitive_column].unique())))
+             row['count'] = len(part)
+             agg_rows.append(row)
+
+         return pd.DataFrame(agg_rows)
+
+     # Apply the function to the whole DataFrame as a single group
+     # (all rows go to one worker, so this suits small-to-medium data)
+     return df.groupBy().applyInPandas(mondrian_partition, schema=schema)
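Because the grouped-map function returns plain pandas DataFrames, Spark needs the output schema spelled out. A sketch of a call under assumed column names; the string and long types mirror the "min-max", comma-joined, and `count` values built above:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType
from polarfrost.mondrian import mondrian_k_anonymity_spark

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(
    [(23, "F", 40000), (27, "F", 42000), (31, "M", 51000), (35, "M", 49000)],
    ["age", "gender", "income"],
)

# All output columns except 'count' are generalized to strings
out_schema = StructType([
    StructField("age", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("income", StringType(), True),
    StructField("count", LongType(), True),
])

anon = mondrian_k_anonymity_spark(
    sdf,
    quasi_identifiers=["age", "gender"],
    sensitive_column="income",
    k=2,
    categorical=["gender"],
    schema=out_schema,
)
anon.show(truncate=False)
```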
+
+ # ------------------------- DISPATCHER -------------------------
+ def mondrian_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame, "SparkDataFrame"],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> Union[pl.DataFrame, "SparkDataFrame"]:
+     """
+     Dispatcher: Use Polars or PySpark Mondrian k-anonymity depending on input type.
+
+     Args:
+         df: Input DataFrame (Polars or PySpark)
+         quasi_identifiers: List of column names that are quasi-identifiers
+         sensitive_column: Name of the sensitive column
+         k: Anonymity parameter (minimum group size)
+         categorical: List of categorical column names
+         schema: Schema for PySpark output (required for PySpark)
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     try:
+         from pyspark.sql import DataFrame as SparkDataFrame
+         if isinstance(df, SparkDataFrame):
+             return mondrian_k_anonymity_spark(df, quasi_identifiers, sensitive_column, k, categorical, schema)
+     except ImportError:
+         pass
+
+     if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+         return mondrian_k_anonymity_polars(df, quasi_identifiers, sensitive_column, k, categorical)
+
+     raise ValueError("Input df must be a polars.DataFrame, polars.LazyFrame, or pyspark.sql.DataFrame")
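The dispatcher keeps one call site working across both engines: Spark DataFrames take the `schema=` path, anything Polars goes local. A short sketch using a lazy scan (same illustrative file as the README's Quick Start):

```python
import polars as pl
from polarfrost import mondrian_k_anonymity

# A Polars LazyFrame dispatches to mondrian_k_anonymity_polars
anon = mondrian_k_anonymity(
    pl.scan_csv("your_data.csv"),
    quasi_identifiers=["age", "gender", "zipcode"],
    sensitive_column="income",
    k=3,
    categorical=["gender", "zipcode"],
)
print(anon)
```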
polarfrost/py.typed ADDED
@@ -0,0 +1,2 @@
+ # This file indicates that this package is typed according to PEP 561
+ # It allows type checkers to recognize the package as type-annotated
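Since the wheel ships this `py.typed` marker, PEP 561-aware checkers such as `mypy` will read polarfrost's inline annotations. A tiny sketch of code a checker could then validate (hypothetical file `check_types.py`, run with `mypy check_types.py`):

```python
import polars as pl
from polarfrost import mondrian_k_anonymity

df = pl.DataFrame({"age": [30, 41], "income": [50000, 62000]})

# OK: matches the annotated signature
anon = mondrian_k_anonymity(df, ["age"], "income", k=2)

# A checker would reject this: k is annotated as int, not str
# anon = mondrian_k_anonymity(df, ["age"], "income", k="2")
```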
polarfrost/tests/__init__.py ADDED
File without changes (empty file)
polarfrost-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,86 @@
+ Metadata-Version: 2.4
+ Name: polarfrost
+ Version: 0.1.0
+ Summary: A fast k-anonymity implementation using Polars and PySpark
+ Home-page: https://github.com/rglew/polarfrost
+ Author: Richard Glew
+ Author-email: richard.glew@hotmail.com
+ Keywords: anonymization,privacy,polars,k-anonymity,data-privacy
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Security
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: polars>=0.13.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: numpy>=1.21.0
+ Provides-Extra: spark
+ Requires-Dist: pyspark>=3.0.0; extra == "spark"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=6.0; extra == "dev"
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
+ Requires-Dist: black>=21.0; extra == "dev"
+ Requires-Dist: isort>=5.0; extra == "dev"
+ Requires-Dist: mypy>=0.900; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Polarfrost
+
+ A fast k-anonymity implementation using Polars, featuring the Mondrian algorithm for efficient privacy-preserving data analysis (a clustering-based variant is stubbed out for a future release).
+
+ ## Features
+
+ - 🚀 Blazing fast k-anonymity using Polars
+ - 🧊 Supports both local (Polars) and distributed (PySpark) processing
+ - 📊 Preserves data utility while ensuring privacy
+ - 🐍 Simple Python API
+
+ ## Installation
+
+ ```bash
+ pip install polarfrost
+ ```
+
+ ## Quick Start
+
+ ```python
+ import polars as pl
+ from polarfrost import mondrian_k_anonymity
+
+ # Load your data
+ df = pl.read_csv("your_data.csv")
+
+ # Apply k-anonymity
+ anonymized = mondrian_k_anonymity(
+     df,
+     quasi_identifiers=["age", "gender", "zipcode"],
+     sensitive_column="income",
+     k=3,
+     categorical=["gender", "zipcode"]
+ )
+
+ print(anonymized)
+ ```
+
+ ## License
+
+ MIT
polarfrost-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ polarfrost/__init__.py,sha256=f8nFJQsdr5ykHIY69PM5x11gOLRNgJXEty6DR8OQ5eU,697
+ polarfrost/clustering.py,sha256=9wJ237zQAZXHlimmch-1Yr3xGiSu6GjioxQ2xvd7vqM,955
+ polarfrost/mondrian.py,sha256=6-V5_uhx8UqNiuVKRPMYzSE51O8FsQEaHBJbyZhoJLU,9839
+ polarfrost/py.typed,sha256=M2mJCnUN7Ice7bLDMBMcrHzD8_Cjh2U52FOGVfM7c5o,139
+ polarfrost/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ polarfrost-0.1.0.dist-info/METADATA,sha256=uI2hX_xs-02m495-zdhmelVs8gMPlyvSxruvuZQ3Z1E,2380
+ polarfrost-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ polarfrost-0.1.0.dist-info/top_level.txt,sha256=sYpSVIpjaKGJfdvJtbHvo6usiVi0SxqXjdJ_pB_JD0c,11
+ polarfrost-0.1.0.dist-info/RECORD,,
polarfrost-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
polarfrost-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ polarfrost