polarfrost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polarfrost/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ PolarFrost: Fast k-anonymity implementation using Polars and PySpark.
+
+ This package provides efficient implementations of k-anonymity algorithms,
+ including the Mondrian algorithm, with support for both local (Polars)
+ and distributed (PySpark) processing.
+ """
+
+ __version__ = "0.1.0"
+
+ # Import main functions
+ try:
+     from .mondrian import (
+         mondrian_k_anonymity,
+         mondrian_k_anonymity_polars,
+         mondrian_k_anonymity_spark
+     )
+     __all__ = [
+         'mondrian_k_anonymity',
+         'mondrian_k_anonymity_polars',
+         'mondrian_k_anonymity_spark'
+     ]
+ except ImportError as e:
+     import warnings
+     warnings.warn(f"Could not import mondrian: {e}")
+     __all__ = []
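A note on the import guard above: if `.mondrian` cannot be imported (for example, because an optional dependency is missing), the package still loads, emits a warning, and exposes an empty `__all__`. A minimal sketch of how a caller might probe for this — hypothetical consumer code, not part of the wheel:

```python
import warnings

# Capture the warning the package emits when .mondrian cannot be imported
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import polarfrost

if "mondrian_k_anonymity" in polarfrost.__all__:
    print(f"polarfrost {polarfrost.__version__}: Mondrian backend available")
else:
    print("Mondrian backend unavailable:", [str(w.message) for w in caught])
```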
polarfrost/clustering.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Clustering-based k-anonymity implementation using Polars.
+ """
+ from typing import List, Optional, Union
+ import polars as pl
+
+ def clustering_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     method: str = "fcbg",
+ ) -> pl.DataFrame:
+     """
+     Perform clustering-based k-anonymity using Polars.
+
+     Args:
+         df: Input DataFrame or LazyFrame
+         quasi_identifiers: List of column names to use for clustering
+         sensitive_column: Column containing sensitive information
+         k: Minimum group size for k-anonymity
+         categorical: List of categorical column names
+         method: Clustering method ('fcbg', 'rsc', or 'random')
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     raise NotImplementedError("Clustering k-anonymity will be implemented soon")
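`clustering_k_anonymity` is only a stub in 0.1.0: every call raises `NotImplementedError`. A short sketch of the intended call shape, with illustrative column names and data:

```python
import polars as pl
from polarfrost.clustering import clustering_k_anonymity

df = pl.DataFrame({
    "age": [25, 31, 38, 44],
    "gender": ["F", "M", "F", "M"],
    "income": [52000, 61000, 58000, 67000],
})

try:
    clustering_k_anonymity(
        df,
        quasi_identifiers=["age", "gender"],
        sensitive_column="income",
        k=2,
        categorical=["gender"],
        method="fcbg",  # per the docstring: 'fcbg', 'rsc', or 'random'
    )
except NotImplementedError as exc:
    print(exc)  # "Clustering k-anonymity will be implemented soon"
```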
polarfrost/mondrian.py ADDED
@@ -0,0 +1,256 @@
+ """
+ Efficient Mondrian k-Anonymity implementation using Polars and PySpark.
+ Compatible with local (Polars) and Databricks/Spark (PySpark) environments.
+ """
+
+ from typing import List, Optional, Union, Dict, Any, TYPE_CHECKING
+ import polars as pl
+
+ if TYPE_CHECKING:
+     from pyspark.sql import DataFrame as SparkDataFrame
+     from pyspark.sql.types import StructType
+
+ # ------------------------- POLARS VERSION -------------------------
+ def mondrian_k_anonymity_polars(
+     df: "pl.DataFrame | pl.LazyFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+ ) -> pl.DataFrame:
+     """
+     Perform Mondrian k-anonymity using Polars LazyFrame for local processing.
+     Accepts either DataFrame or LazyFrame as input.
+     """
+     if categorical is None:
+         categorical = []
+
+     # Convert to LazyFrame if not already
+     if isinstance(df, pl.DataFrame):
+         df = df.lazy()
+     elif not isinstance(df, pl.LazyFrame):
+         raise ValueError("Input must be a Polars DataFrame or LazyFrame")
+
+     # Initialize partitions with the full dataset
+     partitions = [df]
+     result = []
+
+     # Process partitions until none left
+     while partitions:
+         part = partitions.pop()
+
+         # Get partition size (lazy evaluation)
+         n_rows = part.select(pl.len()).collect().item(0, 0)
+
+         # If partition is too small to split, add to results
+         if n_rows < 2 * k:
+             result.append(part)
+             continue
+
+         # Compute spans for each quasi-identifier
+         spans: Dict[str, Any] = {}
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use number of unique values as span
+                 n_unique = part.select(pl.col(col).n_unique()).collect().item()
+                 spans[col] = n_unique
+             else:
+                 # For numerical, use range as span
+                 stats = part.select([
+                     pl.col(col).min().alias("min"),
+                     pl.col(col).max().alias("max")
+                 ]).collect()
+                 col_min = stats[0, "min"]
+                 col_max = stats[0, "max"]
+                 spans[col] = col_max - col_min if col_max is not None and col_min is not None else 0
+
+         # Find the attribute with maximum span
+         split_col = max(spans, key=spans.get)  # type: ignore
+
+         # If no split possible, add to results
+         if spans[split_col] == 0:
+             result.append(part)
+             continue
+
+         # Split the partition
+         if split_col in categorical:
+             # For categorical, split on unique values
+             uniq_vals = part.select(pl.col(split_col).unique()).collect().to_series().to_list()
+             mid = len(uniq_vals) // 2
+             left_vals = set(uniq_vals[:mid])
+             right_vals = set(uniq_vals[mid:])
+             left = part.filter(pl.col(split_col).is_in(left_vals))
+             right = part.filter(pl.col(split_col).is_in(right_vals))
+         else:
+             # For numerical, split on median
+             median = part.select(pl.col(split_col).median()).collect().item()
+             left = part.filter(pl.col(split_col) <= median)
+             right = part.filter(pl.col(split_col) > median)
+
+         # Check if both partitions satisfy k-anonymity
+         left_n = left.select(pl.len()).collect().item(0, 0)
+         right_n = right.select(pl.len()).collect().item(0, 0)
+
+         if left_n >= k and right_n >= k:
+             # Both partitions are valid, continue splitting
+             partitions.extend([left, right])
+         else:
+             # At least one partition is too small, keep as is
+             result.append(part)
+
+     # Aggregate each partition
+     agg_rows = []
+     for part in result:
+         # Collect only the columns we need
+         part_df = part.select(quasi_identifiers + [sensitive_column]).collect()
+         row = {}
+
+         # Generalize quasi-identifiers
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use set of unique values
+                 row[col] = ','.join(sorted(map(str, part_df[col].unique())))
+             else:
+                 # For numerical, use range
+                 row[col] = f"{part_df[col].min()}-{part_df[col].max()}"
+
+         # Add sensitive values and count
+         row[sensitive_column] = ','.join(sorted(map(str, part_df[sensitive_column].unique())))
+         row['count'] = part_df.height
+         agg_rows.append(row)
+
+     return pl.DataFrame(agg_rows)
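A minimal local sketch of the Polars path above, on a tiny in-memory frame. The data is illustrative; the exact generalizations depend on the median and unique-value splits:

```python
import polars as pl
from polarfrost.mondrian import mondrian_k_anonymity_polars

lf = pl.DataFrame({
    "age":     [23, 27, 31, 35, 41, 45, 52, 58],
    "zipcode": ["12345", "12345", "12346", "12346",
                "54321", "54321", "54322", "54322"],
    "income":  [40, 42, 51, 49, 63, 66, 71, 75],
}).lazy()  # LazyFrame input is accepted; collection happens inside

anon = mondrian_k_anonymity_polars(
    lf,
    quasi_identifiers=["age", "zipcode"],
    sensitive_column="income",
    k=2,
    categorical=["zipcode"],
)

# Each row is one partition of >= k records: numeric QIs become "min-max"
# strings, categorical QIs a comma-joined value set, plus a 'count' column
print(anon)
```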
+
+ # ------------------------- PYSPARK VERSION -------------------------
+ def mondrian_k_anonymity_spark(
+     df: "SparkDataFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> "SparkDataFrame":
+     """
+     Perform Mondrian k-anonymity using PySpark for distributed processing.
+     """
+     import pandas as pd
+
+     if categorical is None:
+         categorical = []
+
+     # applyInPandas requires an explicit output schema
+     if schema is None:
+         raise ValueError("A schema is required for the PySpark implementation")
+
+     def mondrian_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+         partitions = [pdf]
+         result = []
+
+         while partitions:
+             part = partitions.pop()
+
+             # If partition is too small to split, add to results
+             if len(part) < 2 * k:
+                 result.append(part)
+                 continue
+
+             # Compute spans for each quasi-identifier
+             spans = {}
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     spans[col] = part[col].nunique()
+                 else:
+                     col_min = part[col].min()
+                     col_max = part[col].max()
+                     spans[col] = col_max - col_min if pd.notnull(col_max) and pd.notnull(col_min) else 0
+
+             # Find the attribute with maximum span
+             split_col = max(spans, key=spans.get)
+
+             # If no split possible, add to results
+             if spans[split_col] == 0:
+                 result.append(part)
+                 continue
+
+             # Split the partition
+             if split_col in categorical:
+                 # For categorical, split on unique values
+                 uniq_vals = part[split_col].unique()
+                 mid = len(uniq_vals) // 2
+                 left_vals = set(uniq_vals[:mid])
+                 right_vals = set(uniq_vals[mid:])
+                 left = part[part[split_col].isin(left_vals)]
+                 right = part[part[split_col].isin(right_vals)]
+             else:
+                 # For numerical, split on median
+                 median = part[split_col].median()
+                 left = part[part[split_col] <= median]
+                 right = part[part[split_col] > median]
+
+             # Check if both partitions satisfy k-anonymity
+             if len(left) >= k and len(right) >= k:
+                 # Both partitions are valid, continue splitting
+                 partitions.extend([left, right])
+             else:
+                 # At least one partition is too small, keep as is
+                 result.append(part)
+
+         # Aggregate the results
+         agg_rows = []
+         for part in result:
+             row = {}
+
+             # Generalize quasi-identifiers
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     # For categorical, use set of unique values
+                     row[col] = ','.join(sorted(map(str, part[col].unique())))
+                 else:
+                     # For numerical, use range
+                     row[col] = f"{part[col].min()}-{part[col].max()}"
+
+             # Add sensitive values and count
+             row[sensitive_column] = ','.join(sorted(map(str, part[sensitive_column].unique())))
+             row['count'] = len(part)
+             agg_rows.append(row)
+
+         return pd.DataFrame(agg_rows)
+
+     # Apply the function to the whole DataFrame as a single group
+     # (all rows go to one worker, so this suits small-to-medium data)
+     return df.groupBy().applyInPandas(mondrian_partition, schema=schema)
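Because the grouped-map function returns plain pandas DataFrames, Spark needs the output schema spelled out. A sketch of a call under assumed column names; the string and long types mirror the "min-max", comma-joined, and `count` values built above:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType
from polarfrost.mondrian import mondrian_k_anonymity_spark

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(
    [(23, "F", 40000), (27, "F", 42000), (31, "M", 51000), (35, "M", 49000)],
    ["age", "gender", "income"],
)

# All output columns except 'count' are generalized to strings
out_schema = StructType([
    StructField("age", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("income", StringType(), True),
    StructField("count", LongType(), True),
])

anon = mondrian_k_anonymity_spark(
    sdf,
    quasi_identifiers=["age", "gender"],
    sensitive_column="income",
    k=2,
    categorical=["gender"],
    schema=out_schema,
)
anon.show(truncate=False)
```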
+
+ # ------------------------- DISPATCHER -------------------------
+ def mondrian_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame, "SparkDataFrame"],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> Union[pl.DataFrame, "SparkDataFrame"]:
+     """
+     Dispatcher: Use Polars or PySpark Mondrian k-anonymity depending on input type.
+
+     Args:
+         df: Input DataFrame (Polars or PySpark)
+         quasi_identifiers: List of column names that are quasi-identifiers
+         sensitive_column: Name of the sensitive column
+         k: Anonymity parameter (minimum group size)
+         categorical: List of categorical column names
+         schema: Schema for PySpark output (required for PySpark)
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     try:
+         from pyspark.sql import DataFrame as SparkDataFrame
+         if isinstance(df, SparkDataFrame):
+             return mondrian_k_anonymity_spark(df, quasi_identifiers, sensitive_column, k, categorical, schema)
+     except ImportError:
+         pass
+
+     if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+         return mondrian_k_anonymity_polars(df, quasi_identifiers, sensitive_column, k, categorical)
+
+     raise ValueError("Input df must be a polars.DataFrame, polars.LazyFrame, or pyspark.sql.DataFrame")
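The dispatcher keeps one call site working across both engines: Spark DataFrames take the `schema=` path, anything Polars goes local. A short sketch using a lazy scan (same illustrative file as the README's Quick Start):

```python
import polars as pl
from polarfrost import mondrian_k_anonymity

# A Polars LazyFrame dispatches to mondrian_k_anonymity_polars
anon = mondrian_k_anonymity(
    pl.scan_csv("your_data.csv"),
    quasi_identifiers=["age", "gender", "zipcode"],
    sensitive_column="income",
    k=3,
    categorical=["gender", "zipcode"],
)
print(anon)
```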
polarfrost/py.typed ADDED
@@ -0,0 +1,2 @@
+ # This file indicates that this package is typed according to PEP 561
+ # It allows type checkers to recognize the package as type-annotated
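Since the wheel ships this `py.typed` marker, PEP 561-aware checkers such as `mypy` will read polarfrost's inline annotations. A tiny sketch of code a checker could then validate (hypothetical file `check_types.py`, run with `mypy check_types.py`):

```python
import polars as pl
from polarfrost import mondrian_k_anonymity

df = pl.DataFrame({"age": [30, 41], "income": [50000, 62000]})

# OK: matches the annotated signature
anon = mondrian_k_anonymity(df, ["age"], "income", k=2)

# A checker would reject this: k is annotated as int, not str
# anon = mondrian_k_anonymity(df, ["age"], "income", k="2")
```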
polarfrost/tests/__init__.py ADDED
File without changes (empty file)
polarfrost-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,86 @@
+ Metadata-Version: 2.4
+ Name: polarfrost
+ Version: 0.1.0
+ Summary: A fast k-anonymity implementation using Polars and PySpark
+ Home-page: https://github.com/rglew/polarfrost
+ Author: Richard Glew
+ Author-email: richard.glew@hotmail.com
+ Keywords: anonymization,privacy,polars,k-anonymity,data-privacy
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Security
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: polars>=0.13.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: numpy>=1.21.0
+ Provides-Extra: spark
+ Requires-Dist: pyspark>=3.0.0; extra == "spark"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=6.0; extra == "dev"
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
+ Requires-Dist: black>=21.0; extra == "dev"
+ Requires-Dist: isort>=5.0; extra == "dev"
+ Requires-Dist: mypy>=0.900; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Polarfrost
+
+ A fast k-anonymity implementation using Polars, featuring the Mondrian algorithm for efficient privacy-preserving data analysis (a clustering-based variant is stubbed out for a future release).
+
+ ## Features
+
+ - 🚀 Blazing fast k-anonymity using Polars
+ - 🧊 Supports both local (Polars) and distributed (PySpark) processing
+ - 📊 Preserves data utility while ensuring privacy
+ - 🐍 Simple Python API
+
+ ## Installation
+
+ ```bash
+ pip install polarfrost
+ ```
+
+ ## Quick Start
+
+ ```python
+ import polars as pl
+ from polarfrost import mondrian_k_anonymity
+
+ # Load your data
+ df = pl.read_csv("your_data.csv")
+
+ # Apply k-anonymity
+ anonymized = mondrian_k_anonymity(
+     df,
+     quasi_identifiers=["age", "gender", "zipcode"],
+     sensitive_column="income",
+     k=3,
+     categorical=["gender", "zipcode"]
+ )
+
+ print(anonymized)
+ ```
+
+ ## License
+
+ MIT
polarfrost-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ polarfrost/__init__.py,sha256=f8nFJQsdr5ykHIY69PM5x11gOLRNgJXEty6DR8OQ5eU,697
+ polarfrost/clustering.py,sha256=9wJ237zQAZXHlimmch-1Yr3xGiSu6GjioxQ2xvd7vqM,955
+ polarfrost/mondrian.py,sha256=6-V5_uhx8UqNiuVKRPMYzSE51O8FsQEaHBJbyZhoJLU,9839
+ polarfrost/py.typed,sha256=M2mJCnUN7Ice7bLDMBMcrHzD8_Cjh2U52FOGVfM7c5o,139
+ polarfrost/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ polarfrost-0.1.0.dist-info/METADATA,sha256=uI2hX_xs-02m495-zdhmelVs8gMPlyvSxruvuZQ3Z1E,2380
+ polarfrost-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ polarfrost-0.1.0.dist-info/top_level.txt,sha256=sYpSVIpjaKGJfdvJtbHvo6usiVi0SxqXjdJ_pB_JD0c,11
+ polarfrost-0.1.0.dist-info/RECORD,,
polarfrost-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
polarfrost-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ polarfrost