polarfrost 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polarfrost-0.1.0/PKG-INFO +86 -0
- polarfrost-0.1.0/README.md +41 -0
- polarfrost-0.1.0/polarfrost/__init__.py +26 -0
- polarfrost-0.1.0/polarfrost/clustering.py +29 -0
- polarfrost-0.1.0/polarfrost/mondrian.py +256 -0
- polarfrost-0.1.0/polarfrost/py.typed +2 -0
- polarfrost-0.1.0/polarfrost/tests/__init__.py +0 -0
- polarfrost-0.1.0/polarfrost.egg-info/PKG-INFO +86 -0
- polarfrost-0.1.0/polarfrost.egg-info/SOURCES.txt +16 -0
- polarfrost-0.1.0/polarfrost.egg-info/dependency_links.txt +1 -0
- polarfrost-0.1.0/polarfrost.egg-info/not-zip-safe +1 -0
- polarfrost-0.1.0/polarfrost.egg-info/requires.txt +13 -0
- polarfrost-0.1.0/polarfrost.egg-info/top_level.txt +1 -0
- polarfrost-0.1.0/setup.cfg +4 -0
- polarfrost-0.1.0/setup.py +54 -0
- polarfrost-0.1.0/tests/test_imports.py +9 -0
- polarfrost-0.1.0/tests/test_mondrian.py +28 -0
- polarfrost-0.1.0/tests/test_mondrian_implementation.py +88 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: polarfrost
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: A fast k-anonymity implementation using Polars and PySpark
|
5
|
+
Home-page: https://github.com/rglew/polarfrost
|
6
|
+
Author: Richard Glew
|
7
|
+
Author-email: richard.glew@hotmail.com
|
8
|
+
Keywords: anonymization,privacy,polars,k-anonymity,data-privacy
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
19
|
+
Classifier: Topic :: Security
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Requires-Python: >=3.8
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
Requires-Dist: polars>=0.13.0
|
24
|
+
Requires-Dist: pandas>=1.3.0
|
25
|
+
Requires-Dist: numpy>=1.21.0
|
26
|
+
Provides-Extra: spark
|
27
|
+
Requires-Dist: pyspark>=3.0.0; extra == "spark"
|
28
|
+
Provides-Extra: dev
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
30
|
+
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
31
|
+
Requires-Dist: black>=21.0; extra == "dev"
|
32
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
33
|
+
Requires-Dist: mypy>=0.900; extra == "dev"
|
34
|
+
Dynamic: author
|
35
|
+
Dynamic: author-email
|
36
|
+
Dynamic: classifier
|
37
|
+
Dynamic: description
|
38
|
+
Dynamic: description-content-type
|
39
|
+
Dynamic: home-page
|
40
|
+
Dynamic: keywords
|
41
|
+
Dynamic: provides-extra
|
42
|
+
Dynamic: requires-dist
|
43
|
+
Dynamic: requires-python
|
44
|
+
Dynamic: summary
|
45
|
+
|
46
|
+
# Polarfrost
|
47
|
+
|
48
|
+
A fast k-anonymity implementation using Polars, featuring both Mondrian and Clustering algorithms for efficient privacy-preserving data analysis.
|
49
|
+
|
50
|
+
## Features
|
51
|
+
|
52
|
+
- 🚀 Blazing fast k-anonymity using Polars
|
53
|
+
- 🧊 Supports both local (Polars) and distributed (PySpark) processing
|
54
|
+
- 📊 Preserves data utility while ensuring privacy
|
55
|
+
- 🐍 Simple Python API
|
56
|
+
|
57
|
+
## Installation
|
58
|
+
|
59
|
+
```bash
|
60
|
+
pip install polarfrost
|
61
|
+
```
|
62
|
+
|
63
|
+
## Quick Start
|
64
|
+
|
65
|
+
```python
|
66
|
+
import polars as pl
|
67
|
+
from polarfrost import mondrian_k_anonymity
|
68
|
+
|
69
|
+
# Load your data
|
70
|
+
df = pl.read_csv("your_data.csv")
|
71
|
+
|
72
|
+
# Apply k-anonymity
|
73
|
+
anonymized = mondrian_k_anonymity(
|
74
|
+
df,
|
75
|
+
quasi_identifiers=["age", "gender", "zipcode"],
|
76
|
+
sensitive_column="income",
|
77
|
+
k=3,
|
78
|
+
categorical=["gender", "zipcode"]
|
79
|
+
)
|
80
|
+
|
81
|
+
print(anonymized)
|
82
|
+
```
|
83
|
+
|
84
|
+
## License
|
85
|
+
|
86
|
+
MIT
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Polarfrost
|
2
|
+
|
3
|
+
A fast k-anonymity implementation using Polars, featuring both Mondrian and Clustering algorithms for efficient privacy-preserving data analysis.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- 🚀 Blazing fast k-anonymity using Polars
|
8
|
+
- 🧊 Supports both local (Polars) and distributed (PySpark) processing
|
9
|
+
- 📊 Preserves data utility while ensuring privacy
|
10
|
+
- 🐍 Simple Python API
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
```bash
|
15
|
+
pip install polarfrost
|
16
|
+
```
|
17
|
+
|
18
|
+
## Quick Start
|
19
|
+
|
20
|
+
```python
|
21
|
+
import polars as pl
|
22
|
+
from polarfrost import mondrian_k_anonymity
|
23
|
+
|
24
|
+
# Load your data
|
25
|
+
df = pl.read_csv("your_data.csv")
|
26
|
+
|
27
|
+
# Apply k-anonymity
|
28
|
+
anonymized = mondrian_k_anonymity(
|
29
|
+
df,
|
30
|
+
quasi_identifiers=["age", "gender", "zipcode"],
|
31
|
+
sensitive_column="income",
|
32
|
+
k=3,
|
33
|
+
categorical=["gender", "zipcode"]
|
34
|
+
)
|
35
|
+
|
36
|
+
print(anonymized)
|
37
|
+
```
|
38
|
+
|
39
|
+
## License
|
40
|
+
|
41
|
+
MIT
|
@@ -0,0 +1,26 @@
|
|
1
|
+
"""
|
2
|
+
PolarFrost: Fast k-anonymity implementation using Polars and PySpark.
|
3
|
+
|
4
|
+
This package provides efficient implementations of k-anonymity algorithms,
|
5
|
+
including the Mondrian algorithm, with support for both local (Polars)
|
6
|
+
and distributed (PySpark) processing.
|
7
|
+
"""
|
8
|
+
|
9
|
+
__version__ = "0.1.0"
|
10
|
+
|
11
|
+
# Import main functions
|
12
|
+
try:
|
13
|
+
from .mondrian import (
|
14
|
+
mondrian_k_anonymity,
|
15
|
+
mondrian_k_anonymity_polars,
|
16
|
+
mondrian_k_anonymity_spark
|
17
|
+
)
|
18
|
+
__all__ = [
|
19
|
+
'mondrian_k_anonymity',
|
20
|
+
'mondrian_k_anonymity_polars',
|
21
|
+
'mondrian_k_anonymity_spark'
|
22
|
+
]
|
23
|
+
except ImportError as e:
|
24
|
+
import warnings
|
25
|
+
warnings.warn(f"Could not import mondrian: {e}")
|
26
|
+
__all__ = []
|
@@ -0,0 +1,29 @@
|
|
1
|
+
"""
|
2
|
+
Clustering-based k-anonymity implementation using Polars.
|
3
|
+
"""
|
4
|
+
from typing import List, Optional, Union
|
5
|
+
import polars as pl
|
6
|
+
|
7
|
+
def clustering_k_anonymity(
|
8
|
+
df: Union[pl.DataFrame, pl.LazyFrame],
|
9
|
+
quasi_identifiers: List[str],
|
10
|
+
sensitive_column: str,
|
11
|
+
k: int,
|
12
|
+
categorical: Optional[List[str]] = None,
|
13
|
+
method: str = "fcbg",
|
14
|
+
) -> pl.DataFrame:
|
15
|
+
"""
|
16
|
+
Perform clustering-based k-anonymity using Polars.
|
17
|
+
|
18
|
+
Args:
|
19
|
+
df: Input DataFrame or LazyFrame
|
20
|
+
quasi_identifiers: List of column names to use for clustering
|
21
|
+
sensitive_column: Column containing sensitive information
|
22
|
+
k: Minimum group size for k-anonymity
|
23
|
+
categorical: List of categorical column names
|
24
|
+
method: Clustering method ('fcbg', 'rsc', or 'random')
|
25
|
+
|
26
|
+
Returns:
|
27
|
+
Anonymized DataFrame with generalized quasi-identifiers
|
28
|
+
"""
|
29
|
+
raise NotImplementedError("Clustering k-anonymity will be implemented soon")
|
@@ -0,0 +1,256 @@
|
|
1
|
+
"""
|
2
|
+
Efficient Mondrian k-Anonymity implementation using Polars and PySpark.
|
3
|
+
Compatible with local (Polars) and Databricks/Spark (PySpark) environments.
|
4
|
+
"""
|
5
|
+
|
6
|
+
from typing import List, Optional, Union, Dict, Any, cast, TYPE_CHECKING
|
7
|
+
import polars as pl
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
11
|
+
from pyspark.sql.types import StructType
|
12
|
+
|
13
|
+
# ------------------------- POLARS VERSION -------------------------
|
14
|
+
def mondrian_k_anonymity_polars(
|
15
|
+
df: "pl.DataFrame | pl.LazyFrame",
|
16
|
+
quasi_identifiers: List[str],
|
17
|
+
sensitive_column: str,
|
18
|
+
k: int,
|
19
|
+
categorical: Optional[List[str]] = None,
|
20
|
+
) -> pl.DataFrame:
|
21
|
+
"""
|
22
|
+
Perform Mondrian k-anonymity using Polars LazyFrame for local processing.
|
23
|
+
Accepts either DataFrame or LazyFrame as input.
|
24
|
+
"""
|
25
|
+
if categorical is None:
|
26
|
+
categorical = []
|
27
|
+
|
28
|
+
# Convert to LazyFrame if not already
|
29
|
+
if isinstance(df, pl.DataFrame):
|
30
|
+
df = df.lazy()
|
31
|
+
elif not isinstance(df, pl.LazyFrame):
|
32
|
+
raise ValueError("Input must be a Polars DataFrame or LazyFrame")
|
33
|
+
|
34
|
+
# Initialize partitions with the full dataset
|
35
|
+
partitions = [df]
|
36
|
+
result = []
|
37
|
+
|
38
|
+
# Process partitions until none left
|
39
|
+
while partitions:
|
40
|
+
part = partitions.pop()
|
41
|
+
|
42
|
+
# Get partition size (lazy evaluation)
|
43
|
+
n_rows = part.select(pl.len()).collect().item(0, 0)
|
44
|
+
|
45
|
+
# If partition is too small to split, add to results
|
46
|
+
if n_rows < 2 * k:
|
47
|
+
result.append(part)
|
48
|
+
continue
|
49
|
+
|
50
|
+
# Compute spans for each quasi-identifier
|
51
|
+
spans: Dict[str, Any] = {}
|
52
|
+
for col in quasi_identifiers:
|
53
|
+
if col in categorical:
|
54
|
+
# For categorical, use number of unique values as span
|
55
|
+
n_unique = part.select(pl.col(col).n_unique()).collect().item()
|
56
|
+
spans[col] = n_unique
|
57
|
+
else:
|
58
|
+
# For numerical, use range as span
|
59
|
+
stats = part.select([
|
60
|
+
pl.col(col).min().alias("min"),
|
61
|
+
pl.col(col).max().alias("max")
|
62
|
+
]).collect()
|
63
|
+
col_min = stats[0, "min"]
|
64
|
+
col_max = stats[0, "max"]
|
65
|
+
spans[col] = col_max - col_min if col_max is not None and col_min is not None else 0
|
66
|
+
|
67
|
+
# Find the attribute with maximum span
|
68
|
+
split_col = max(spans, key=spans.get) # type: ignore
|
69
|
+
|
70
|
+
# If no split possible, add to results
|
71
|
+
if spans[split_col] == 0:
|
72
|
+
result.append(part)
|
73
|
+
continue
|
74
|
+
|
75
|
+
# Split the partition
|
76
|
+
if split_col in categorical:
|
77
|
+
# For categorical, split on unique values
|
78
|
+
uniq_vals = part.select(pl.col(split_col).unique()).collect().to_series().to_list()
|
79
|
+
mid = len(uniq_vals) // 2
|
80
|
+
left_vals = set(uniq_vals[:mid])
|
81
|
+
right_vals = set(uniq_vals[mid:])
|
82
|
+
left = part.filter(pl.col(split_col).is_in(left_vals))
|
83
|
+
right = part.filter(pl.col(split_col).is_in(right_vals))
|
84
|
+
else:
|
85
|
+
# For numerical, split on median
|
86
|
+
median = part.select(pl.col(split_col).median()).collect().item()
|
87
|
+
left = part.filter(pl.col(split_col) <= median)
|
88
|
+
right = part.filter(pl.col(split_col) > median)
|
89
|
+
|
90
|
+
# Check if both partitions satisfy k-anonymity
|
91
|
+
left_n = left.select(pl.len()).collect().item(0, 0)
|
92
|
+
right_n = right.select(pl.len()).collect().item(0, 0)
|
93
|
+
|
94
|
+
if left_n >= k and right_n >= k:
|
95
|
+
# Both partitions are valid, continue splitting
|
96
|
+
partitions.extend([left, right])
|
97
|
+
else:
|
98
|
+
# At least one partition is too small, keep as is
|
99
|
+
result.append(part)
|
100
|
+
|
101
|
+
# Aggregate each partition
|
102
|
+
agg_rows = []
|
103
|
+
for part in result:
|
104
|
+
# Collect only the columns we need
|
105
|
+
part_df = part.select(quasi_identifiers + [sensitive_column]).collect()
|
106
|
+
row = {}
|
107
|
+
|
108
|
+
# Generalize quasi-identifiers
|
109
|
+
for col in quasi_identifiers:
|
110
|
+
if col in categorical:
|
111
|
+
# For categorical, use set of unique values
|
112
|
+
row[col] = ','.join(sorted(map(str, part_df[col].unique())))
|
113
|
+
else:
|
114
|
+
# For numerical, use range
|
115
|
+
row[col] = f"{part_df[col].min()}-{part_df[col].max()}"
|
116
|
+
|
117
|
+
# Add sensitive values and count
|
118
|
+
row[sensitive_column] = ','.join(sorted(map(str, part_df[sensitive_column].unique())))
|
119
|
+
row['count'] = part_df.height
|
120
|
+
agg_rows.append(row)
|
121
|
+
|
122
|
+
return pl.DataFrame(agg_rows)
|
123
|
+
|
124
|
+
# ------------------------- PYSPARK VERSION -------------------------
|
125
|
+
def mondrian_k_anonymity_spark(
|
126
|
+
df: "SparkDataFrame",
|
127
|
+
quasi_identifiers: List[str],
|
128
|
+
sensitive_column: str,
|
129
|
+
k: int,
|
130
|
+
categorical: Optional[List[str]] = None,
|
131
|
+
schema: Optional["StructType"] = None,
|
132
|
+
) -> "SparkDataFrame":
|
133
|
+
"""
|
134
|
+
Perform Mondrian k-anonymity using PySpark for distributed processing.
|
135
|
+
"""
|
136
|
+
import pandas as pd
|
137
|
+
from pyspark.sql.functions import pandas_udf, PandasUDFType
|
138
|
+
|
139
|
+
if categorical is None:
|
140
|
+
categorical = []
|
141
|
+
|
142
|
+
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
|
143
|
+
def mondrian_partition(pdf: pd.DataFrame) -> pd.DataFrame:
|
144
|
+
partitions = [pdf]
|
145
|
+
result = []
|
146
|
+
|
147
|
+
while partitions:
|
148
|
+
part = partitions.pop()
|
149
|
+
|
150
|
+
# If partition is too small to split, add to results
|
151
|
+
if len(part) < 2 * k:
|
152
|
+
result.append(part)
|
153
|
+
continue
|
154
|
+
|
155
|
+
# Compute spans for each quasi-identifier
|
156
|
+
spans = {}
|
157
|
+
for col in quasi_identifiers:
|
158
|
+
if col in categorical:
|
159
|
+
spans[col] = part[col].nunique()
|
160
|
+
else:
|
161
|
+
col_min = part[col].min()
|
162
|
+
col_max = part[col].max()
|
163
|
+
spans[col] = col_max - col_min if pd.notnull(col_max) and pd.notnull(col_min) else 0
|
164
|
+
|
165
|
+
# Find the attribute with maximum span
|
166
|
+
split_col = max(spans, key=spans.get)
|
167
|
+
|
168
|
+
# If no split possible, add to results
|
169
|
+
if spans[split_col] == 0:
|
170
|
+
result.append(part)
|
171
|
+
continue
|
172
|
+
|
173
|
+
# Split the partition
|
174
|
+
if split_col in categorical:
|
175
|
+
# For categorical, split on unique values
|
176
|
+
uniq_vals = part[split_col].unique()
|
177
|
+
mid = len(uniq_vals) // 2
|
178
|
+
left_vals = set(uniq_vals[:mid])
|
179
|
+
right_vals = set(uniq_vals[mid:])
|
180
|
+
left = part[part[split_col].isin(left_vals)]
|
181
|
+
right = part[part[split_col].isin(right_vals)]
|
182
|
+
else:
|
183
|
+
# For numerical, split on median
|
184
|
+
median = part[split_col].median()
|
185
|
+
left = part[part[split_col] <= median]
|
186
|
+
right = part[part[split_col] > median]
|
187
|
+
|
188
|
+
# Check if both partitions satisfy k-anonymity
|
189
|
+
if len(left) >= k and len(right) >= k:
|
190
|
+
# Both partitions are valid, continue splitting
|
191
|
+
partitions.extend([left, right])
|
192
|
+
else:
|
193
|
+
# At least one partition is too small, keep as is
|
194
|
+
result.append(part)
|
195
|
+
|
196
|
+
# Aggregate the results
|
197
|
+
agg_rows = []
|
198
|
+
for part in result:
|
199
|
+
row = {}
|
200
|
+
|
201
|
+
# Generalize quasi-identifiers
|
202
|
+
for col in quasi_identifiers:
|
203
|
+
if col in categorical:
|
204
|
+
# For categorical, use set of unique values
|
205
|
+
row[col] = ','.join(sorted(map(str, part[col].unique())))
|
206
|
+
else:
|
207
|
+
# For numerical, use range
|
208
|
+
row[col] = f"{part[col].min()}-{part[col].max()}"
|
209
|
+
|
210
|
+
# Add sensitive values and count
|
211
|
+
row[sensitive_column] = ','.join(sorted(map(str, part[sensitive_column].unique())))
|
212
|
+
row['count'] = len(part)
|
213
|
+
agg_rows.append(row)
|
214
|
+
|
215
|
+
return pd.DataFrame(agg_rows)
|
216
|
+
|
217
|
+
# Apply the function to the entire DataFrame
|
218
|
+
if schema is not None:
|
219
|
+
return df.groupBy().applyInPandas(mondrian_partition, schema=schema)
|
220
|
+
else:
|
221
|
+
return df.groupBy().applyInPandas(mondrian_partition)
|
222
|
+
|
223
|
+
# ------------------------- DISPATCHER -------------------------
|
224
|
+
def mondrian_k_anonymity(
|
225
|
+
df: Union[pl.DataFrame, pl.LazyFrame, "SparkDataFrame"],
|
226
|
+
quasi_identifiers: List[str],
|
227
|
+
sensitive_column: str,
|
228
|
+
k: int,
|
229
|
+
categorical: Optional[List[str]] = None,
|
230
|
+
schema: Optional["StructType"] = None,
|
231
|
+
) -> Union[pl.DataFrame, "SparkDataFrame"]:
|
232
|
+
"""
|
233
|
+
Dispatcher: Use Polars or PySpark Mondrian k-anonymity depending on input type.
|
234
|
+
|
235
|
+
Args:
|
236
|
+
df: Input DataFrame (Polars or PySpark)
|
237
|
+
quasi_identifiers: List of column names that are quasi-identifiers
|
238
|
+
sensitive_column: Name of the sensitive column
|
239
|
+
k: Anonymity parameter (minimum group size)
|
240
|
+
categorical: List of categorical column names
|
241
|
+
schema: Schema for PySpark output (required for PySpark)
|
242
|
+
|
243
|
+
Returns:
|
244
|
+
Anonymized DataFrame with generalized quasi-identifiers
|
245
|
+
"""
|
246
|
+
try:
|
247
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
248
|
+
if isinstance(df, SparkDataFrame):
|
249
|
+
return mondrian_k_anonymity_spark(df, quasi_identifiers, sensitive_column, k, categorical, schema)
|
250
|
+
except ImportError:
|
251
|
+
pass
|
252
|
+
|
253
|
+
if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
|
254
|
+
return mondrian_k_anonymity_polars(df, quasi_identifiers, sensitive_column, k, categorical)
|
255
|
+
|
256
|
+
raise ValueError("Input df must be a polars.DataFrame, polars.LazyFrame, or pyspark.sql.DataFrame")
|
File without changes
|
@@ -0,0 +1,86 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: polarfrost
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: A fast k-anonymity implementation using Polars and PySpark
|
5
|
+
Home-page: https://github.com/rglew/polarfrost
|
6
|
+
Author: Richard Glew
|
7
|
+
Author-email: richard.glew@hotmail.com
|
8
|
+
Keywords: anonymization,privacy,polars,k-anonymity,data-privacy
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
19
|
+
Classifier: Topic :: Security
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Requires-Python: >=3.8
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
Requires-Dist: polars>=0.13.0
|
24
|
+
Requires-Dist: pandas>=1.3.0
|
25
|
+
Requires-Dist: numpy>=1.21.0
|
26
|
+
Provides-Extra: spark
|
27
|
+
Requires-Dist: pyspark>=3.0.0; extra == "spark"
|
28
|
+
Provides-Extra: dev
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
30
|
+
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
31
|
+
Requires-Dist: black>=21.0; extra == "dev"
|
32
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
33
|
+
Requires-Dist: mypy>=0.900; extra == "dev"
|
34
|
+
Dynamic: author
|
35
|
+
Dynamic: author-email
|
36
|
+
Dynamic: classifier
|
37
|
+
Dynamic: description
|
38
|
+
Dynamic: description-content-type
|
39
|
+
Dynamic: home-page
|
40
|
+
Dynamic: keywords
|
41
|
+
Dynamic: provides-extra
|
42
|
+
Dynamic: requires-dist
|
43
|
+
Dynamic: requires-python
|
44
|
+
Dynamic: summary
|
45
|
+
|
46
|
+
# Polarfrost
|
47
|
+
|
48
|
+
A fast k-anonymity implementation using Polars, featuring both Mondrian and Clustering algorithms for efficient privacy-preserving data analysis.
|
49
|
+
|
50
|
+
## Features
|
51
|
+
|
52
|
+
- 🚀 Blazing fast k-anonymity using Polars
|
53
|
+
- 🧊 Supports both local (Polars) and distributed (PySpark) processing
|
54
|
+
- 📊 Preserves data utility while ensuring privacy
|
55
|
+
- 🐍 Simple Python API
|
56
|
+
|
57
|
+
## Installation
|
58
|
+
|
59
|
+
```bash
|
60
|
+
pip install polarfrost
|
61
|
+
```
|
62
|
+
|
63
|
+
## Quick Start
|
64
|
+
|
65
|
+
```python
|
66
|
+
import polars as pl
|
67
|
+
from polarfrost import mondrian_k_anonymity
|
68
|
+
|
69
|
+
# Load your data
|
70
|
+
df = pl.read_csv("your_data.csv")
|
71
|
+
|
72
|
+
# Apply k-anonymity
|
73
|
+
anonymized = mondrian_k_anonymity(
|
74
|
+
df,
|
75
|
+
quasi_identifiers=["age", "gender", "zipcode"],
|
76
|
+
sensitive_column="income",
|
77
|
+
k=3,
|
78
|
+
categorical=["gender", "zipcode"]
|
79
|
+
)
|
80
|
+
|
81
|
+
print(anonymized)
|
82
|
+
```
|
83
|
+
|
84
|
+
## License
|
85
|
+
|
86
|
+
MIT
|
@@ -0,0 +1,16 @@
|
|
1
|
+
README.md
|
2
|
+
setup.py
|
3
|
+
polarfrost/__init__.py
|
4
|
+
polarfrost/clustering.py
|
5
|
+
polarfrost/mondrian.py
|
6
|
+
polarfrost/py.typed
|
7
|
+
polarfrost.egg-info/PKG-INFO
|
8
|
+
polarfrost.egg-info/SOURCES.txt
|
9
|
+
polarfrost.egg-info/dependency_links.txt
|
10
|
+
polarfrost.egg-info/not-zip-safe
|
11
|
+
polarfrost.egg-info/requires.txt
|
12
|
+
polarfrost.egg-info/top_level.txt
|
13
|
+
polarfrost/tests/__init__.py
|
14
|
+
tests/test_imports.py
|
15
|
+
tests/test_mondrian.py
|
16
|
+
tests/test_mondrian_implementation.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
polarfrost
|
@@ -0,0 +1,54 @@
|
|
1
|
+
from setuptools import setup, find_packages
|
2
|
+
import pathlib
|
3
|
+
|
4
|
+
# Read the contents of README.md
|
5
|
+
this_directory = pathlib.Path(__file__).parent
|
6
|
+
long_description = (this_directory / "README.md").read_text()
|
7
|
+
|
8
|
+
setup(
|
9
|
+
name="polarfrost",
|
10
|
+
version="0.1.0",
|
11
|
+
packages=find_packages(exclude=["tests*"]),
|
12
|
+
python_requires=">=3.8",
|
13
|
+
install_requires=[
|
14
|
+
"polars>=0.13.0",
|
15
|
+
"pandas>=1.3.0",
|
16
|
+
"numpy>=1.21.0",
|
17
|
+
],
|
18
|
+
extras_require={
|
19
|
+
"spark": ["pyspark>=3.0.0"],
|
20
|
+
"dev": [
|
21
|
+
"pytest>=6.0",
|
22
|
+
"pytest-cov>=2.0",
|
23
|
+
"black>=21.0",
|
24
|
+
"isort>=5.0",
|
25
|
+
"mypy>=0.900",
|
26
|
+
],
|
27
|
+
},
|
28
|
+
author="Richard Glew",
|
29
|
+
author_email="richard.glew@hotmail.com",
|
30
|
+
description="A fast k-anonymity implementation using Polars and PySpark",
|
31
|
+
long_description=long_description,
|
32
|
+
long_description_content_type="text/markdown",
|
33
|
+
url="https://github.com/rglew/polarfrost",
|
34
|
+
keywords=["anonymization", "privacy", "polars", "k-anonymity", "data-privacy"],
|
35
|
+
classifiers=[
|
36
|
+
"Development Status :: 3 - Alpha",
|
37
|
+
"Intended Audience :: Developers",
|
38
|
+
"Intended Audience :: Science/Research",
|
39
|
+
"License :: OSI Approved :: MIT License",
|
40
|
+
"Programming Language :: Python :: 3",
|
41
|
+
"Programming Language :: Python :: 3.8",
|
42
|
+
"Programming Language :: Python :: 3.9",
|
43
|
+
"Programming Language :: Python :: 3.10",
|
44
|
+
"Programming Language :: Python :: 3.11",
|
45
|
+
"Topic :: Scientific/Engineering",
|
46
|
+
"Topic :: Security",
|
47
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
48
|
+
],
|
49
|
+
package_data={
|
50
|
+
"polarfrost": ["py.typed"],
|
51
|
+
},
|
52
|
+
include_package_data=True,
|
53
|
+
zip_safe=False,
|
54
|
+
)
|
@@ -0,0 +1,9 @@
|
|
1
|
+
"""Test basic imports."""
|
2
|
+
|
3
|
+
def test_import():
|
4
|
+
"""Test that the package can be imported and has the expected attributes."""
|
5
|
+
import polarfrost
|
6
|
+
assert polarfrost.__version__ == "0.1.0"
|
7
|
+
assert hasattr(polarfrost, 'mondrian_k_anonymity')
|
8
|
+
assert hasattr(polarfrost, 'mondrian_k_anonymity_polars')
|
9
|
+
assert hasattr(polarfrost, 'mondrian_k_anonymity_spark')
|
@@ -0,0 +1,28 @@
|
|
1
|
+
import os
|
2
|
+
import polars as pl
|
3
|
+
from polarfrost.mondrian import mondrian_k_anonymity
|
4
|
+
|
5
|
+
def test_mondrian_basic():
|
6
|
+
"""Test basic k-anonymity functionality."""
|
7
|
+
# Sample data
|
8
|
+
data = {
|
9
|
+
"age": [25, 25, 35, 35, 45, 45],
|
10
|
+
"gender": ["M", "M", "F", "F", "M", "M"],
|
11
|
+
"zipcode": ["12345", "12345", "12345", "12345", "67890", "67890"],
|
12
|
+
"income": [50000, 55000, 60000, 65000, 70000, 75000]
|
13
|
+
}
|
14
|
+
df = pl.DataFrame(data)
|
15
|
+
|
16
|
+
# Apply k-anonymity with k=2
|
17
|
+
result = mondrian_k_anonymity(
|
18
|
+
df,
|
19
|
+
quasi_identifiers=["age", "gender", "zipcode"],
|
20
|
+
sensitive_column="income",
|
21
|
+
k=2,
|
22
|
+
categorical=["gender", "zipcode"]
|
23
|
+
)
|
24
|
+
|
25
|
+
# Basic assertions
|
26
|
+
assert len(result) > 0
|
27
|
+
assert "count" in result.columns
|
28
|
+
assert all(result["count"] >= 2)
|
@@ -0,0 +1,88 @@
|
|
1
|
+
"""
|
2
|
+
Test the Mondrian k-anonymity implementation with sample data.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import polars as pl
|
6
|
+
import pytest
|
7
|
+
from polarfrost import mondrian_k_anonymity
|
8
|
+
|
9
|
+
def test_mondrian_basic():
|
10
|
+
"""Test basic Mondrian k-anonymity with a small dataset."""
|
11
|
+
# Create a small test dataset
|
12
|
+
data = {
|
13
|
+
"age": [25, 25, 35, 35, 45, 45, 55, 55],
|
14
|
+
"gender": ["M", "M", "F", "F", "M", "M", "F", "F"],
|
15
|
+
"zipcode": ["12345", "12345", "12345", "12345", "67890", "67890", "67890", "67890"],
|
16
|
+
"income": [50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000]
|
17
|
+
}
|
18
|
+
df = pl.DataFrame(data)
|
19
|
+
|
20
|
+
# Apply k-anonymity with k=2
|
21
|
+
quasi_identifiers = ["age", "gender", "zipcode"]
|
22
|
+
sensitive_column = "income"
|
23
|
+
categorical = ["gender", "zipcode"]
|
24
|
+
k = 2
|
25
|
+
|
26
|
+
anon_df = mondrian_k_anonymity(
|
27
|
+
df,
|
28
|
+
quasi_identifiers=quasi_identifiers,
|
29
|
+
sensitive_column=sensitive_column,
|
30
|
+
k=k,
|
31
|
+
categorical=categorical
|
32
|
+
)
|
33
|
+
|
34
|
+
# Verify the output
|
35
|
+
assert isinstance(anon_df, pl.DataFrame)
|
36
|
+
assert len(anon_df) > 0 # Should have at least one group
|
37
|
+
assert all(count >= k for count in anon_df["count"]) # All groups should satisfy k-anonymity
|
38
|
+
|
39
|
+
# Check that all quasi-identifiers are generalized
|
40
|
+
for col in quasi_identifiers:
|
41
|
+
assert col in anon_df.columns
|
42
|
+
|
43
|
+
# Check that the sensitive column is included
|
44
|
+
assert sensitive_column in anon_df.columns
|
45
|
+
assert "count" in anon_df.columns
|
46
|
+
|
47
|
+
def test_mondrian_with_lazyframe():
|
48
|
+
"""Test that the function works with LazyFrames."""
|
49
|
+
data = {
|
50
|
+
"age": [25, 25, 35, 35, 45, 45],
|
51
|
+
"gender": ["M", "M", "F", "F", "M", "M"],
|
52
|
+
"zipcode": ["12345", "12345", "12345", "12345", "67890", "67890"],
|
53
|
+
"income": [50000, 55000, 60000, 65000, 70000, 75000]
|
54
|
+
}
|
55
|
+
df = pl.LazyFrame(data)
|
56
|
+
|
57
|
+
anon_df = mondrian_k_anonymity(
|
58
|
+
df,
|
59
|
+
quasi_identifiers=["age", "gender", "zipcode"],
|
60
|
+
sensitive_column="income",
|
61
|
+
k=2,
|
62
|
+
categorical=["gender", "zipcode"]
|
63
|
+
)
|
64
|
+
|
65
|
+
assert isinstance(anon_df, pl.DataFrame)
|
66
|
+
assert len(anon_df) > 0
|
67
|
+
|
68
|
+
def test_mondrian_invalid_input():
|
69
|
+
"""Test that invalid inputs raise appropriate errors."""
|
70
|
+
df = pl.DataFrame({"age": [1, 2, 3], "income": [10, 20, 30]})
|
71
|
+
|
72
|
+
# Test with k larger than dataset size - should return a single group
|
73
|
+
result = mondrian_k_anonymity(df, ["age"], "income", k=5)
|
74
|
+
assert len(result) == 1 # Should return a single group with all records
|
75
|
+
assert result["count"][0] == 3 # All records should be in one group
|
76
|
+
|
77
|
+
# Test with invalid column names
|
78
|
+
with pytest.raises(pl.exceptions.ColumnNotFoundError):
|
79
|
+
mondrian_k_anonymity(df, ["invalid"], "income", k=2)
|
80
|
+
|
81
|
+
with pytest.raises(pl.exceptions.ColumnNotFoundError):
|
82
|
+
mondrian_k_anonymity(df, ["age"], "invalid", k=2)
|
83
|
+
|
84
|
+
if __name__ == "__main__":
|
85
|
+
test_mondrian_basic()
|
86
|
+
test_mondrian_with_lazyframe()
|
87
|
+
test_mondrian_invalid_input()
|
88
|
+
print("All tests passed!")
|