polarfrost-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PKG-INFO
@@ -0,0 +1,86 @@
+ Metadata-Version: 2.4
+ Name: polarfrost
+ Version: 0.1.0
+ Summary: A fast k-anonymity implementation using Polars and PySpark
+ Home-page: https://github.com/rglew/polarfrost
+ Author: Richard Glew
+ Author-email: richard.glew@hotmail.com
+ Keywords: anonymization,privacy,polars,k-anonymity,data-privacy
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Security
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: polars>=0.13.0
+ Requires-Dist: pandas>=1.3.0
+ Requires-Dist: numpy>=1.21.0
+ Provides-Extra: spark
+ Requires-Dist: pyspark>=3.0.0; extra == "spark"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=6.0; extra == "dev"
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
+ Requires-Dist: black>=21.0; extra == "dev"
+ Requires-Dist: isort>=5.0; extra == "dev"
+ Requires-Dist: mypy>=0.900; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Polarfrost
+
+ A fast k-anonymity implementation using Polars, featuring both Mondrian and Clustering algorithms for efficient privacy-preserving data analysis.
+
+ ## Features
+
+ - 🚀 Blazing fast k-anonymity using Polars
+ - 🧊 Supports both local (Polars) and distributed (PySpark) processing
+ - 📊 Preserves data utility while ensuring privacy
+ - 🐍 Simple Python API
+
+ ## Installation
+
+ ```bash
+ pip install polarfrost
+ ```
+
+ ## Quick Start
+
+ ```python
+ import polars as pl
+ from polarfrost import mondrian_k_anonymity
+
+ # Load your data
+ df = pl.read_csv("your_data.csv")
+
+ # Apply k-anonymity
+ anonymized = mondrian_k_anonymity(
+     df,
+     quasi_identifiers=["age", "gender", "zipcode"],
+     sensitive_column="income",
+     k=3,
+     categorical=["gender", "zipcode"]
+ )
+
+ print(anonymized)
+ ```
+
+ ## License
+
+ MIT
README.md
@@ -0,0 +1,41 @@
+ # Polarfrost
+
+ A fast k-anonymity implementation using Polars, featuring both Mondrian and Clustering algorithms for efficient privacy-preserving data analysis.
+
+ ## Features
+
+ - 🚀 Blazing fast k-anonymity using Polars
+ - 🧊 Supports both local (Polars) and distributed (PySpark) processing
+ - 📊 Preserves data utility while ensuring privacy
+ - 🐍 Simple Python API
+
+ ## Installation
+
+ ```bash
+ pip install polarfrost
+ ```
+
+ ## Quick Start
+
+ ```python
+ import polars as pl
+ from polarfrost import mondrian_k_anonymity
+
+ # Load your data
+ df = pl.read_csv("your_data.csv")
+
+ # Apply k-anonymity
+ anonymized = mondrian_k_anonymity(
+     df,
+     quasi_identifiers=["age", "gender", "zipcode"],
+     sensitive_column="income",
+     k=3,
+     categorical=["gender", "zipcode"]
+ )
+
+ print(anonymized)
+ ```
+
+ ## License
+
+ MIT
polarfrost/__init__.py
@@ -0,0 +1,26 @@
+ """
+ PolarFrost: Fast k-anonymity implementation using Polars and PySpark.
+
+ This package provides efficient implementations of k-anonymity algorithms,
+ including the Mondrian algorithm, with support for both local (Polars)
+ and distributed (PySpark) processing.
+ """
+
+ __version__ = "0.1.0"
+
+ # Import main functions
+ try:
+     from .mondrian import (
+         mondrian_k_anonymity,
+         mondrian_k_anonymity_polars,
+         mondrian_k_anonymity_spark
+     )
+     __all__ = [
+         'mondrian_k_anonymity',
+         'mondrian_k_anonymity_polars',
+         'mondrian_k_anonymity_spark'
+     ]
+ except ImportError as e:
+     import warnings
+     warnings.warn(f"Could not import mondrian: {e}")
+     __all__ = []
polarfrost/clustering.py
@@ -0,0 +1,29 @@
+ """
+ Clustering-based k-anonymity implementation using Polars.
+ """
+ from typing import List, Optional, Union
+ import polars as pl
+
+ def clustering_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     method: str = "fcbg",
+ ) -> pl.DataFrame:
+     """
+     Perform clustering-based k-anonymity using Polars.
+
+     Args:
+         df: Input DataFrame or LazyFrame
+         quasi_identifiers: List of column names to use for clustering
+         sensitive_column: Column containing sensitive information
+         k: Minimum group size for k-anonymity
+         categorical: List of categorical column names
+         method: Clustering method ('fcbg', 'rsc', or 'random')
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     raise NotImplementedError("Clustering k-anonymity will be implemented soon")
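In 0.1.0 this module is a stub: every call raises `NotImplementedError`. Note also that `clustering_k_anonymity` is not re-exported from the package root, so it must be imported from `polarfrost.clustering`. A minimal sketch of the intended call shape, based only on the signature and docstring above (the data and column names are illustrative):

```python
import polars as pl
from polarfrost.clustering import clustering_k_anonymity

df = pl.DataFrame({
    "age": [25, 32, 47, 51],
    "gender": ["M", "F", "F", "M"],
    "income": [50_000, 61_000, 58_000, 72_000],
})

try:
    anon = clustering_k_anonymity(
        df,
        quasi_identifiers=["age", "gender"],
        sensitive_column="income",
        k=2,
        categorical=["gender"],
        method="fcbg",  # docstring lists 'fcbg', 'rsc', or 'random'
    )
except NotImplementedError:
    pass  # expected in 0.1.0; the algorithm is not yet implemented
```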
polarfrost/mondrian.py
@@ -0,0 +1,256 @@
+ """
+ Efficient Mondrian k-anonymity implementation using Polars and PySpark.
+ Compatible with local (Polars) and Databricks/Spark (PySpark) environments.
+ """
+
+ from typing import List, Optional, Union, Dict, Any, TYPE_CHECKING
+ import polars as pl
+
+ if TYPE_CHECKING:
+     from pyspark.sql import DataFrame as SparkDataFrame
+     from pyspark.sql.types import StructType
+
+ # ------------------------- POLARS VERSION -------------------------
+ def mondrian_k_anonymity_polars(
+     df: "pl.DataFrame | pl.LazyFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+ ) -> pl.DataFrame:
+     """
+     Perform Mondrian k-anonymity using Polars LazyFrame for local processing.
+     Accepts either a DataFrame or a LazyFrame as input.
+     """
+     if categorical is None:
+         categorical = []
+
+     # Convert to LazyFrame if not already
+     if isinstance(df, pl.DataFrame):
+         df = df.lazy()
+     elif not isinstance(df, pl.LazyFrame):
+         raise ValueError("Input must be a Polars DataFrame or LazyFrame")
+
+     # Initialize partitions with the full dataset
+     partitions = [df]
+     result = []
+
+     # Process partitions until none are left
+     while partitions:
+         part = partitions.pop()
+
+         # Get partition size (lazy evaluation)
+         n_rows = part.select(pl.len()).collect().item(0, 0)
+
+         # If the partition is too small to split, add it to the results
+         if n_rows < 2 * k:
+             result.append(part)
+             continue
+
+         # Compute spans for each quasi-identifier
+         spans: Dict[str, Any] = {}
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use the number of unique values as the span
+                 n_unique = part.select(pl.col(col).n_unique()).collect().item()
+                 spans[col] = n_unique
+             else:
+                 # For numerical, use the range as the span
+                 stats = part.select([
+                     pl.col(col).min().alias("min"),
+                     pl.col(col).max().alias("max")
+                 ]).collect()
+                 col_min = stats[0, "min"]
+                 col_max = stats[0, "max"]
+                 spans[col] = col_max - col_min if col_max is not None and col_min is not None else 0
+
+         # Find the attribute with the maximum span
+         split_col = max(spans, key=lambda c: spans[c])
+
+         # If no split is possible, add to results
+         if spans[split_col] == 0:
+             result.append(part)
+             continue
+
+         # Split the partition
+         if split_col in categorical:
+             # For categorical, split the unique values into two halves
+             uniq_vals = part.select(pl.col(split_col).unique()).collect().to_series().to_list()
+             mid = len(uniq_vals) // 2
+             left_vals = uniq_vals[:mid]
+             right_vals = uniq_vals[mid:]
+             left = part.filter(pl.col(split_col).is_in(left_vals))
+             right = part.filter(pl.col(split_col).is_in(right_vals))
+         else:
+             # For numerical, split on the median
+             median = part.select(pl.col(split_col).median()).collect().item()
+             left = part.filter(pl.col(split_col) <= median)
+             right = part.filter(pl.col(split_col) > median)
+
+         # Check whether both partitions satisfy k-anonymity
+         left_n = left.select(pl.len()).collect().item(0, 0)
+         right_n = right.select(pl.len()).collect().item(0, 0)
+
+         if left_n >= k and right_n >= k:
+             # Both partitions are valid; continue splitting
+             partitions.extend([left, right])
+         else:
+             # At least one partition is too small; keep as is
+             result.append(part)
+
+     # Aggregate each partition
+     agg_rows = []
+     for part in result:
+         # Collect only the columns we need
+         part_df = part.select(quasi_identifiers + [sensitive_column]).collect()
+         row = {}
+
+         # Generalize quasi-identifiers
+         for col in quasi_identifiers:
+             if col in categorical:
+                 # For categorical, use the set of unique values
+                 row[col] = ','.join(sorted(map(str, part_df[col].unique())))
+             else:
+                 # For numerical, use the range
+                 row[col] = f"{part_df[col].min()}-{part_df[col].max()}"
+
+         # Add sensitive values and the group count
+         row[sensitive_column] = ','.join(sorted(map(str, part_df[sensitive_column].unique())))
+         row['count'] = part_df.height
+         agg_rows.append(row)
+
+     return pl.DataFrame(agg_rows)
+
+ # ------------------------- PYSPARK VERSION -------------------------
+ def mondrian_k_anonymity_spark(
+     df: "SparkDataFrame",
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> "SparkDataFrame":
+     """
+     Perform Mondrian k-anonymity using PySpark for distributed processing.
+     """
+     import pandas as pd
+
+     if categorical is None:
+         categorical = []
+
+     # applyInPandas cannot infer an output schema, so one must be provided
+     if schema is None:
+         raise ValueError("schema is required for the PySpark version")
+
+     def mondrian_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+         partitions = [pdf]
+         result = []
+
+         while partitions:
+             part = partitions.pop()
+
+             # If the partition is too small to split, add it to the results
+             if len(part) < 2 * k:
+                 result.append(part)
+                 continue
+
+             # Compute spans for each quasi-identifier
+             spans = {}
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     spans[col] = part[col].nunique()
+                 else:
+                     col_min = part[col].min()
+                     col_max = part[col].max()
+                     spans[col] = col_max - col_min if pd.notnull(col_max) and pd.notnull(col_min) else 0
+
+             # Find the attribute with the maximum span
+             split_col = max(spans, key=lambda c: spans[c])
+
+             # If no split is possible, add to results
+             if spans[split_col] == 0:
+                 result.append(part)
+                 continue
+
+             # Split the partition
+             if split_col in categorical:
+                 # For categorical, split the unique values into two halves
+                 uniq_vals = part[split_col].unique()
+                 mid = len(uniq_vals) // 2
+                 left_vals = set(uniq_vals[:mid])
+                 right_vals = set(uniq_vals[mid:])
+                 left = part[part[split_col].isin(left_vals)]
+                 right = part[part[split_col].isin(right_vals)]
+             else:
+                 # For numerical, split on the median
+                 median = part[split_col].median()
+                 left = part[part[split_col] <= median]
+                 right = part[part[split_col] > median]
+
+             # Check whether both partitions satisfy k-anonymity
+             if len(left) >= k and len(right) >= k:
+                 # Both partitions are valid; continue splitting
+                 partitions.extend([left, right])
+             else:
+                 # At least one partition is too small; keep as is
+                 result.append(part)
+
+         # Aggregate the results
+         agg_rows = []
+         for part in result:
+             row = {}
+
+             # Generalize quasi-identifiers
+             for col in quasi_identifiers:
+                 if col in categorical:
+                     # For categorical, use the set of unique values
+                     row[col] = ','.join(sorted(map(str, part[col].unique())))
+                 else:
+                     # For numerical, use the range
+                     row[col] = f"{part[col].min()}-{part[col].max()}"
+
+             # Add sensitive values and the group count
+             row[sensitive_column] = ','.join(sorted(map(str, part[sensitive_column].unique())))
+             row['count'] = len(part)
+             agg_rows.append(row)
+
+         return pd.DataFrame(agg_rows)
+
+     # Treat the whole DataFrame as a single group and apply the partitioner
+     return df.groupBy().applyInPandas(mondrian_partition, schema=schema)
+
+ # ------------------------- DISPATCHER -------------------------
+ def mondrian_k_anonymity(
+     df: Union[pl.DataFrame, pl.LazyFrame, "SparkDataFrame"],
+     quasi_identifiers: List[str],
+     sensitive_column: str,
+     k: int,
+     categorical: Optional[List[str]] = None,
+     schema: Optional["StructType"] = None,
+ ) -> Union[pl.DataFrame, "SparkDataFrame"]:
+     """
+     Dispatcher: use the Polars or PySpark Mondrian implementation depending on the input type.
+
+     Args:
+         df: Input DataFrame (Polars or PySpark)
+         quasi_identifiers: List of column names that are quasi-identifiers
+         sensitive_column: Name of the sensitive column
+         k: Anonymity parameter (minimum group size)
+         categorical: List of categorical column names
+         schema: Schema for the PySpark output (required for PySpark)
+
+     Returns:
+         Anonymized DataFrame with generalized quasi-identifiers
+     """
+     try:
+         from pyspark.sql import DataFrame as SparkDataFrame
+         if isinstance(df, SparkDataFrame):
+             return mondrian_k_anonymity_spark(df, quasi_identifiers, sensitive_column, k, categorical, schema)
+     except ImportError:
+         pass
+
+     if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+         return mondrian_k_anonymity_polars(df, quasi_identifiers, sensitive_column, k, categorical)
+
+     raise ValueError("Input df must be a polars.DataFrame, polars.LazyFrame, or pyspark.sql.DataFrame")
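The Spark path differs from the Polars path in one practical way: `applyInPandas` cannot infer an output schema, so the caller must supply one. Every generalized quasi-identifier and the sensitive column come back as comma-joined strings, plus an integer `count` column. A minimal sketch, assuming a running Spark session and reusing the README's column names:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType
from polarfrost import mondrian_k_anonymity

spark = SparkSession.builder.getOrCreate()
sdf = spark.read.csv("your_data.csv", header=True, inferSchema=True)

# Every generalized column is emitted as a string; 'count' is the group size
schema = StructType([
    StructField("age", StringType()),
    StructField("gender", StringType()),
    StructField("zipcode", StringType()),
    StructField("income", StringType()),
    StructField("count", LongType()),
])

anon = mondrian_k_anonymity(
    sdf,
    quasi_identifiers=["age", "gender", "zipcode"],
    sensitive_column="income",
    k=3,
    categorical=["gender", "zipcode"],
    schema=schema,
)
anon.show()
```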
polarfrost/py.typed
@@ -0,0 +1,2 @@
+ # This file indicates that this package is typed according to PEP 561
+ # It allows type checkers to recognize the package as type-annotated
polarfrost.egg-info/PKG-INFO
@@ -0,0 +1,86 @@
(86 added lines, byte-for-byte identical to the root PKG-INFO shown above)
polarfrost.egg-info/SOURCES.txt
@@ -0,0 +1,16 @@
+ README.md
+ setup.py
+ polarfrost/__init__.py
+ polarfrost/clustering.py
+ polarfrost/mondrian.py
+ polarfrost/py.typed
+ polarfrost.egg-info/PKG-INFO
+ polarfrost.egg-info/SOURCES.txt
+ polarfrost.egg-info/dependency_links.txt
+ polarfrost.egg-info/not-zip-safe
+ polarfrost.egg-info/requires.txt
+ polarfrost.egg-info/top_level.txt
+ polarfrost/tests/__init__.py
+ tests/test_imports.py
+ tests/test_mondrian.py
+ tests/test_mondrian_implementation.py
polarfrost.egg-info/requires.txt
@@ -0,0 +1,13 @@
+ polars>=0.13.0
+ pandas>=1.3.0
+ numpy>=1.21.0
+
+ [dev]
+ pytest>=6.0
+ pytest-cov>=2.0
+ black>=21.0
+ isort>=5.0
+ mypy>=0.900
+
+ [spark]
+ pyspark>=3.0.0
polarfrost.egg-info/top_level.txt
@@ -0,0 +1 @@
+ polarfrost
setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
setup.py
@@ -0,0 +1,54 @@
+ from setuptools import setup, find_packages
+ import pathlib
+
+ # Read the contents of README.md
+ this_directory = pathlib.Path(__file__).parent
+ long_description = (this_directory / "README.md").read_text(encoding="utf-8")
+
+ setup(
+     name="polarfrost",
+     version="0.1.0",
+     packages=find_packages(exclude=["tests*"]),
+     python_requires=">=3.8",
+     install_requires=[
+         "polars>=0.13.0",
+         "pandas>=1.3.0",
+         "numpy>=1.21.0",
+     ],
+     extras_require={
+         "spark": ["pyspark>=3.0.0"],
+         "dev": [
+             "pytest>=6.0",
+             "pytest-cov>=2.0",
+             "black>=21.0",
+             "isort>=5.0",
+             "mypy>=0.900",
+         ],
+     },
+     author="Richard Glew",
+     author_email="richard.glew@hotmail.com",
+     description="A fast k-anonymity implementation using Polars and PySpark",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/rglew/polarfrost",
+     keywords=["anonymization", "privacy", "polars", "k-anonymity", "data-privacy"],
+     classifiers=[
+         "Development Status :: 3 - Alpha",
+         "Intended Audience :: Developers",
+         "Intended Audience :: Science/Research",
+         "License :: OSI Approved :: MIT License",
+         "Programming Language :: Python :: 3",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Programming Language :: Python :: 3.10",
+         "Programming Language :: Python :: 3.11",
+         "Topic :: Scientific/Engineering",
+         "Topic :: Security",
+         "Topic :: Software Development :: Libraries :: Python Modules",
+     ],
+     package_data={
+         "polarfrost": ["py.typed"],
+     },
+     include_package_data=True,
+     zip_safe=False,
+ )
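PySpark is deliberately an optional extra, so the base install stays Polars-only; the `dev` extra pulls in the test and lint toolchain. Per the `extras_require` above, typical installs would look like:

```bash
pip install polarfrost             # core, Polars-only
pip install "polarfrost[spark]"    # adds pyspark>=3.0.0
pip install "polarfrost[dev]"      # pytest, pytest-cov, black, isort, mypy
```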
tests/test_imports.py
@@ -0,0 +1,9 @@
+ """Test basic imports."""
+
+ def test_import():
+     """Test that the package can be imported and has the expected attributes."""
+     import polarfrost
+     assert polarfrost.__version__ == "0.1.0"
+     assert hasattr(polarfrost, 'mondrian_k_anonymity')
+     assert hasattr(polarfrost, 'mondrian_k_anonymity_polars')
+     assert hasattr(polarfrost, 'mondrian_k_anonymity_spark')
tests/test_mondrian.py
@@ -0,0 +1,28 @@
+ import polars as pl
+ from polarfrost.mondrian import mondrian_k_anonymity
+
+ def test_mondrian_basic():
+     """Test basic k-anonymity functionality."""
+     # Sample data
+     data = {
+         "age": [25, 25, 35, 35, 45, 45],
+         "gender": ["M", "M", "F", "F", "M", "M"],
+         "zipcode": ["12345", "12345", "12345", "12345", "67890", "67890"],
+         "income": [50000, 55000, 60000, 65000, 70000, 75000]
+     }
+     df = pl.DataFrame(data)
+
+     # Apply k-anonymity with k=2
+     result = mondrian_k_anonymity(
+         df,
+         quasi_identifiers=["age", "gender", "zipcode"],
+         sensitive_column="income",
+         k=2,
+         categorical=["gender", "zipcode"]
+     )
+
+     # Basic assertions
+     assert len(result) > 0
+     assert "count" in result.columns
+     assert all(result["count"] >= 2)
tests/test_mondrian_implementation.py
@@ -0,0 +1,88 @@
+ """
+ Test the Mondrian k-anonymity implementation with sample data.
+ """
+ import polars as pl
+ import pytest
+ from polarfrost import mondrian_k_anonymity
+
+ def test_mondrian_basic():
+     """Test basic Mondrian k-anonymity with a small dataset."""
+     # Create a small test dataset
+     data = {
+         "age": [25, 25, 35, 35, 45, 45, 55, 55],
+         "gender": ["M", "M", "F", "F", "M", "M", "F", "F"],
+         "zipcode": ["12345", "12345", "12345", "12345", "67890", "67890", "67890", "67890"],
+         "income": [50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000]
+     }
+     df = pl.DataFrame(data)
+
+     # Apply k-anonymity with k=2
+     quasi_identifiers = ["age", "gender", "zipcode"]
+     sensitive_column = "income"
+     categorical = ["gender", "zipcode"]
+     k = 2
+
+     anon_df = mondrian_k_anonymity(
+         df,
+         quasi_identifiers=quasi_identifiers,
+         sensitive_column=sensitive_column,
+         k=k,
+         categorical=categorical
+     )
+
+     # Verify the output
+     assert isinstance(anon_df, pl.DataFrame)
+     assert len(anon_df) > 0  # Should have at least one group
+     assert all(count >= k for count in anon_df["count"])  # Every group should satisfy k-anonymity
+
+     # Check that all quasi-identifiers are generalized
+     for col in quasi_identifiers:
+         assert col in anon_df.columns
+
+     # Check that the sensitive column is included
+     assert sensitive_column in anon_df.columns
+     assert "count" in anon_df.columns
+
+ def test_mondrian_with_lazyframe():
+     """Test that the function works with LazyFrames."""
+     data = {
+         "age": [25, 25, 35, 35, 45, 45],
+         "gender": ["M", "M", "F", "F", "M", "M"],
+         "zipcode": ["12345", "12345", "12345", "12345", "67890", "67890"],
+         "income": [50000, 55000, 60000, 65000, 70000, 75000]
+     }
+     df = pl.LazyFrame(data)
+
+     anon_df = mondrian_k_anonymity(
+         df,
+         quasi_identifiers=["age", "gender", "zipcode"],
+         sensitive_column="income",
+         k=2,
+         categorical=["gender", "zipcode"]
+     )
+
+     assert isinstance(anon_df, pl.DataFrame)
+     assert len(anon_df) > 0
+
+ def test_mondrian_invalid_input():
+     """Test edge cases and invalid inputs."""
+     df = pl.DataFrame({"age": [1, 2, 3], "income": [10, 20, 30]})
+
+     # k larger than the dataset size: everything collapses into a single group
+     result = mondrian_k_anonymity(df, ["age"], "income", k=5)
+     assert len(result) == 1  # Should return a single group with all records
+     assert result["count"][0] == 3  # All records should be in one group
+
+     # Invalid column names should raise
+     with pytest.raises(pl.exceptions.ColumnNotFoundError):
+         mondrian_k_anonymity(df, ["invalid"], "income", k=2)
+
+     with pytest.raises(pl.exceptions.ColumnNotFoundError):
+         mondrian_k_anonymity(df, ["age"], "invalid", k=2)
+
+ if __name__ == "__main__":
+     test_mondrian_basic()
+     test_mondrian_with_lazyframe()
+     test_mondrian_invalid_input()
+     print("All tests passed!")
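The test modules double as plain scripts via the `__main__` block above, but since `pytest` and `pytest-cov` ship in the `dev` extra, the usual invocation would be something like:

```bash
pytest tests/ --cov=polarfrost
```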