polarfrost 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polarfrost/__init__.py +63 -6
- polarfrost/clustering.py +57 -5
- polarfrost/mondrian.py +309 -69
- polarfrost-0.2.0.dist-info/METADATA +579 -0
- polarfrost-0.2.0.dist-info/RECORD +9 -0
- polarfrost-0.1.0.dist-info/METADATA +0 -86
- polarfrost-0.1.0.dist-info/RECORD +0 -9
- {polarfrost-0.1.0.dist-info → polarfrost-0.2.0.dist-info}/WHEEL +0 -0
- {polarfrost-0.1.0.dist-info → polarfrost-0.2.0.dist-info}/top_level.txt +0 -0
polarfrost/__init__.py
CHANGED
@@ -6,21 +6,78 @@ including the Mondrian algorithm, with support for both local (Polars)
|
|
6
6
|
and distributed (PySpark) processing.
|
7
7
|
"""
|
8
8
|
|
9
|
-
__version__ = "0.1.0"
|
9
|
+
__version__ = "0.2.0"
|
10
10
|
|
11
11
|
# Import main functions
|
12
12
|
try:
    from .mondrian import (
        mondrian_k_anonymity,
        mondrian_k_anonymity_alt,
        mondrian_k_anonymity_polars,
        mondrian_k_anonymity_spark,
    )
    from .clustering import clustering_k_anonymity

    __all__ = [
        "mondrian_k_anonymity",
        "mondrian_k_anonymity_alt",
        "mondrian_k_anonymity_polars",
        "mondrian_k_anonymity_spark",
        "clustering_k_anonymity",
    ]
except ImportError as e:
    import warnings

    warnings.warn(f"Could not import all dependencies: {e}")

    # Fallback stubs: keep the public names importable so the failure is
    # reported when a function is *called*, not when the package is imported.
    #
    # This branch runs precisely because an import failed, so it must only
    # rely on the standard library.  Importing polars / pyspark /
    # typing_extensions here would raise a second ImportError from inside the
    # handler and make ``import polarfrost`` fail outright.  The DataFrame and
    # schema types are therefore annotated as ``Any``.
    from typing import Any, List, Optional

    def mondrian_k_anonymity(
        df: Any,
        quasi_identifiers: List[str],
        sensitive_column: str,
        k: int,
        categorical: Optional[List[str]] = None,
        schema: Optional[Any] = None,
    ) -> Any:
        """Stub used when dependencies are missing; always raises ImportError."""
        raise ImportError("Mondrian k-anonymity is not available due to missing dependencies")

    def mondrian_k_anonymity_alt(*args: Any, **kwargs: Any) -> Any:
        """Stub used when dependencies are missing; always raises ImportError.

        The real signature lives in ``.mondrian`` and is not mirrored here, so
        any arguments are accepted before raising.
        """
        raise ImportError("Mondrian k-anonymity (alt) is not available due to missing dependencies")

    def mondrian_k_anonymity_polars(
        df: Any,
        quasi_identifiers: List[str],
        sensitive_column: str,
        k: int,
        categorical: Optional[List[str]] = None,
    ) -> Any:
        """Stub used when dependencies are missing; always raises ImportError."""
        raise ImportError("Mondrian k-anonymity (Polars) is not available due to missing dependencies")

    def mondrian_k_anonymity_spark(
        df: Any,
        quasi_identifiers: List[str],
        sensitive_column: str,
        k: int,
        categorical: Optional[List[str]] = None,
        schema: Optional[Any] = None,
    ) -> Any:
        """Stub used when dependencies are missing; always raises ImportError."""
        raise ImportError("Mondrian k-anonymity (PySpark) is not available due to missing dependencies")

    def clustering_k_anonymity(
        df: Any,
        quasi_identifiers: List[str],
        sensitive_column: str,
        k: int,
        categorical: Optional[List[str]] = None,
        # NOTE(review): the implementation in clustering.py documents 'fcbg' as
        # its default and only accepts 'fcbg', 'rsc' or 'random'; "kmeans" is
        # kept here unchanged because this stub never reads it -- confirm and
        # align the two signatures.
        method: str = "kmeans",
    ) -> Any:
        """Stub used when dependencies are missing; always raises ImportError."""
        raise ImportError("Clustering k-anonymity is not available due to missing dependencies")

    # Nothing is star-exported in the degraded state (unchanged behavior).
    __all__ = []
|
polarfrost/clustering.py
CHANGED
@@ -1,9 +1,19 @@
|
|
1
1
|
"""
|
2
2
|
Clustering-based k-anonymity implementation using Polars.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
|
+
from typing import List, Optional, Union, cast, TypeVar, Any
|
5
6
|
import polars as pl
|
6
7
|
|
8
|
+
# Define type variable for Polars expressions
|
9
|
+
PolarsExpr = TypeVar('PolarsExpr', bound='pl.Expr')
|
10
|
+
|
11
|
+
|
12
|
+
class ClusteringError(ValueError):
    """Error type for failures specific to clustering-based anonymization.

    It derives from ``ValueError``, so handlers written for ``ValueError``
    also catch it.
    """
|
15
|
+
|
16
|
+
|
7
17
|
def clustering_k_anonymity(
|
8
18
|
df: Union[pl.DataFrame, pl.LazyFrame],
|
9
19
|
quasi_identifiers: List[str],
|
@@ -14,16 +24,58 @@ def clustering_k_anonymity(
|
|
14
24
|
) -> pl.DataFrame:
|
15
25
|
"""
|
16
26
|
Perform clustering-based k-anonymity using Polars.
|
17
|
-
|
27
|
+
|
18
28
|
Args:
|
19
29
|
df: Input DataFrame or LazyFrame
|
20
30
|
quasi_identifiers: List of column names to use for clustering
|
21
31
|
sensitive_column: Column containing sensitive information
|
22
32
|
k: Minimum group size for k-anonymity
|
23
|
-
categorical: List of categorical column names
|
24
|
-
method: Clustering method ('fcbg', 'rsc', or 'random')
|
25
|
-
|
33
|
+
categorical: List of categorical column names (default: None)
|
34
|
+
method: Clustering method ('fcbg', 'rsc', or 'random') (default: 'fcbg')
|
35
|
+
|
26
36
|
Returns:
|
27
37
|
Anonymized DataFrame with generalized quasi-identifiers
|
38
|
+
|
39
|
+
Raises:
|
40
|
+
ValueError: If input validation fails
|
41
|
+
NotImplementedError: If the method is not implemented yet
|
28
42
|
"""
|
43
|
+
# Input validation
|
44
|
+
if not isinstance(df, (pl.DataFrame, pl.LazyFrame)):
|
45
|
+
raise ValueError("Input must be a Polars DataFrame or LazyFrame")
|
46
|
+
|
47
|
+
# Validate quasi_identifiers
|
48
|
+
if not isinstance(quasi_identifiers, list) or not quasi_identifiers:
|
49
|
+
raise ValueError("quasi_identifiers must be a non-empty list")
|
50
|
+
|
51
|
+
# Convert to LazyFrame if not already
|
52
|
+
is_lazy = isinstance(df, pl.LazyFrame)
|
53
|
+
if not is_lazy:
|
54
|
+
df = df.lazy()
|
55
|
+
|
56
|
+
# Check for empty DataFrame
|
57
|
+
df_len = df.select(pl.len()).collect().item(0, 0) if not is_lazy else df.select(pl.len()).collect().item(0, 0)
|
58
|
+
if df_len == 0:
|
59
|
+
raise ValueError("Input DataFrame cannot be empty")
|
60
|
+
|
61
|
+
# Validate k is a positive integer
|
62
|
+
if not isinstance(k, (int, str)) or (isinstance(k, str) and not str(k).isdigit()) or int(k) < 1:
|
63
|
+
raise ValueError("k must be a positive integer")
|
64
|
+
k = int(k) # Convert to int if it's a string of digits
|
65
|
+
|
66
|
+
# Get all columns that will be used
|
67
|
+
all_columns = set(quasi_identifiers + [sensitive_column])
|
68
|
+
if categorical is not None:
|
69
|
+
all_columns.update(categorical)
|
70
|
+
|
71
|
+
# Check if columns exist
|
72
|
+
missing_columns = [col for col in all_columns if col not in df.columns]
|
73
|
+
if missing_columns:
|
74
|
+
raise ValueError(f"Columns not found in DataFrame: {missing_columns}")
|
75
|
+
|
76
|
+
# Validate method
|
77
|
+
if method not in ["fcbg", "rsc", "random"]:
|
78
|
+
raise ValueError(f"Unsupported clustering method: {method}")
|
79
|
+
|
80
|
+
# For now, just raise NotImplementedError
|
29
81
|
raise NotImplementedError("Clustering k-anonymity will be implemented soon")
|