polarfrost 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polarfrost/__init__.py CHANGED
@@ -6,21 +6,78 @@ including the Mondrian algorithm, with support for both local (Polars)
6
6
  and distributed (PySpark) processing.
7
7
  """
8
8
 
9
- __version__ = "0.1.0"
9
+ __version__ = "0.2.0"
10
10
 
11
11
  # Import main functions
12
12
  try:
13
13
  from .mondrian import (
14
14
  mondrian_k_anonymity,
15
+ mondrian_k_anonymity_alt,
15
16
  mondrian_k_anonymity_polars,
16
- mondrian_k_anonymity_spark
17
+ mondrian_k_anonymity_spark,
17
18
  )
19
+ from .clustering import clustering_k_anonymity
20
+
18
21
  __all__ = [
19
- 'mondrian_k_anonymity',
20
- 'mondrian_k_anonymity_polars',
21
- 'mondrian_k_anonymity_spark'
22
+ "mondrian_k_anonymity",
23
+ "mondrian_k_anonymity_alt",
24
+ "mondrian_k_anonymity_polars",
25
+ "mondrian_k_anonymity_spark",
26
+ "clustering_k_anonymity",
22
27
  ]
23
28
  except ImportError as e:
24
29
  import warnings
25
- warnings.warn(f"Could not import mondrian: {e}")
30
+
31
+ warnings.warn(f"Could not import all dependencies: {e}")
32
+
33
+ # Define dummy functions for type checking
34
+ from typing import Any, List, Optional, Union
35
+ from typing_extensions import Literal
36
+ from pyspark.sql.types import StructType
37
+ from pyspark.sql import DataFrame as SparkDataFrame
38
+ from polars import DataFrame as PolarsDataFrame, LazyFrame
39
+
40
+ def mondrian_k_anonymity(
41
+ df: Union[PolarsDataFrame, LazyFrame, SparkDataFrame],
42
+ quasi_identifiers: List[str],
43
+ sensitive_column: str,
44
+ k: int,
45
+ categorical: Optional[List[str]] = None,
46
+ schema: Optional[StructType] = None
47
+ ) -> Union[PolarsDataFrame, SparkDataFrame]:
48
+ """Dummy function for type checking when dependencies are missing."""
49
+ raise ImportError("Mondrian k-anonymity is not available due to missing dependencies")
50
+
51
+ def mondrian_k_anonymity_polars(
52
+ df: Union[PolarsDataFrame, LazyFrame],
53
+ quasi_identifiers: List[str],
54
+ sensitive_column: str,
55
+ k: int,
56
+ categorical: Optional[List[str]] = None
57
+ ) -> PolarsDataFrame:
58
+ """Dummy function for type checking when dependencies are missing."""
59
+ raise ImportError("Mondrian k-anonymity (Polars) is not available due to missing dependencies")
60
+
61
+ def mondrian_k_anonymity_spark(
62
+ df: SparkDataFrame,
63
+ quasi_identifiers: List[str],
64
+ sensitive_column: str,
65
+ k: int,
66
+ categorical: Optional[List[str]] = None,
67
+ schema: Optional[StructType] = None
68
+ ) -> SparkDataFrame:
69
+ """Dummy function for type checking when dependencies are missing."""
70
+ raise ImportError("Mondrian k-anonymity (PySpark) is not available due to missing dependencies")
71
+
72
+ def clustering_k_anonymity(
73
+ df: Union[PolarsDataFrame, LazyFrame],
74
+ quasi_identifiers: List[str],
75
+ sensitive_column: str,
76
+ k: int,
77
+ categorical: Optional[List[str]] = None,
78
+ method: str = "kmeans"
79
+ ) -> PolarsDataFrame:
80
+ """Dummy function for type checking when dependencies are missing."""
81
+ raise ImportError("Clustering k-anonymity is not available due to missing dependencies")
82
+
26
83
  __all__ = []
polarfrost/clustering.py CHANGED
@@ -1,9 +1,19 @@
1
1
  """
2
2
  Clustering-based k-anonymity implementation using Polars.
3
3
  """
4
- from typing import List, Optional, Union
4
+
5
+ from typing import List, Optional, Union, cast, TypeVar, Any
5
6
  import polars as pl
6
7
 
8
+ # Define type variable for Polars expressions
9
+ PolarsExpr = TypeVar('PolarsExpr', bound='pl.Expr')
10
+
11
+
12
+ class ClusteringError(ValueError):
13
+ """Custom exception for clustering-related errors."""
14
+ pass
15
+
16
+
7
17
  def clustering_k_anonymity(
8
18
  df: Union[pl.DataFrame, pl.LazyFrame],
9
19
  quasi_identifiers: List[str],
@@ -14,16 +24,58 @@ def clustering_k_anonymity(
14
24
  ) -> pl.DataFrame:
15
25
  """
16
26
  Perform clustering-based k-anonymity using Polars.
17
-
27
+
18
28
  Args:
19
29
  df: Input DataFrame or LazyFrame
20
30
  quasi_identifiers: List of column names to use for clustering
21
31
  sensitive_column: Column containing sensitive information
22
32
  k: Minimum group size for k-anonymity
23
- categorical: List of categorical column names
24
- method: Clustering method ('fcbg', 'rsc', or 'random')
25
-
33
+ categorical: List of categorical column names (default: None)
34
+ method: Clustering method ('fcbg', 'rsc', or 'random') (default: 'fcbg')
35
+
26
36
  Returns:
27
37
  Anonymized DataFrame with generalized quasi-identifiers
38
+
39
+ Raises:
40
+ ValueError: If input validation fails
41
+ NotImplementedError: If the method is not implemented yet
28
42
  """
43
+ # Input validation
44
+ if not isinstance(df, (pl.DataFrame, pl.LazyFrame)):
45
+ raise ValueError("Input must be a Polars DataFrame or LazyFrame")
46
+
47
+ # Validate quasi_identifiers
48
+ if not isinstance(quasi_identifiers, list) or not quasi_identifiers:
49
+ raise ValueError("quasi_identifiers must be a non-empty list")
50
+
51
+ # Convert to LazyFrame if not already
52
+ is_lazy = isinstance(df, pl.LazyFrame)
53
+ if not is_lazy:
54
+ df = df.lazy()
55
+
56
+ # Check for empty DataFrame
57
+ df_len = df.select(pl.len()).collect().item(0, 0)
58
+ if df_len == 0:
59
+ raise ValueError("Input DataFrame cannot be empty")
60
+
61
+ # Validate k is a positive integer
62
+ if not isinstance(k, (int, str)) or (isinstance(k, str) and not k.isdigit()) or int(k) < 1:
63
+ raise ValueError("k must be a positive integer")
64
+ k = int(k) # Convert to int if it's a string of digits
65
+
66
+ # Get all columns that will be used
67
+ all_columns = set(quasi_identifiers + [sensitive_column])
68
+ if categorical is not None:
69
+ all_columns.update(categorical)
70
+
71
+ # Check if columns exist
72
+ missing_columns = [col for col in all_columns if col not in df.columns]
73
+ if missing_columns:
74
+ raise ValueError(f"Columns not found in DataFrame: {missing_columns}")
75
+
76
+ # Validate method
77
+ if method not in ["fcbg", "rsc", "random"]:
78
+ raise ValueError(f"Unsupported clustering method: {method}")
79
+
80
+ # For now, just raise NotImplementedError
29
81
  raise NotImplementedError("Clustering k-anonymity will be implemented soon")