fraiseql-confiture 0.3.4__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cp311-win_amd64.pyd +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1656 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +132 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +793 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +0 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +180 -0
- confiture/models/schema.py +203 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +38 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.4.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.4.dist-info/RECORD +119 -0
- fraiseql_confiture-0.3.4.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.4.dist-info/entry_points.txt +2 -0
- fraiseql_confiture-0.3.4.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Date masking anonymization strategy.
|
|
2
|
+
|
|
3
|
+
Provides flexible date anonymization with preservation options:
|
|
4
|
+
- Preserve year only (replace month/day)
|
|
5
|
+
- Preserve month/year (jitter day)
|
|
6
|
+
- Full anonymization (shift the whole date by a seeded jitter of up to ±jitter_days days)
|
|
7
|
+
|
|
8
|
+
Uses seeded randomization for deterministic output and jitter.
|
|
9
|
+
Supports multiple date formats (ISO 8601, US, UK, etc).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import random
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from datetime import date as date_type, datetime, timedelta
|
|
15
|
+
|
|
16
|
+
from confiture.core.anonymization.strategy import AnonymizationStrategy, StrategyConfig
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class DateMaskConfig(StrategyConfig):
    """Settings that control how ``DateMaskingStrategy`` masks a date.

    Only the three fields below are declared here. The strategy also reads
    ``config.seed``, which is not declared in this class (presumably it is
    inherited from ``StrategyConfig`` - confirm against that class).

    Attributes:
        preserve: Which part of the date survives masking:
            ``"none"``  - nothing is pinned; the whole date is jittered.
            ``"year"``  - the year is kept, month and day are re-drawn.
            ``"month"`` - year and month are kept, only the day is re-drawn.
        jitter_days: Largest shift, in days, applied in ``"none"`` mode.
        output_format: ``strftime`` pattern used for the result; ``None``
            means the format detected on the input is reused.

    Example:
        >>> config = DateMaskConfig(seed=12345, preserve="year", jitter_days=30)
    """

    # Preservation mode: "none", "year" or "month".
    preserve: str = "year"
    # Maximum absolute jitter, in days.
    jitter_days: int = 30
    # Explicit strftime pattern, or None to mirror the input's own format.
    output_format: str | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DateMaskingStrategy(AnonymizationStrategy):
    """Anonymization strategy for masking dates.

    The part of the date that survives is selected by ``config.preserve``:

    - ``"none"``:  shift the date by a seeded jitter of up to
      ``config.jitter_days`` days in either direction.
    - ``"year"``:  keep the year, re-draw month (1-12) and day (1-28).
    - ``"month"``: keep year and month, re-draw the day (1-28).

    Behaviour worth knowing:

    - Deterministic: the random generator is seeded from ``config.seed``
      and the input date, so the same seed and date give the same output.
    - Strings are parsed with ``DATE_FORMATS`` and, unless
      ``config.output_format`` is set, re-emitted in the format that matched.
    - ``datetime.date`` / ``datetime.datetime`` objects are masked directly
      and returned as objects of the same type (or as a string when
      ``config.output_format`` is set).
    - ``None``, blank strings and strings that match no known format are
      returned unchanged.

    Example:
        >>> config = DateMaskConfig(seed=12345, preserve="year", jitter_days=30)
        >>> strategy = DateMaskingStrategy(config)
        >>> strategy.anonymize("2020-05-15")
        '2020-03-22'  # illustrative: same year, different month/day
    """

    config_type = DateMaskConfig
    strategy_name = "date"

    # Formats are tried in order and the first match wins, so an ambiguous
    # value such as "03/04/2020" is always read as US (month first).
    DATE_FORMATS = [
        "%Y-%m-%d",  # ISO 8601: 2020-05-15
        "%m/%d/%Y",  # US: 05/15/2020
        "%d/%m/%Y",  # UK: 15/05/2020
        "%Y/%m/%d",  # 2020/05/15
        "%d-%m-%Y",  # 15-05-2020
        "%B %d, %Y",  # Full month name: May 15, 2020
        "%b %d, %Y",  # Abbreviated month name: Jan 15, 2020
        "%Y-%m-%d %H:%M:%S",  # ISO with time
        "%m/%d/%Y %H:%M:%S",  # US with time
        "%d/%m/%Y %H:%M:%S",  # UK with time
    ]

    def anonymize(self, value: str | date_type | None) -> str | date_type | None:
        """Anonymize a date value.

        Args:
            value: Date string in one of ``DATE_FORMATS``, a ``date`` /
                ``datetime`` object, or ``None``.

        Returns:
            ``None`` for ``None``; the input unchanged for blank or
            unparseable strings; otherwise the masked date - a string in the
            input's format (or ``config.output_format``) for string input,
            and an object of the input's type for ``date`` / ``datetime``
            input unless ``config.output_format`` is set.

        Raises:
            TypeError: If ``value`` is neither a string, a date object nor
                ``None``.
            ValueError: If ``config.preserve`` is not a known mode.

        Example:
            >>> strategy.anonymize("2020-05-15")
            '2020-03-22'  # illustrative
        """
        if value is None:
            return None

        if isinstance(value, str):
            if not value.strip():
                return value
            parsed_date, detected_format = self._parse_date(value)
            if parsed_date is None:
                # Could not parse - return as-is
                return value
        elif isinstance(value, date_type):
            # Database drivers commonly hand back date/datetime objects;
            # mask them directly instead of failing on str-only parsing.
            parsed_date, detected_format = value, None
        else:
            raise TypeError(
                f"DateMaskingStrategy expects a date string or date object, "
                f"got {type(value).__name__}"
            )

        # Apply anonymization based on config
        preserve = self.config.preserve
        if preserve == "none":
            anonymized_date = self._anonymize_full(parsed_date)
        elif preserve == "year":
            anonymized_date = self._anonymize_preserve_year(parsed_date)
        elif preserve == "month":
            anonymized_date = self._anonymize_preserve_month(parsed_date)
        else:
            raise ValueError(f"Unknown preserve mode: {self.config.preserve}")

        # Format output: an explicit format wins, then the detected one;
        # object input with no explicit format stays an object.
        output_format = self.config.output_format or detected_format
        if output_format is None:
            return anonymized_date
        return anonymized_date.strftime(output_format)

    def _parse_date(self, value: str) -> tuple[datetime | None, str | None]:
        """Parse a date string using the first matching ``DATE_FORMATS`` entry.

        Args:
            value: Date string to parse; surrounding whitespace is ignored.

        Returns:
            Tuple of (parsed datetime, matching format), or (None, None)
            when no format matches.
        """
        candidate = value.strip()
        for fmt in self.DATE_FORMATS:
            try:
                return datetime.strptime(candidate, fmt), fmt
            except ValueError:
                continue

        # Could not parse
        return None, None

    def _anonymize_full(self, date: date_type) -> date_type:
        """Shift the whole date by a seeded random number of days.

        Args:
            date: Date to anonymize.

        Returns:
            The date moved by between ``-jitter_days`` and ``+jitter_days``
            days (a draw of 0 leaves it unchanged).
        """
        # Seed from config seed + input so the jitter is reproducible.
        rng = random.Random(f"{self.config.seed}:{date.isoformat()}".encode())

        # abs() keeps randint's range non-empty if jitter_days is negative.
        span = abs(self.config.jitter_days)
        shift = timedelta(days=rng.randint(-span, span))

        try:
            return date + shift
        except OverflowError:
            # Sentinel dates near date.min/date.max (e.g. 9999-12-31) cannot
            # move outward; mirror the jitter so the result stays valid.
            return date - shift

    def _anonymize_preserve_year(self, date: date_type) -> date_type:
        """Re-draw month and day while keeping the year.

        Args:
            date: Date to anonymize.

        Returns:
            Date in the same year with a seeded random month and day.
        """
        rng = random.Random(f"{self.config.seed}:{date.isoformat()}:year".encode())

        # Draw order (month, then day) is part of the deterministic output.
        month = rng.randint(1, 12)
        # Days are limited to 1-28 so the result is valid in every month.
        day = rng.randint(1, 28)

        return date.replace(month=month, day=day)

    def _anonymize_preserve_month(self, date: date_type) -> date_type:
        """Re-draw the day while keeping year and month.

        Args:
            date: Date to anonymize.

        Returns:
            Date in the same year and month with a seeded random day.
        """
        rng = random.Random(f"{self.config.seed}:{date.isoformat()}:month".encode())

        # Days are limited to 1-28 so the result is valid in every month.
        return date.replace(day=rng.randint(1, 28))

    def validate(self, value: str | date_type | None) -> bool:
        """Check if strategy can handle this value type.

        Args:
            value: Sample value to validate.

        Returns:
            True if value is a string, a date/datetime object, or None.
        """
        return value is None or isinstance(value, (str, date_type))

    def short_name(self) -> str:
        """Return short strategy name for logging.

        Returns:
            Short name (e.g., "date:preserve_year")
        """
        return f"{self.strategy_name}:preserve_{self.config.preserve}"
|
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
"""Differential privacy anonymization strategy.
|
|
2
|
+
|
|
3
|
+
Provides mathematical privacy guarantee using noise addition. Adds carefully
|
|
4
|
+
calibrated random noise to numerical data to prevent individual re-identification.
|
|
5
|
+
|
|
6
|
+
Features:
|
|
7
|
+
- Mathematical privacy guarantee (epsilon-delta privacy)
|
|
8
|
+
- Noise calibration: Scale noise to data sensitivity
|
|
9
|
+
- Budget tracking: Track privacy budget consumption
|
|
10
|
+
- Configurable mechanisms: Laplace, Gaussian, Exponential
|
|
11
|
+
- Utility-privacy tradeoff: Control accuracy vs privacy
|
|
12
|
+
|
|
13
|
+
Mathematical Background:
|
|
14
|
+
Differential privacy: For any two adjacent datasets D and D',
|
|
15
|
+
P(M(D) ∈ S) ≤ e^ε * P(M(D') ∈ S) + δ
|
|
16
|
+
|
|
17
|
+
Where:
|
|
18
|
+
- M: privacy mechanism (adds noise)
|
|
19
|
+
- ε (epsilon): privacy parameter (lower = more private)
|
|
20
|
+
- δ (delta): failure probability (usually ≈ 1/n)
|
|
21
|
+
- S: set of possible outputs
|
|
22
|
+
|
|
23
|
+
Use Cases:
|
|
24
|
+
- Statistical aggregate queries (average age, sum of purchases)
|
|
25
|
+
- Census data (count distributions)
|
|
26
|
+
- Salary data (ranges, distributions)
|
|
27
|
+
- Location data (geographic aggregates)
|
|
28
|
+
- Sensor data (aggregate statistics)
|
|
29
|
+
|
|
30
|
+
Privacy Levels:
|
|
31
|
+
ε = 10: Weak privacy, little noise, high utility
|
|
32
|
+
ε = 1: Strong privacy, moderate noise impact
|
|
33
|
+
ε = 0.1: Extremely strong privacy, high noise, low utility
|
|
34
|
+
ε = ∞: No privacy (no noise added)
|
|
35
|
+
|
|
36
|
+
Example:
|
|
37
|
+
Age: 35 → 35 + noise ≈ 37.2 (with ε=1, Δf=1)
|
|
38
|
+
Salary: 50000 → 50000 + noise ≈ 50241.5 (with ε=0.5, Δf=1000)
|
|
39
|
+
|
|
40
|
+
Mechanisms:
|
|
41
|
+
- Laplace: Fast, simple, works well for small datasets
|
|
42
|
+
- Gaussian: Better utility for large datasets
|
|
43
|
+
- Exponential: For exponential-family distributions
|
|
44
|
+
|
|
45
|
+
NOT suitable for:
|
|
46
|
+
- Individual records (differential privacy is for aggregates)
|
|
47
|
+
- Categorical data (use hashing instead)
|
|
48
|
+
- Small datasets (noise makes utility poor)
|
|
49
|
+
- Real-time applications (budget tracking needed)
|
|
50
|
+
- High-accuracy requirements (inherent noise trade-off)
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
import random
|
|
54
|
+
from dataclasses import dataclass
|
|
55
|
+
from typing import Any
|
|
56
|
+
|
|
57
|
+
from confiture.core.anonymization.strategy import (
|
|
58
|
+
AnonymizationStrategy,
|
|
59
|
+
StrategyConfig,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class DifferentialPrivacyConfig(StrategyConfig):
    """Tunable parameters for ``DifferentialPrivacyStrategy``.

    Attributes:
        epsilon: Privacy parameter; the noise scale is
            ``sensitivity / epsilon``, so lower values mean more noise.
        delta: Failure probability; read by the gaussian mechanism.
        mechanism: Noise mechanism - ``"laplace"``, ``"gaussian"`` or
            ``"exponential"``.
        data_type: Declared kind of data - ``"numeric"``, ``"categorical"``
            or ``"location"``.
        sensitivity: Largest change a single record can cause.
        budget_total: Privacy budget the strategy starts with.
        budget_per_value: Budget deducted for each anonymized value.
    """

    # Privacy parameter (lower = more private, 0.1-10 typical).
    epsilon: float = 1.0

    # Failure probability (typically 1/dataset_size).
    delta: float = 1e-5

    # Noise mechanism: laplace, gaussian, exponential.
    mechanism: str = "laplace"

    # Type of data: numeric, categorical, location.
    data_type: str = "numeric"

    # Data sensitivity (max change from one record).
    sensitivity: float = 1.0

    # Total privacy budget available.
    budget_total: float = 10.0

    # Budget consumed per anonymization operation.
    budget_per_value: float = 0.1
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DifferentialPrivacyStrategy(AnonymizationStrategy):
    """Differential privacy using noise addition.

    Adds random noise, scaled to ``sensitivity / epsilon``, to numeric
    values and tracks a simple privacy budget. Intended for aggregate or
    statistical data; exact values are not recoverable afterwards.

    How it works:
        1. Convert the value to ``float``.
        2. Compute the noise scale ``sensitivity / epsilon``.
        3. Sample noise from the configured mechanism
           (``laplace``, ``gaussian`` or ``exponential``).
        4. Deduct ``budget_per_value`` from ``budget_remaining``.
        5. Return ``value + noise``.

    Privacy budget:
        ``budget_remaining`` starts at ``config.budget_total``. Once it is
        zero or negative, ``anonymize`` raises ``ValueError`` for non-NULL
        values. The check only requires a positive balance, so the final
        operation may take the balance below zero.

    Interpretation of epsilon:
        - Small epsilon: more noise, stronger privacy.
        - Large epsilon: less noise, weaker privacy.

    NOTE(review): noise is drawn from the module-level ``random``
    generator, which is not cryptographically secure - confirm that is
    acceptable for the threat model.

    Example:
        >>> config = DifferentialPrivacyConfig(
        ...     epsilon=1.0,
        ...     delta=1e-5,
        ...     mechanism='laplace',
        ...     data_type='numeric',
        ...     sensitivity=1.0,
        ...     budget_total=10.0,
        ...     budget_per_value=0.1
        ... )
        >>> strategy = DifferentialPrivacyStrategy(config)
        >>>
        >>> # Anonymize numeric values
        >>> values = [35, 42, 28, 55]  # Ages
        >>> anonymized = [strategy.anonymize(v) for v in values]
        >>> # e.g. [36.2, 40.8, 27.5, 56.1] (noise is random)
        >>>
        >>> # Budget tracking
        >>> print(f"Budget remaining: {strategy.budget_remaining:.1f}")
        >>> # Budget remaining: 9.6
    """

    # Remaining privacy budget; set to ``config.budget_total`` in __init__
    # and reduced by ``config.budget_per_value`` for every noisy value.
    budget_remaining: float = 0.0

    def __init__(self, config: DifferentialPrivacyConfig | None = None):
        """Initialize differential privacy strategy.

        Args:
            config: DifferentialPrivacyConfig instance; defaults are used
                when omitted.

        Raises:
            ValueError: If configuration invalid
        """
        config = config or DifferentialPrivacyConfig()
        super().__init__(config)
        self.config: DifferentialPrivacyConfig = config
        self.budget_remaining = config.budget_total
        self._validate_config()

    def _validate_config(self) -> None:
        """Validate configuration values.

        Raises:
            ValueError: If epsilon or sensitivity is not positive, delta is
                outside [0, 1), mechanism or data_type is unknown, delta is
                0 with the gaussian mechanism, or budget_per_value is
                negative.
        """
        cfg = self.config

        # "not x > 0" (instead of "x <= 0") also rejects NaN.
        if not cfg.epsilon > 0:
            raise ValueError("Epsilon must be positive")

        if not 0 <= cfg.delta < 1:
            raise ValueError("Delta must be in [0, 1)")

        if not cfg.sensitivity > 0:
            raise ValueError("Sensitivity must be positive")

        if cfg.mechanism not in {"laplace", "gaussian", "exponential"}:
            raise ValueError("Mechanism must be laplace, gaussian, or exponential")

        if cfg.data_type not in {"numeric", "categorical", "location"}:
            raise ValueError("Data type must be numeric, categorical, or location")

        # The gaussian variance divides by delta; fail here with a clear
        # message rather than with ZeroDivisionError on the first value.
        if cfg.mechanism == "gaussian" and cfg.delta == 0:
            raise ValueError("Delta must be positive for the gaussian mechanism")

        # A negative (or NaN) charge would make the budget grow or never run out.
        if not cfg.budget_per_value >= 0:
            raise ValueError("Budget per value must be non-negative")

    def anonymize(self, value: Any) -> Any:
        """Add noise to value using differential privacy.

        Args:
            value: Numeric value to anonymize (anything ``float()`` accepts),
                or ``None``.

        Returns:
            Noisy value (float), or ``None`` when ``value`` is ``None``.

        Raises:
            ValueError: If value is not numeric or privacy budget exhausted
        """
        # NULLs pass through first: they consume no budget, so they must
        # not fail once the budget is exhausted.
        if value is None:
            return None

        if self.budget_remaining <= 0:
            raise ValueError("Privacy budget exhausted. Cannot anonymize more values.")

        try:
            numeric_value = float(value)
        except (TypeError, ValueError, OverflowError) as e:
            # OverflowError: ints too large for a float are not usable either.
            raise ValueError(
                f"DifferentialPrivacyStrategy only works with numeric values, "
                f"got {type(value).__name__}: {value}"
            ) from e

        noise = self._sample_noise(self._calculate_noise_scale())

        # Consume budget only after noise was produced successfully.
        self.budget_remaining -= self.config.budget_per_value

        return numeric_value + noise

    def _calculate_noise_scale(self) -> float:
        """Calculate scale for noise distribution.

        Returns:
            ``sensitivity / epsilon`` (0.0 when epsilon is infinite).
        """
        return self.config.sensitivity / self.config.epsilon

    @staticmethod
    def _signed_exponential(mean: float) -> float:
        """Return an exponential variate with the given mean and a random sign.

        A zero mean (reached when epsilon is infinite) yields zero noise
        instead of a division by zero.
        """
        if mean == 0:
            return 0.0
        # expovariate never evaluates log(0), unlike log(random.random()).
        magnitude = random.expovariate(1 / mean)
        return -magnitude if random.random() > 0.5 else magnitude

    def _sample_noise(self, scale: float) -> float:
        """Sample noise from chosen distribution.

        Args:
            scale: Scale parameter for distribution

        Returns:
            Sampled noise value (0.0 for an unrecognised mechanism).
        """
        mechanism = self.config.mechanism

        if mechanism == "laplace":
            # Laplace(0, scale): exponential magnitude with a random sign.
            return self._signed_exponential(scale)

        if mechanism == "gaussian":
            import math

            # NOTE(review): calibration kept as-is (variance = 2*scale^2/delta).
            # The classical Gaussian mechanism uses
            # sigma = sensitivity * sqrt(2*ln(1.25/delta)) / epsilon - confirm
            # which one is intended before changing the noise magnitude.
            variance = 2 * (scale**2) / self.config.delta
            return random.gauss(0, math.sqrt(variance))

        if mechanism == "exponential":
            # NOTE(review): this draws symmetric exponential noise with mean
            # 2*scale/epsilon (epsilon is applied twice, since scale already
            # equals sensitivity/epsilon) - confirm this is intended.
            return self._signed_exponential(2 * scale / self.config.epsilon)

        return 0.0

    def validate(self, value: Any) -> bool:
        """Differential privacy only works with numeric values.

        Args:
            value: Value to validate

        Returns:
            True if ``float(value)`` succeeds, False otherwise.
        """
        try:
            float(value)
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    def validate_comprehensive(
        self,
        value: Any,
        column_name: str = "",
        table_name: str = "",
    ) -> tuple[bool, list[str]]:
        """Comprehensive validation for differential privacy.

        Reports a non-numeric or NaN value, an exhausted budget and an
        epsilon above 10.

        Args:
            value: Value to validate
            column_name: Column name (for error context)
            table_name: Table name (for error context)

        Returns:
            Tuple of (is_valid: bool, errors: list[str])
        """
        import math

        errors: list[str] = []
        location = f"Column {table_name}.{column_name}: "

        # Check numeric
        try:
            numeric_value = float(value)
        except (TypeError, ValueError, OverflowError):
            errors.append(
                f"{location}"
                f"DifferentialPrivacyStrategy requires numeric values, "
                f"got {type(value).__name__}"
            )
        else:
            if math.isnan(numeric_value):
                errors.append(f"{location}NaN value cannot be anonymized")

        # Check budget
        if self.budget_remaining <= 0:
            errors.append(
                f"{location}"
                f"Privacy budget exhausted (remaining: {self.budget_remaining:.1f})"
            )

        # Check epsilon validity
        if self.config.epsilon > 10:
            errors.append(
                f"{location}"
                f"Epsilon {self.config.epsilon} is high (privacy may be weak)"
            )

        return len(errors) == 0, errors

    @property
    def is_reversible(self) -> bool:
        """Differential privacy is irreversible.

        Returns:
            False (noise is irreversible)
        """
        return False

    def get_budget_status(self) -> dict[str, float]:
        """Get privacy budget status.

        Returns:
            Dict with ``total``, ``remaining``, ``consumed`` and
            ``percentage`` (share of the total consumed, 0-100 scale).
        """
        total = self.config.budget_total
        consumed = total - self.budget_remaining
        # A zero budget is exhausted from the start; report it as fully
        # used instead of dividing by zero.
        percentage = 100 * consumed / total if total else 100.0
        return {
            "total": total,
            "remaining": self.budget_remaining,
            "consumed": consumed,
            "percentage": percentage,
        }
|