fraiseql-confiture 0.3.4__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cp311-win_amd64.pyd +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1656 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +132 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +793 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +0 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +180 -0
- confiture/models/schema.py +203 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +38 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.4.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.4.dist-info/RECORD +119 -0
- fraiseql_confiture-0.3.4.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.4.dist-info/entry_points.txt +2 -0
- fraiseql_confiture-0.3.4.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Healthcare PHI (Protected Health Information) anonymization scenario.
|
|
2
|
+
|
|
3
|
+
Real-world use case: Compliant anonymization for research across multiple regions.
|
|
4
|
+
|
|
5
|
+
Supports multiple data protection regulations:
|
|
6
|
+
- HIPAA (USA) - Safe Harbor rules for de-identification
|
|
7
|
+
- GDPR (EU/EEA) - General Data Protection Regulation
|
|
8
|
+
- PIPEDA (Canada) - Personal Information Protection and Electronic Documents Act
|
|
9
|
+
- LGPD (Brazil) - Lei Geral de Proteção de Dados
|
|
10
|
+
- PIPL (China) - Personal Information Protection Law
|
|
11
|
+
- Privacy Act (Australia) - Privacy Act 1988
|
|
12
|
+
- POPIA (South Africa) - Protection of Personal Information Act
|
|
13
|
+
|
|
14
|
+
Data Types (PHI - Protected Health Information):
|
|
15
|
+
- Patient names (PII)
|
|
16
|
+
- Social security numbers / Tax IDs (SSN)
|
|
17
|
+
- Dates of birth (sensitive)
|
|
18
|
+
- Medical record numbers (identifiers)
|
|
19
|
+
- Diagnosis codes (sensitive)
|
|
20
|
+
- Medication information (sensitive)
|
|
21
|
+
- Provider names (PII)
|
|
22
|
+
- Facility names (may be identifying)
|
|
23
|
+
- Visit dates (sensitive)
|
|
24
|
+
- Vital signs (may need masking)
|
|
25
|
+
- Test results (sensitive)
|
|
26
|
+
|
|
27
|
+
Strategy:
|
|
28
|
+
- Names: Complete masking
|
|
29
|
+
- SSN/Tax IDs: Pattern redaction
|
|
30
|
+
- Birth dates: Year masking
|
|
31
|
+
- Medical record numbers: Text redaction
|
|
32
|
+
- Diagnoses: Preserve ICD codes
|
|
33
|
+
- Medications: Preserve as-is
|
|
34
|
+
- Dates: Preserve year only
|
|
35
|
+
- IP addresses: Complete masking
|
|
36
|
+
- Facilities: Preserve facility ID but mask name
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from confiture.core.anonymization.factory import StrategyFactory, StrategyProfile
|
|
40
|
+
from confiture.scenarios.compliance import (
|
|
41
|
+
ComplianceVerifier,
|
|
42
|
+
RegulationType,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class HealthcareScenario:
|
|
47
|
+
"""Healthcare PHI anonymization scenario supporting multiple regulations.
|
|
48
|
+
|
|
49
|
+
Demonstrates anonymization for research across different regions with
|
|
50
|
+
compliance verification for various data protection regulations.
|
|
51
|
+
|
|
52
|
+
Example (Default - GDPR):
|
|
53
|
+
>>> scenario = HealthcareScenario()
|
|
54
|
+
>>> data = {
|
|
55
|
+
... "patient_id": "PAT-00123",
|
|
56
|
+
... "patient_name": "John Smith",
|
|
57
|
+
... "ssn": "123-45-6789",
|
|
58
|
+
... "date_of_birth": "1965-03-12",
|
|
59
|
+
... "medical_record_number": "MRN-999888",
|
|
60
|
+
... "diagnosis": "E11", # Type 2 diabetes
|
|
61
|
+
... "medication": "Metformin 500mg",
|
|
62
|
+
... "visit_date": "2024-12-15",
|
|
63
|
+
... "provider_name": "Dr. Sarah Johnson",
|
|
64
|
+
... "facility_name": "St. Mary's Hospital",
|
|
65
|
+
... }
|
|
66
|
+
>>> anonymized = scenario.anonymize(data)
|
|
67
|
+
>>> # PHI masked, clinical data preserved
|
|
68
|
+
|
|
69
|
+
Example (GDPR):
|
|
70
|
+
>>> anonymized = scenario.anonymize(data, regulation=RegulationType.GDPR)
|
|
71
|
+
>>> compliant = scenario.verify_compliance(data, anonymized, RegulationType.GDPR)
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
# Default seed for reproducibility
|
|
75
|
+
DEFAULT_SEED = 42
|
|
76
|
+
|
|
77
|
+
@staticmethod
|
|
78
|
+
def get_profile(regulation: RegulationType = RegulationType.GDPR) -> StrategyProfile:
    """Build the healthcare PHI anonymization profile for a regulation.

    The column-to-strategy mapping is identical for every regulation; the
    regulation only determines the profile name (``healthcare_<value>``).

    Args:
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        StrategyProfile seeded with ``HealthcareScenario.DEFAULT_SEED``.
        Columns that are not listed fall back to ``"preserve"``.

    Strategy mapping:
        - study identifiers (patient_id, study_id, record_id): preserve
        - patient/provider/facility names: name
        - SSN, medical record numbers, email, phone: text_redaction
        - address: address
        - birth, admission, visit and other dates: date
        - clinical data, vital signs, lab results: preserve
        - facility ids/codes, location, encounter metadata: preserve
        - ip_address: ip_address
    """
    # Ordered (strategy, columns) groups. They are expanded in this exact
    # order so the resulting dict has a stable, readable column order.
    strategy_groups = (
        # Study/research identifiers
        ("preserve", ("patient_id", "study_id", "record_id")),
        # PII names
        (
            "name",
            (
                "patient_name",
                "first_name",
                "last_name",
                "provider_name",
                "provider_first",
                "provider_last",
            ),
        ),
        # Direct identifiers and contact details
        (
            "text_redaction",
            (
                "ssn",
                "social_security_number",
                "medical_record_number",
                "mrn",
                "email",
                "phone",
                "phone_number",
            ),
        ),
        ("address", ("address",)),
        # Sensitive dates
        (
            "date",
            (
                "date_of_birth",
                "birth_date",
                "dob",
                "admission_date",
                "discharge_date",
                "visit_date",
                "appointment_date",
                "procedure_date",
                "test_date",
            ),
        ),
        # Clinical data, vital signs and lab results
        (
            "preserve",
            (
                "diagnosis",
                "diagnosis_code",
                "icd_code",
                "procedure",
                "procedure_code",
                "medication",
                "drug_name",
                "dosage",
                "route",
                "frequency",
                "temperature",
                "heart_rate",
                "blood_pressure",
                "respiratory_rate",
                "oxygen_saturation",
                "weight",
                "height",
                "bmi",
                "test_name",
                "test_value",
                "test_result",
                "lab_result",
                "reference_range",
            ),
        ),
        # Facility: keep the id, mask the name
        ("preserve", ("facility_id",)),
        ("name", ("facility_name",)),
        # Facility codes, location (kept as-is) and encounter metadata
        (
            "preserve",
            (
                "facility_code",
                "department",
                "ward",
                "city",
                "state",
                "country",
                "encounter_type",
                "admission_type",
                "discharge_disposition",
                "status",
            ),
        ),
        # Technical fields
        ("ip_address", ("ip_address",)),
        ("preserve", ("device_id",)),
    )

    column_strategies = {}
    for strategy_name, column_names in strategy_groups:
        for column_name in column_names:
            column_strategies[column_name] = strategy_name

    return StrategyProfile(
        name="healthcare_{}".format(regulation.value),
        seed=HealthcareScenario.DEFAULT_SEED,  # fixed seed for reproducibility
        columns=column_strategies,
        defaults="preserve",
    )
|
|
184
|
+
|
|
185
|
+
@classmethod
|
|
186
|
+
def create_factory(cls, regulation: RegulationType = RegulationType.GDPR) -> StrategyFactory:
    """Build a StrategyFactory from the healthcare profile.

    Args:
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        StrategyFactory wrapping the profile from ``get_profile(regulation)``.
    """
    return StrategyFactory(cls.get_profile(regulation))
|
|
197
|
+
|
|
198
|
+
@classmethod
|
|
199
|
+
def anonymize(cls, data: dict, regulation: RegulationType = RegulationType.GDPR) -> dict:
    """Anonymize a single healthcare record.

    A factory is created for the requested regulation and the record is
    passed through its ``anonymize`` method.

    Args:
        data: Patient/encounter data dictionary.
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        The record as returned by the factory's ``anonymize``.

    Example:
        >>> data = {
        ...     "patient_id": "PAT-00123",
        ...     "patient_name": "John Smith",
        ...     "ssn": "123-45-6789",
        ...     "diagnosis": "E11",
        ... }
        >>> result = HealthcareScenario.anonymize(data)
        >>> # patient_id and diagnosis are mapped to "preserve";
        >>> # patient_name uses "name" and ssn uses "text_redaction".
    """
    return cls.create_factory(regulation).anonymize(data)
|
|
232
|
+
|
|
233
|
+
@classmethod
|
|
234
|
+
def anonymize_batch(
    cls, data_list: list[dict], regulation: RegulationType = RegulationType.GDPR
) -> list[dict]:
    """Anonymize several healthcare records with one shared factory.

    Args:
        data_list: List of patient/encounter records.
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        List of anonymized records, in the same order as ``data_list``.
    """
    # One factory is built up front and reused for every record.
    shared_factory = cls.create_factory(regulation)
    anonymized_records = []
    for entry in data_list:
        anonymized_records.append(shared_factory.anonymize(entry))
    return anonymized_records
|
|
248
|
+
|
|
249
|
+
@classmethod
|
|
250
|
+
def get_strategy_info(cls) -> dict:
    """Describe which strategy the default profile assigns to each column.

    Returns:
        The result of ``list_column_strategies()`` on a factory built from
        the default (GDPR) profile.
    """
    return StrategyFactory(cls.get_profile()).list_column_strategies()
|
|
259
|
+
|
|
260
|
+
@classmethod
|
|
261
|
+
def verify_compliance(
    cls, original: dict, anonymized: dict, regulation: RegulationType = RegulationType.GDPR
) -> dict:
    """Check an anonymized record against a regulation.

    Delegates to ``ComplianceVerifier(regulation).verify_anonymization``.

    Args:
        original: Original data before anonymization.
        anonymized: Anonymized data.
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        The verifier's result dictionary (compliance status, masked and
        preserved fields, and any issues found).

    Example:
        >>> data = {"patient_id": "PAT-123", "patient_name": "John Smith"}
        >>> anon = HealthcareScenario.anonymize(data, RegulationType.GDPR)
        >>> result = HealthcareScenario.verify_compliance(data, anon, RegulationType.GDPR)
        >>> result["compliant"]  # doctest: +SKIP
    """
    return ComplianceVerifier(regulation).verify_anonymization(original, anonymized)
|
|
297
|
+
|
|
298
|
+
@classmethod
|
|
299
|
+
def get_compliance_requirements(cls, regulation: RegulationType = RegulationType.GDPR) -> dict:
    """Look up the requirements of a regulation.

    Delegates to ``ComplianceVerifier(regulation).get_requirements``.

    Args:
        regulation: Target data protection regulation. Defaults to GDPR.

    Returns:
        The verifier's requirements dictionary for the regulation.

    Example:
        >>> reqs = HealthcareScenario.get_compliance_requirements(RegulationType.GDPR)
        >>> reqs["total_categories"]  # doctest: +SKIP
    """
    return ComplianceVerifier(regulation).get_requirements()
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""Multi-tenant data isolation and anonymization scenario.
|
|
2
|
+
|
|
3
|
+
Real-world use case: Anonymizing tenant data while ensuring data isolation across
|
|
4
|
+
customers in multi-tenant systems.
|
|
5
|
+
|
|
6
|
+
Data Types:
|
|
7
|
+
- Tenant identifiers (preserve for data isolation)
|
|
8
|
+
- Tenant names (sensitive - may be identifying)
|
|
9
|
+
- User names (PII)
|
|
10
|
+
- Email addresses (PII)
|
|
11
|
+
- Organization information (sensitive)
|
|
12
|
+
- Tenant-specific data (anonymize per tenant config)
|
|
13
|
+
- Cross-tenant shared data (preserve for auditing)
|
|
14
|
+
|
|
15
|
+
Architecture:
|
|
16
|
+
- Each tenant has isolated data
|
|
17
|
+
- Global seed by tenant ensures consistent hashing across tables
|
|
18
|
+
- Tenant metadata preserved for data isolation
|
|
19
|
+
- Customer names and PII masked
|
|
20
|
+
- Business metrics preserved
|
|
21
|
+
|
|
22
|
+
Strategy:
|
|
23
|
+
- Tenant identifiers: Preserve (data isolation key)
|
|
24
|
+
- Tenant names: Mask with initials
|
|
25
|
+
- User data: Per-tenant anonymization
|
|
26
|
+
- Cross-cutting data: Preserve for auditing
|
|
27
|
+
- Relationships: Maintain via deterministic hashing
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from confiture.core.anonymization.factory import StrategyFactory, StrategyProfile
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MultiTenantScenario:
|
|
34
|
+
"""Multi-tenant data anonymization scenario.
|
|
35
|
+
|
|
36
|
+
Demonstrates anonymizing multi-tenant data while maintaining data isolation
|
|
37
|
+
and cross-table consistency through deterministic seeding.
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
>>> scenario = MultiTenantScenario()
|
|
41
|
+
>>> tenant_a_data = {
|
|
42
|
+
... "tenant_id": "TENANT-A",
|
|
43
|
+
... "user_id": "USER-001",
|
|
44
|
+
... "user_name": "john.smith",
|
|
45
|
+
... "email": "john@companya.com",
|
|
46
|
+
... "organization": "Company A",
|
|
47
|
+
... "department": "Engineering",
|
|
48
|
+
... }
|
|
49
|
+
>>> tenant_b_data = {
|
|
50
|
+
... "tenant_id": "TENANT-B",
|
|
51
|
+
... "user_id": "USER-001", # Same user ID, different tenant
|
|
52
|
+
... "user_name": "jane.doe",
|
|
53
|
+
... "email": "jane@companyb.com",
|
|
54
|
+
... "organization": "Company B",
|
|
55
|
+
... "department": "Sales",
|
|
56
|
+
... }
|
|
57
|
+
>>> anon_a = scenario.anonymize(tenant_a_data)
|
|
58
|
+
>>> anon_b = scenario.anonymize(tenant_b_data)
|
|
59
|
+
>>> # Tenant IDs preserved, user data anonymized differently per tenant
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
@staticmethod
|
|
63
|
+
def get_profile(tenant_id: str) -> StrategyProfile:
    """Get the multi-tenant anonymization profile for one tenant.

    The profile seed is derived from the tenant ID so that all records of
    the same tenant share one seed, and the seed is reproducible across
    processes and runs.

    Args:
        tenant_id: Tenant identifier used to derive the seed. Non-string
            values are converted with ``str()`` before hashing.

    Returns:
        StrategyProfile named ``multi_tenant_<tenant_id>`` with a
        tenant-specific seed. Columns not listed fall back to
        ``"preserve"``.

    Strategy Mapping:
        - tenant/account identifiers: preserve (data isolation key)
        - user identifiers: preserve (tenant-scoped)
        - user_name, username, email, phone: text_redaction
        - first/last/full/display names: name
        - organization/tenant/company names: name
        - address: address
        - created_by/updated_by/assigned_to/manager: text_redaction
        - tenant metadata, business metrics, audit fields: preserve
        - last_login: date
        - ip_address: ip_address
    """
    # Local import keeps this block self-contained.
    import hashlib

    # Built-in hash() is salted per process for strings, so it cannot give
    # a seed that is stable between runs. A SHA-256 digest can; the seed is
    # kept within a positive 31-bit integer.
    digest = hashlib.sha256(str(tenant_id).encode("utf-8")).digest()
    tenant_seed = int.from_bytes(digest[:4], "big") & 0x7FFFFFFF

    return StrategyProfile(
        name=f"multi_tenant_{tenant_id}",
        seed=tenant_seed,  # Tenant-specific seed
        columns={
            # Tenant identifiers - preserve for data isolation
            "tenant_id": "preserve",
            "tenant_uuid": "preserve",
            "account_id": "preserve",
            "workspace_id": "preserve",
            "client_id": "preserve",
            "customer_id": "preserve",
            # User identifiers - preserve (tenant-scoped)
            "user_id": "preserve",
            "user_uuid": "preserve",
            "employee_id": "preserve",
            "member_id": "preserve",
            # User PII - anonymize
            "user_name": "text_redaction",
            "username": "text_redaction",
            "first_name": "name",
            "last_name": "name",
            "full_name": "name",
            "display_name": "name",
            "email": "text_redaction",
            "phone": "text_redaction",
            "phone_number": "text_redaction",
            # Organization/Tenant info - mask names
            "organization_name": "name",
            "tenant_name": "name",
            "company_name": "name",
            "department": "preserve",  # Business metadata
            "team": "preserve",
            "division": "preserve",
            # Address - mask the street address, keep coarse location
            "address": "address",
            "city": "preserve",
            "state": "preserve",
            "country": "preserve",
            # Relationships - anonymize names but preserve IDs
            "created_by": "text_redaction",
            "created_by_user_id": "preserve",
            "updated_by": "text_redaction",
            "updated_by_user_id": "preserve",
            "assigned_to": "text_redaction",
            "assigned_to_user_id": "preserve",
            "manager": "text_redaction",
            "manager_id": "preserve",
            # Tenant metadata - preserve
            "tenant_type": "preserve",
            "tenant_status": "preserve",
            "tenant_tier": "preserve",
            "industry": "preserve",
            "region": "preserve",
            "timezone": "preserve",
            # Business metrics - preserve
            "active_users": "preserve",
            "total_users": "preserve",
            "data_storage": "preserve",
            "api_quota": "preserve",
            "monthly_cost": "preserve",
            "annual_contract_value": "preserve",
            # Dates - preserve for audit trail (last_login is masked)
            "created_at": "preserve",
            "updated_at": "preserve",
            "deleted_at": "preserve",
            "last_login": "date",
            "contract_start": "preserve",
            "contract_end": "preserve",
            "billing_cycle": "preserve",
            # Content/Data - preserve
            "description": "preserve",
            "notes": "preserve",
            "tags": "preserve",
            "status": "preserve",
            "data_classification": "preserve",
            # IP/Technical - mask the IP address only
            "ip_address": "ip_address",
            "device_id": "preserve",
            "browser": "preserve",
            # Audit fields
            "change_log": "preserve",
            "audit_trail": "preserve",
        },
        defaults="preserve",
    )
|
|
176
|
+
|
|
177
|
+
@classmethod
|
|
178
|
+
def create_factory(cls, tenant_id: str) -> StrategyFactory:
    """Build a StrategyFactory for one tenant.

    Args:
        tenant_id: Tenant identifier for isolation.

    Returns:
        StrategyFactory wrapping the profile from ``get_profile(tenant_id)``.
    """
    return StrategyFactory(cls.get_profile(tenant_id))
|
|
189
|
+
|
|
190
|
+
@classmethod
|
|
191
|
+
def anonymize(cls, data: dict) -> dict:
    """Anonymize one multi-tenant record.

    The record's ``tenant_id`` selects the tenant-specific factory, and the
    whole record is passed through that factory's ``anonymize`` method.

    Args:
        data: Record containing ``tenant_id`` and other fields.

    Returns:
        The record as returned by the tenant factory's ``anonymize``.

    Raises:
        ValueError: If ``tenant_id`` is not a key of ``data``.

    Example:
        >>> data = {
        ...     "tenant_id": "TENANT-A",
        ...     "user_id": "USER-001",
        ...     "email": "john@example.com",
        ...     "organization_name": "Company A",
        ... }
        >>> result = MultiTenantScenario.anonymize(data)
        >>> # tenant_id and user_id are mapped to "preserve"; email uses
        >>> # "text_redaction" and organization_name uses "name".
    """
    if "tenant_id" in data:
        tenant_factory = cls.create_factory(data["tenant_id"])
        return tenant_factory.anonymize(data)

    raise ValueError("Data must contain 'tenant_id' field for multi-tenant anonymization")
|
|
227
|
+
|
|
228
|
+
@classmethod
|
|
229
|
+
def anonymize_batch(cls, data_list: list[dict]) -> list[dict]:
    """Anonymize records that may belong to several tenants.

    Each record is processed by the factory of its own tenant. Factories
    are created on first use and reused for later records of that tenant.

    Args:
        data_list: List of records from potentially multiple tenants.

    Returns:
        List of anonymized records, in the same order as ``data_list``.

    Raises:
        ValueError: If a record has no ``tenant_id`` or a falsy one.

    Example:
        >>> data = [
        ...     {"tenant_id": "TENANT-A", "user_id": "U1"},
        ...     {"tenant_id": "TENANT-A", "user_id": "U2"},
        ...     {"tenant_id": "TENANT-B", "user_id": "U1"},
        ... ]
        >>> results = MultiTenantScenario.anonymize_batch(data)
        >>> # TENANT-A records share one factory, TENANT-B gets its own
    """
    factory_for_tenant = {}
    anonymized_records = []

    for entry in data_list:
        tenant_key = entry.get("tenant_id")
        if not tenant_key:
            raise ValueError("All records must contain 'tenant_id' field")

        # Build the tenant's factory only the first time it is needed.
        try:
            tenant_factory = factory_for_tenant[tenant_key]
        except KeyError:
            tenant_factory = cls.create_factory(tenant_key)
            factory_for_tenant[tenant_key] = tenant_factory

        anonymized_records.append(tenant_factory.anonymize(entry))

    return anonymized_records
|
|
266
|
+
|
|
267
|
+
@classmethod
|
|
268
|
+
def get_strategy_info(cls, tenant_id: str) -> dict:
    """Describe which strategy a tenant's profile assigns to each column.

    Args:
        tenant_id: Tenant identifier.

    Returns:
        The result of ``list_column_strategies()`` on a factory built from
        the tenant's profile.
    """
    return StrategyFactory(cls.get_profile(tenant_id)).list_column_strategies()
|
|
280
|
+
|
|
281
|
+
@classmethod
|
|
282
|
+
def verify_data_isolation(cls, data_list: list[dict], original_list: list[dict]) -> dict:
    """Verify data isolation across tenants.

    Records are paired by position: ``data_list[i]`` must be the anonymized
    form of ``original_list[i]``. For every ``user_id`` that occurs under
    more than one ``tenant_id`` in the originals, the ``"text_redaction"``
    values of the matching anonymized records are compared; any duplicate
    value is reported as an isolation issue.

    Args:
        data_list: Anonymized records.
        original_list: Original records, in the same order as ``data_list``.

    Returns:
        Dictionary with:
            - isolated: False if any cross-tenant user shares a value.
            - issues: One message per user that failed the check.
            - cross_tenant_checks: One message per user that passed.

    Raises:
        ValueError: If the two lists differ in length, because records
            could not then be paired reliably.
    """
    if len(data_list) != len(original_list):
        raise ValueError(
            "data_list and original_list must have the same length "
            f"(got {len(data_list)} and {len(original_list)})"
        )

    results = {
        "isolated": True,
        "issues": [],
        "cross_tenant_checks": [],
    }

    # user_id -> {tenant_id -> anonymized record}. A later record for the
    # same (user, tenant) pair replaces the earlier one.
    anonymized_by_user = {}
    for original, anonymized in zip(original_list, data_list):
        per_tenant = anonymized_by_user.setdefault(original.get("user_id"), {})
        per_tenant[original.get("tenant_id")] = anonymized

    for user_id, per_tenant in anonymized_by_user.items():
        if len(per_tenant) < 2:
            # The user appears in a single tenant; nothing to compare.
            continue

        # NOTE(review): the compared field is "text_redaction", which is not
        # a column configured by get_profile (that uses "email") - confirm
        # this is the intended field. Missing values compare equal as None.
        anon_values = [record.get("text_redaction") for record in per_tenant.values()]

        if len(set(anon_values)) != len(anon_values):
            results["isolated"] = False
            results["issues"].append(
                f"User {user_id} has same anonymization in different tenants"
            )
        else:
            results["cross_tenant_checks"].append(
                f"User {user_id}: ✓ Properly isolated across {len(per_tenant)} tenants"
            )

    return results
|