fraiseql-confiture 0.3.7__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cpython-311-darwin.so +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1893 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +184 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +882 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +24 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +265 -0
- confiture/models/schema.py +203 -0
- confiture/models/sql_file_migration.py +225 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +100 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/loader.py +225 -0
- confiture/testing/pytest/__init__.py +38 -0
- confiture/testing/pytest_plugin.py +190 -0
- confiture/testing/sandbox.py +304 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.7.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.7.dist-info/RECORD +124 -0
- fraiseql_confiture-0.3.7.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.7.dist-info/entry_points.txt +4 -0
- fraiseql_confiture-0.3.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,737 @@
|
|
|
1
|
+
"""Data governance pipeline for anonymization workflows.
|
|
2
|
+
|
|
3
|
+
Provides a governance-enforced pipeline for anonymization operations that:
|
|
4
|
+
- Validates data before anonymization (type checking, completeness)
|
|
5
|
+
- Executes anonymization strategies with error recovery
|
|
6
|
+
- Records lineage and audit trails for compliance
|
|
7
|
+
- Integrates with KMS and token store for security
|
|
8
|
+
|
|
9
|
+
This module extends the HookExecutor system to provide:
|
|
10
|
+
1. BEFORE_ANONYMIZATION - Pre-flight validation and security checks
|
|
11
|
+
2. AFTER_ANONYMIZATION - Post-operation verification and logging
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
>>> from confiture.core.anonymization.governance import (
|
|
15
|
+
... DataGovernancePipeline, AnonymizationContext
|
|
16
|
+
... )
|
|
17
|
+
>>> from confiture.core.anonymization.security.kms_manager import KMSFactory, KMSProvider
|
|
18
|
+
>>> from confiture.core.anonymization.security.token_store import EncryptedTokenStore
|
|
19
|
+
>>> from confiture.core.anonymization.security.lineage import DataLineageTracker
|
|
20
|
+
>>>
|
|
21
|
+
>>> # Initialize pipeline with security components
|
|
22
|
+
>>> kms = KMSFactory.create(KMSProvider.AWS, region="us-east-1")
|
|
23
|
+
>>> token_store = EncryptedTokenStore(database_connection, kms_client=kms)
|
|
24
|
+
>>> lineage_tracker = DataLineageTracker(database_connection)
|
|
25
|
+
>>>
|
|
26
|
+
>>> pipeline = DataGovernancePipeline(
|
|
27
|
+
... kms_client=kms,
|
|
28
|
+
... token_store=token_store,
|
|
29
|
+
... lineage_tracker=lineage_tracker
|
|
30
|
+
... )
|
|
31
|
+
>>>
|
|
32
|
+
>>> # Execute governance pipeline
|
|
33
|
+
>>> context = AnonymizationContext(
|
|
34
|
+
... operation_id="anon-001",
|
|
35
|
+
... table_name="users",
|
|
36
|
+
... column_name="email",
|
|
37
|
+
... strategy_name="tokenization",
|
|
38
|
+
... rows_affected=1000,
|
|
39
|
+
... executed_by="admin@example.com",
|
|
40
|
+
... reason="GDPR compliance"
|
|
41
|
+
... )
|
|
42
|
+
>>>
|
|
43
|
+
>>> result = pipeline.execute(database_connection, context, strategy)  # strategy: an AnonymizationStrategy instance
|
|
44
|
+
>>> print(f"Anonymized {result.rows_processed} rows")
|
|
45
|
+
>>> print(f"Audit ID: {result.audit_id}")
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
import logging
|
|
49
|
+
import time
|
|
50
|
+
from dataclasses import dataclass
|
|
51
|
+
from enum import Enum
|
|
52
|
+
from typing import Any
|
|
53
|
+
from uuid import UUID, uuid4
|
|
54
|
+
|
|
55
|
+
import psycopg
|
|
56
|
+
from psycopg import sql
|
|
57
|
+
|
|
58
|
+
from confiture.core.anonymization.security.kms_manager import KMSClient
|
|
59
|
+
from confiture.core.anonymization.security.lineage import (
|
|
60
|
+
DataLineageTracker,
|
|
61
|
+
create_lineage_entry,
|
|
62
|
+
)
|
|
63
|
+
from confiture.core.anonymization.security.token_store import EncryptedTokenStore
|
|
64
|
+
from confiture.core.anonymization.strategy import AnonymizationStrategy
|
|
65
|
+
from confiture.exceptions import MigrationError
|
|
66
|
+
|
|
67
|
+
logger = logging.getLogger(__name__)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GovernancePhase(Enum):
    """Stages of the data governance pipeline, numbered in execution order."""

    # Pre-flight checks before anonymization.
    PRE_VALIDATION = 1

    # Preparation before anonymization execution.
    BEFORE_ANONYMIZATION = 2

    # Actual anonymization operation.
    ANONYMIZATION = 3

    # Verification and recording after anonymization.
    POST_ANONYMIZATION = 4

    # Final cleanup and optimization.
    CLEANUP = 5
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
|
|
90
|
+
class ValidationResult:
|
|
91
|
+
"""Result of data validation."""
|
|
92
|
+
|
|
93
|
+
is_valid: bool
|
|
94
|
+
"""Whether validation passed."""
|
|
95
|
+
|
|
96
|
+
errors: list[str]
|
|
97
|
+
"""List of validation errors (empty if valid)."""
|
|
98
|
+
|
|
99
|
+
warnings: list[str]
|
|
100
|
+
"""List of validation warnings."""
|
|
101
|
+
|
|
102
|
+
rows_checked: int = 0
|
|
103
|
+
"""Number of rows validated."""
|
|
104
|
+
|
|
105
|
+
null_count: int = 0
|
|
106
|
+
"""Number of NULL values found."""
|
|
107
|
+
|
|
108
|
+
sample_values: list[Any] | None = None
|
|
109
|
+
"""Sample of values that passed validation."""
|
|
110
|
+
|
|
111
|
+
def __post_init__(self):
|
|
112
|
+
"""Initialize sample_values if not provided."""
|
|
113
|
+
if self.sample_values is None:
|
|
114
|
+
self.sample_values = []
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
|
|
118
|
+
class AnonymizationContext:
|
|
119
|
+
"""Context for an anonymization operation.
|
|
120
|
+
|
|
121
|
+
Tracks all metadata about an anonymization operation for governance,
|
|
122
|
+
audit, and compliance purposes.
|
|
123
|
+
"""
|
|
124
|
+
|
|
125
|
+
operation_id: str
|
|
126
|
+
"""Unique identifier for this operation."""
|
|
127
|
+
|
|
128
|
+
table_name: str
|
|
129
|
+
"""Table being anonymized."""
|
|
130
|
+
|
|
131
|
+
column_name: str
|
|
132
|
+
"""Column being anonymized."""
|
|
133
|
+
|
|
134
|
+
strategy_name: str
|
|
135
|
+
"""Strategy being used."""
|
|
136
|
+
|
|
137
|
+
rows_affected: int = 0
|
|
138
|
+
"""Number of rows to be anonymized."""
|
|
139
|
+
|
|
140
|
+
executed_by: str = "system"
|
|
141
|
+
"""User executing the operation."""
|
|
142
|
+
|
|
143
|
+
reason: str | None = None
|
|
144
|
+
"""Business reason for anonymization."""
|
|
145
|
+
|
|
146
|
+
request_id: str | None = None
|
|
147
|
+
"""External request ID (ticket, case, etc.)."""
|
|
148
|
+
|
|
149
|
+
department: str | None = None
|
|
150
|
+
"""Department requesting anonymization."""
|
|
151
|
+
|
|
152
|
+
data_minimization_applied: bool = False
|
|
153
|
+
"""Whether data minimization is being applied."""
|
|
154
|
+
|
|
155
|
+
retention_days: int | None = None
|
|
156
|
+
"""Data retention period."""
|
|
157
|
+
|
|
158
|
+
start_time: float = 0.0
|
|
159
|
+
"""Operation start time (set by pipeline)."""
|
|
160
|
+
|
|
161
|
+
end_time: float = 0.0
|
|
162
|
+
"""Operation end time (set by pipeline)."""
|
|
163
|
+
|
|
164
|
+
source_count: int | None = None
|
|
165
|
+
"""Row count before anonymization."""
|
|
166
|
+
|
|
167
|
+
target_count: int | None = None
|
|
168
|
+
"""Row count after anonymization."""
|
|
169
|
+
|
|
170
|
+
stats: dict[str, Any] | None = None
|
|
171
|
+
"""Statistics collected during operation."""
|
|
172
|
+
|
|
173
|
+
def __post_init__(self):
|
|
174
|
+
"""Initialize stats if not provided."""
|
|
175
|
+
if self.stats is None:
|
|
176
|
+
self.stats = {}
|
|
177
|
+
|
|
178
|
+
@property
|
|
179
|
+
def duration_seconds(self) -> float:
|
|
180
|
+
"""Calculate operation duration in seconds."""
|
|
181
|
+
if self.start_time and self.end_time:
|
|
182
|
+
return self.end_time - self.start_time
|
|
183
|
+
return 0.0
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@dataclass
|
|
187
|
+
class AnonymizationResult:
|
|
188
|
+
"""Result of anonymization operation."""
|
|
189
|
+
|
|
190
|
+
operation_id: str
|
|
191
|
+
"""Unique identifier for this operation."""
|
|
192
|
+
|
|
193
|
+
rows_processed: int
|
|
194
|
+
"""Number of rows processed."""
|
|
195
|
+
|
|
196
|
+
rows_anonymized: int
|
|
197
|
+
"""Number of rows successfully anonymized."""
|
|
198
|
+
|
|
199
|
+
rows_failed: int
|
|
200
|
+
"""Number of rows that failed."""
|
|
201
|
+
|
|
202
|
+
audit_id: UUID
|
|
203
|
+
"""UUID of the audit/lineage entry."""
|
|
204
|
+
|
|
205
|
+
duration_seconds: float
|
|
206
|
+
"""Operation duration."""
|
|
207
|
+
|
|
208
|
+
status: str
|
|
209
|
+
"""Operation status (success, partial, error)."""
|
|
210
|
+
|
|
211
|
+
error_message: str | None = None
|
|
212
|
+
"""Error message if operation failed."""
|
|
213
|
+
|
|
214
|
+
warnings: list[str] | None = None
|
|
215
|
+
"""List of warnings that occurred."""
|
|
216
|
+
|
|
217
|
+
def __post_init__(self):
|
|
218
|
+
"""Initialize warnings if not provided."""
|
|
219
|
+
if self.warnings is None:
|
|
220
|
+
self.warnings = []
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class DataValidator:
    """Pre-anonymization data checks for a single column.

    ``validate_column`` performs these checks:
    - The column is listed in ``information_schema.columns``
    - NULL values are counted, and flagged when the column is declared NOT NULL
    - Every sampled non-NULL value is accepted by the strategy's ``validate``
    - An empty table is reported as a warning
    """

    def __init__(self, conn: psycopg.Connection):
        """Bind the validator to a database connection.

        Args:
            conn: PostgreSQL connection used for every validation query
        """
        self.conn = conn

    def validate_column(
        self,
        table_name: str,
        column_name: str,
        strategy: AnonymizationStrategy,
        sample_size: int = 100,
    ) -> ValidationResult:
        """Check that a column can be anonymized with the given strategy.

        Any exception raised while querying or validating is logged and
        reported as a failed ValidationResult instead of propagating.

        Args:
            table_name: Table to validate
            column_name: Column to validate
            strategy: Strategy that will be applied
            sample_size: Maximum number of distinct values to sample

        Returns:
            ValidationResult with status and details
        """
        problems = []
        notices = []
        accepted = []
        nulls_seen = 0
        rows_seen = 0

        try:
            # Step 1: look the column up in the catalog.
            with self.conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT column_name, data_type, is_nullable
                    FROM information_schema.columns
                    WHERE table_name = %s AND column_name = %s
                    """,
                    (table_name, column_name),
                )
                catalog_row = cur.fetchone()

            if not catalog_row:
                problems.append(f"Column {table_name}.{column_name} not found")
                return ValidationResult(
                    is_valid=False,
                    errors=problems,
                    warnings=notices,
                )

            # Only the nullability flag is needed below.
            _, _, nullable_flag = catalog_row

            # Step 2: pull distinct values with their frequencies.
            with self.conn.cursor() as cur:
                cur.execute(
                    sql.SQL("""
                        SELECT {column}, COUNT(*)
                        FROM {table}
                        GROUP BY {column}
                        LIMIT %s
                    """).format(
                        column=sql.Identifier(column_name),
                        table=sql.Identifier(table_name),
                    ),
                    (sample_size,),
                )
                grouped = cur.fetchall()

            for value, occurrences in grouped:
                rows_seen += occurrences

                if value is None:
                    # NULLs are tallied but never handed to the strategy.
                    nulls_seen += occurrences
                    if nullable_flag == "NO":
                        notices.append(
                            f"NULL found in non-nullable column {column_name} ({occurrences} rows)"
                        )
                    continue

                if strategy.validate(value):
                    accepted.append(value)
                else:
                    problems.append(
                        f"Value '{value}' (type {type(value).__name__}) "
                        f"cannot be anonymized with {strategy.name_short()}"
                    )

            # Step 3: total row count, used only to detect an empty table.
            with self.conn.cursor() as cur:
                cur.execute(
                    sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table_name)),
                )
                count_row = cur.fetchone()
            table_size = count_row[0] if count_row else 0

            if table_size == 0:
                notices.append(f"Table {table_name} is empty")

            return ValidationResult(
                is_valid=not problems,
                errors=problems,
                warnings=notices,
                rows_checked=rows_seen,
                null_count=nulls_seen,
                sample_values=accepted,
            )

        except Exception as exc:
            logger.error(f"Validation failed for {table_name}.{column_name}: {exc}")
            problems.append(str(exc))
            return ValidationResult(
                is_valid=False,
                errors=problems,
                warnings=notices,
            )
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class DataGovernancePipeline:
    """Governance-enforced anonymization pipeline.

    Runs one anonymization operation through five phases (pre-validation,
    preparation, anonymization, post-anonymization, cleanup) and records the
    outcome, success or failure, in the lineage tracker.

    Attributes:
        kms_client: KMS client for encryption key management (stored only;
            not used by the methods of this class)
        token_store: Encrypted token storage for reversible strategies
            (stored only; not used by the methods of this class)
        lineage_tracker: Data lineage tracker that receives one entry per operation
        validator: DataValidator created lazily on first validation and rebuilt
            when a later call supplies a different connection
    """

    def __init__(
        self,
        kms_client: KMSClient,
        token_store: EncryptedTokenStore,
        lineage_tracker: DataLineageTracker,
    ):
        """Initialize governance pipeline.

        Args:
            kms_client: KMS client for key management
            token_store: Token store for reversible strategies
            lineage_tracker: Lineage tracker for audit trails
        """
        self.kms_client = kms_client
        self.token_store = token_store
        self.lineage_tracker = lineage_tracker
        self.validator = None

    def execute(
        self,
        conn: psycopg.Connection,
        context: AnonymizationContext,
        strategy: AnonymizationStrategy,
    ) -> AnonymizationResult:
        """Execute full anonymization pipeline with governance.

        Phases:
        1. PRE_VALIDATION - Validate data and security settings
        2. BEFORE_ANONYMIZATION - Prepare and backup if needed
        3. ANONYMIZATION - Apply strategy to data
        4. POST_ANONYMIZATION - Verify and log
        5. CLEANUP - Optimize and finalize

        Failures in any phase (including a failed pre-validation, which is
        raised internally as MigrationError) are caught, logged, recorded in
        the lineage tracker with status "error", and reported through the
        returned result rather than raised to the caller.

        Args:
            conn: Database connection
            context: Anonymization context with metadata; mutated in place
                (operation_id, timings, counts, stats)
            strategy: Strategy to apply

        Returns:
            AnonymizationResult with status "success" or "error"

        Raises:
            Exception: Only if recording the failure in the lineage tracker
                itself raises; that exception is not caught here
        """
        context.operation_id = context.operation_id or str(uuid4())
        context.start_time = time.time()
        audit_id = uuid4()

        try:
            # PRE_VALIDATION Phase
            logger.info(f"Starting anonymization operation {context.operation_id}")

            validation = self._pre_validate(conn, context, strategy)
            if not validation.is_valid:
                raise MigrationError(f"Pre-validation failed: {'; '.join(validation.errors)}")

            # stats may have been reset to None by the caller after construction
            if context.stats is None:
                context.stats = {}
            context.stats["validation_warnings"] = validation.warnings
            context.source_count = validation.rows_checked

            # BEFORE_ANONYMIZATION Phase
            self._before_anonymization(conn, context)

            # ANONYMIZATION Phase
            rows_anonymized = self._anonymize(conn, context, strategy)

            # POST_ANONYMIZATION Phase
            context.target_count = rows_anonymized
            context.end_time = time.time()

            self._post_anonymization(conn, context, audit_id)

            # CLEANUP Phase
            self._cleanup(conn, context)

            logger.info(
                f"Anonymization operation {context.operation_id} completed successfully: "
                f"{rows_anonymized} rows anonymized in {context.duration_seconds:.2f}s"
            )

            return AnonymizationResult(
                operation_id=context.operation_id,
                rows_processed=context.source_count or 0,
                rows_anonymized=rows_anonymized,
                rows_failed=0,
                audit_id=audit_id,
                duration_seconds=context.duration_seconds,
                status="success",
            )

        except Exception as e:
            context.end_time = time.time()
            logger.error(
                f"Anonymization operation {context.operation_id} failed: {e}",
                exc_info=True,
            )

            # Record failure in lineage
            self._record_lineage(
                conn,
                context,
                audit_id,
                status="error",
                error_message=str(e),
            )

            return AnonymizationResult(
                operation_id=context.operation_id,
                rows_processed=context.source_count or 0,
                rows_anonymized=0,
                rows_failed=context.source_count or 0,
                audit_id=audit_id,
                duration_seconds=context.duration_seconds,
                status="error",
                error_message=str(e),
            )

    def _pre_validate(
        self,
        conn: psycopg.Connection,
        context: AnonymizationContext,
        strategy: AnonymizationStrategy,
    ) -> ValidationResult:
        """Pre-flight validation (PRE_VALIDATION phase).

        Args:
            conn: Database connection
            context: Anonymization context
            strategy: Strategy to validate

        Returns:
            ValidationResult with validation status
        """
        # A DataValidator is bound to one connection. Rebuild it when this call
        # arrives with a different connection, so validation queries never run
        # on the connection of an earlier execute() call. Validators that are
        # not DataValidator instances (e.g. injected test doubles) are kept.
        current = self.validator
        bound_elsewhere = (
            isinstance(current, DataValidator) and getattr(current, "conn", conn) is not conn
        )
        if current is None or bound_elsewhere:
            self.validator = DataValidator(conn)

        logger.info(
            f"Validating {context.table_name}.{context.column_name} "
            f"with strategy {context.strategy_name}"
        )

        return self.validator.validate_column(
            context.table_name,
            context.column_name,
            strategy,
        )

    def _before_anonymization(
        self,
        _conn: psycopg.Connection,
        context: AnonymizationContext,
    ) -> None:
        """Preparation before anonymization (BEFORE_ANONYMIZATION phase).

        Currently only logs; this is an extension point that could perform:
        - Backups of original data
        - Pre-computation of anonymization maps
        - Caching strategies
        - Lock acquisition

        Args:
            _conn: Database connection (unused for now)
            context: Anonymization context
        """
        logger.debug(
            f"Preparing for anonymization: {context.operation_id} "
            f"({context.table_name}.{context.column_name})"
        )

        # In a real implementation, could:
        # 1. Create a backup table
        # 2. Pre-compute token mappings for tokenization
        # 3. Warm up caches
        # 4. Acquire advisory locks

        pass

    def _anonymize(
        self,
        _conn: psycopg.Connection,
        context: AnonymizationContext,
        _strategy: AnonymizationStrategy,
    ) -> int:
        """Execute anonymization (ANONYMIZATION phase).

        Placeholder: no data is read or modified yet. The row count declared
        on the context is reported back unchanged.

        Args:
            _conn: Database connection (unused for now)
            context: Anonymization context
            _strategy: Strategy to apply (unused for now)

        Returns:
            ``context.rows_affected``
        """
        logger.info(
            f"Applying {context.strategy_name} to {context.table_name}.{context.column_name}"
        )

        # In a real implementation, would:
        # 1. Fetch rows in batches
        # 2. Apply strategy to each value
        # 3. Update database
        # 4. Store tokens if reversible strategy
        # 5. Handle errors per row

        # Placeholder: echo the caller-declared row count
        # (TODO: implement actual batch processing)
        return context.rows_affected

    def _post_anonymization(
        self,
        conn: psycopg.Connection,
        context: AnonymizationContext,
        audit_id: UUID,
    ) -> None:
        """Post-operation verification and logging (POST_ANONYMIZATION phase).

        Records a lineage entry with status "success" for the operation.

        Args:
            conn: Database connection
            context: Anonymization context
            audit_id: UUID of audit entry
        """
        logger.info(f"Verifying anonymization operation {context.operation_id}")

        # Record lineage entry
        self._record_lineage(
            conn,
            context,
            audit_id,
            status="success",
        )

    def _cleanup(
        self,
        _conn: psycopg.Connection,
        context: AnonymizationContext,
    ) -> None:
        """Final cleanup (CLEANUP phase).

        Currently only logs; this is an extension point.

        Args:
            _conn: Database connection (unused for now)
            context: Anonymization context
        """
        logger.debug(f"Cleaning up after operation {context.operation_id}")

        # Could perform:
        # 1. Remove backup tables
        # 2. Vacuum table
        # 3. Update statistics
        # 4. Release locks

        pass

    def _record_lineage(
        self,
        _conn: psycopg.Connection,
        context: AnonymizationContext,
        audit_id: UUID,
        status: str = "success",
        error_message: str | None = None,
    ) -> None:
        """Record operation in lineage tracker.

        Builds a lineage entry from the context, stamps it with ``audit_id``,
        and hands it to ``self.lineage_tracker.record_entry``.

        Args:
            _conn: Database connection (unused; the tracker is called directly)
            context: Anonymization context
            audit_id: UUID for this lineage entry
            status: Operation status (success, error, partial)
            error_message: Error message if operation failed
        """
        entry = create_lineage_entry(
            operation_id=context.operation_id,
            table_name=context.table_name,
            column_name=context.column_name,
            strategy_name=context.strategy_name,
            rows_affected=context.rows_affected,
            executed_by=context.executed_by,
            reason=context.reason,
            request_id=context.request_id,
            department=context.department,
            data_minimization_applied=context.data_minimization_applied,
            retention_days=context.retention_days,
            source_count=context.source_count,
            target_count=context.target_count,
            duration_seconds=context.duration_seconds,
            status=status,
            error_message=error_message,
        )

        # Use the pipeline's audit id so the result and lineage entry match.
        entry.id = audit_id
        self.lineage_tracker.record_entry(entry)
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
class StrategyValidator:
    """Governance-level checks layered on top of a strategy's own ``validate``.

    Offers two static checks:
    - whether a strategy accepts every value in a sample
    - whether the token store / KMS client a strategy needs is configured
    """

    @staticmethod
    def validate_strategy_compatibility(
        strategy: AnonymizationStrategy,
        sample_values: list[Any],
    ) -> tuple[bool, list[str]]:
        """Check that the strategy accepts every sample value.

        A value counts as a failure when ``strategy.validate`` returns a falsy
        result or raises; both cases are collected as error messages.

        Args:
            strategy: Strategy to validate
            sample_values: List of sample values to test

        Returns:
            Tuple of (is_valid, error_messages)
        """
        problems = []

        for candidate in sample_values:
            try:
                if strategy.validate(candidate):
                    continue
                problems.append(
                    f"Strategy {strategy.name_short()} cannot handle {type(candidate).__name__} "
                    f"value: {candidate!r}"
                )
            except Exception as exc:
                problems.append(f"Strategy {strategy.name_short()} validation error: {exc}")

        return not problems, problems

    @staticmethod
    def validate_reversibility(
        strategy: AnonymizationStrategy,
        kms_client: KMSClient | None = None,
        token_store: EncryptedTokenStore | None = None,
    ) -> tuple[bool, list[str]]:
        """Check that the components a reversible strategy needs are present.

        Args:
            strategy: Strategy to validate
            kms_client: KMS client (required for encrypted strategies)
            token_store: Token store (required for tokenization)

        Returns:
            Tuple of (is_valid, error_messages)
        """
        problems = []
        short_name = strategy.name_short()

        # Only strategies that declare themselves reversible are checked.
        if getattr(strategy, "is_reversible", False):
            if short_name == "tokenization" and token_store is None:
                problems.append("Tokenization strategy requires token store to be configured")

            # NOTE(review): the nesting of this KMS check under the reversible
            # branch was reconstructed from a flattened listing - confirm
            # against the packaged file.
            needs_kms = getattr(strategy, "requires_kms", False)
            if needs_kms and kms_client is None:
                problems.append(f"{short_name} strategy requires KMS client to be configured")

        return not problems, problems
|