fraiseql-confiture 0.3.4__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cp311-win_amd64.pyd +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1656 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +132 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +793 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +0 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +180 -0
- confiture/models/schema.py +203 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +38 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.4.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.4.dist-info/RECORD +119 -0
- fraiseql_confiture-0.3.4.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.4.dist-info/entry_points.txt +2 -0
- fraiseql_confiture-0.3.4.dist-info/licenses/LICENSE +21 -0
confiture/core/anonymization/performance.py
@@ -0,0 +1,1092 @@
"""Performance optimization for anonymization.

Provides optimizations for production-scale anonymization:
- Batch processing (optimize database I/O)
- Concurrent/parallel processing (multi-worker execution)
- Connection pooling (reuse database connections)
- Query optimization (indexes, query plans)
- Memory efficiency (streaming, chunking)
- Performance monitoring (metrics, alerts)

Performance Targets:
- 10K-35K rows/sec depending on strategy
- Sub-100ms latency for small batches
- <2GB memory for processing 1M rows
- 99.9% availability

Example:
    >>> from confiture.core.anonymization.performance import (
    ...     BatchAnonymizer, ConcurrentAnonymizer, PerformanceMonitor
    ... )
    >>>
    >>> # Batch processing (optimized I/O)
    >>> batch = BatchAnonymizer(conn, strategy, batch_size=10000)
    >>> result = batch.anonymize_table("users", "email")
    >>>
    >>> # Concurrent processing (multi-worker)
    >>> concurrent = ConcurrentAnonymizer(conn, strategy, num_workers=4)
    >>> result = concurrent.anonymize_table("users", "email")
    >>>
    >>> # Monitor performance
    >>> monitor = PerformanceMonitor()
    >>> monitor.record("anonymize", duration_ms=150, rows_processed=1000)
    >>> stats = monitor.get_statistics()
"""

import contextlib
import logging
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any

import psycopg
from psycopg import sql

from confiture.core.anonymization.strategy import AnonymizationStrategy

logger = logging.getLogger(__name__)


@dataclass
class PerformanceMetric:
    """Single performance measurement."""

    operation: str
    """Operation being measured."""

    duration_ms: float
    """Duration in milliseconds."""

    rows_processed: int = 0
    """Rows processed in this operation."""

    timestamp: datetime = field(default_factory=datetime.now)
    """When measurement was taken."""

    throughput_rows_per_sec: float = 0.0
    """Calculated throughput (rows/sec)."""

    memory_mb: float = 0.0
    """Memory used (MB)."""

    error: str | None = None
    """Error message if operation failed."""

    def __post_init__(self):
        """Calculate derived metrics."""
        if self.rows_processed > 0 and self.duration_ms > 0:
            self.throughput_rows_per_sec = (self.rows_processed / self.duration_ms) * 1000


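PerformanceMetric derives its throughput field in __post_init__, so a quick sanity check of the arithmetic (1,000 rows in 500 ms is 2,000 rows/sec) only needs the dataclass above; the following is an editorial sketch, not part of the shipped file.

# Illustrative sketch (editorial example, not part of performance.py).
from confiture.core.anonymization.performance import PerformanceMetric

metric = PerformanceMetric(operation="anonymize", duration_ms=500, rows_processed=1_000)
# (1_000 / 500) * 1000 == 2_000 rows/sec, computed by __post_init__
assert metric.throughput_rows_per_sec == 2_000.0
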
@dataclass
class PerformanceStatistics:
    """Aggregated performance statistics."""

    operation: str
    """Operation name."""

    count: int
    """Number of measurements."""

    avg_duration_ms: float
    """Average duration."""

    min_duration_ms: float
    """Minimum duration."""

    max_duration_ms: float
    """Maximum duration."""

    avg_throughput: float
    """Average throughput (rows/sec)."""

    total_rows_processed: int
    """Total rows processed."""

    total_duration_ms: float
    """Total time spent."""

    error_count: int = 0
    """Number of errors."""

    error_rate: float = 0.0
    """Percentage of operations that failed."""


class PerformanceMonitor:
    """Monitor and track performance metrics.

    Tracks performance of anonymization operations with:
    - Duration measurement
    - Throughput calculation
    - Memory tracking
    - Error rate monitoring
    - Statistical analysis
    - Alerting on performance degradation

    Example:
        >>> monitor = PerformanceMonitor()
        >>>
        >>> # Record operations
        >>> monitor.record("anonymize", duration_ms=150, rows_processed=1000)
        >>> monitor.record("anonymize", duration_ms=160, rows_processed=1000)
        >>>
        >>> # Get statistics (one entry per operation name)
        >>> stats = monitor.get_statistics("anonymize")[0]
        >>> print(f"Throughput: {stats.avg_throughput:.0f} rows/sec")
        >>> print(f"Error rate: {stats.error_rate:.1f}%")
    """

    def __init__(self, retention_minutes: int = 1440):
        """Initialize performance monitor.

        Args:
            retention_minutes: How long to keep metrics (default: 24 hours)
        """
        self.retention_minutes = retention_minutes
        self.metrics: list[PerformanceMetric] = []
        self._lock = threading.Lock()
        self._baseline: dict[str, PerformanceStatistics] = {}

    def record(
        self,
        operation: str,
        duration_ms: float,
        rows_processed: int = 0,
        memory_mb: float = 0.0,
        error: str | None = None,
    ) -> None:
        """Record a performance measurement.

        Args:
            operation: Operation name
            duration_ms: Duration in milliseconds
            rows_processed: Number of rows processed
            memory_mb: Memory used
            error: Error message if operation failed
        """
        metric = PerformanceMetric(
            operation=operation,
            duration_ms=duration_ms,
            rows_processed=rows_processed,
            memory_mb=memory_mb,
            error=error,
        )

        with self._lock:
            self.metrics.append(metric)
            self._cleanup_old_metrics()

    def get_statistics(self, operation: str | None = None) -> list[PerformanceStatistics]:
        """Get aggregated statistics for operations.

        Args:
            operation: Specific operation (None = all)

        Returns:
            List of PerformanceStatistics
        """
        with self._lock:
            metrics = self.metrics

        # Filter by operation if specified
        if operation:
            metrics = [m for m in metrics if m.operation == operation]

        # Group by operation
        ops = {}
        for metric in metrics:
            if metric.operation not in ops:
                ops[metric.operation] = []
            ops[metric.operation].append(metric)

        # Calculate statistics for each operation
        stats = []
        for op_name, op_metrics in ops.items():
            durations = [m.duration_ms for m in op_metrics]
            rows = [m.rows_processed for m in op_metrics]
            errors = [m for m in op_metrics if m.error]

            stat = PerformanceStatistics(
                operation=op_name,
                count=len(op_metrics),
                avg_duration_ms=sum(durations) / len(durations),
                min_duration_ms=min(durations),
                max_duration_ms=max(durations),
                avg_throughput=sum(m.throughput_rows_per_sec for m in op_metrics) / len(op_metrics),
                total_rows_processed=sum(rows),
                total_duration_ms=sum(durations),
                error_count=len(errors),
                error_rate=100.0 * len(errors) / len(op_metrics) if op_metrics else 0,
            )
            stats.append(stat)

        return stats

    def _cleanup_old_metrics(self) -> None:
        """Remove metrics older than retention period."""
        cutoff = datetime.now() - timedelta(minutes=self.retention_minutes)
        self.metrics = [m for m in self.metrics if m.timestamp > cutoff]

    def set_baseline(self, operation: str, stats: PerformanceStatistics) -> None:
        """Set performance baseline for regression detection.

        Args:
            operation: Operation name
            stats: Baseline statistics
        """
        self._baseline[operation] = stats

    def check_regression(self, operation: str, threshold_pct: float = 10.0) -> bool:
        """Check if current performance has regressed vs baseline.

        Args:
            operation: Operation to check
            threshold_pct: Degradation threshold (default: 10%)

        Returns:
            True if performance has regressed
        """
        if operation not in self._baseline:
            return False

        baseline = self._baseline[operation]
        current_stats = self.get_statistics(operation)

        if not current_stats:
            return False

        current = current_stats[0]

        # Check if throughput decreased by more than threshold
        degradation = (
            100.0 * (baseline.avg_throughput - current.avg_throughput) / baseline.avg_throughput
        )
        return degradation > threshold_pct


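A minimal usage sketch for PerformanceMonitor, assuming only the class above: time a workload, record it, read the aggregated statistics (get_statistics returns a list, one entry per operation name), then use set_baseline/check_regression for regression alerts. The 1,000-row workload below is a placeholder value, not real work.

# Illustrative sketch (editorial example, not part of performance.py).
import time

from confiture.core.anonymization.performance import PerformanceMonitor

monitor = PerformanceMonitor(retention_minutes=60)

start = time.time()
rows_done = 1_000  # pretend an anonymization pass over 1,000 rows happened here
monitor.record(
    "anonymize",
    duration_ms=(time.time() - start) * 1000,
    rows_processed=rows_done,
)

for stat in monitor.get_statistics("anonymize"):
    print(f"{stat.operation}: {stat.avg_throughput:.0f} rows/sec over {stat.count} runs")
    monitor.set_baseline(stat.operation, stat)

# On later runs, flag a >10% throughput drop against the stored baseline.
if monitor.check_regression("anonymize", threshold_pct=10.0):
    print("anonymize throughput regressed vs baseline")
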
class BatchAnonymizer:
    """Batch processing for anonymization.

    Optimizes database I/O by:
    - Reading in batches (reduces round-trips)
    - Processing in memory (avoids per-row database calls)
    - Writing in batches (reduces write latency)
    - Pipelining (overlap I/O and processing)

    Performance:
    - Reduces database round-trips from N to N/batch_size
    - Achieves 10K-20K rows/sec depending on strategy
    - Memory-efficient (streaming)
    - Suitable for large tables (millions of rows)

    Example:
        >>> anonymizer = BatchAnonymizer(conn, strategy, batch_size=10000)
        >>> result = anonymizer.anonymize_table("users", "email")
        >>> print(f"Anonymized {result['updated_rows']} rows")
    """

    def __init__(
        self,
        conn: psycopg.Connection,
        strategy: AnonymizationStrategy,
        batch_size: int = 10000,
        monitor: PerformanceMonitor | None = None,
    ):
        """Initialize batch anonymizer.

        Args:
            conn: Database connection
            strategy: Anonymization strategy
            batch_size: Number of rows per batch (default: 10000)
            monitor: Performance monitor (optional)
        """
        self.conn = conn
        self.strategy = strategy
        self.batch_size = batch_size
        self.monitor = monitor or PerformanceMonitor()

    def anonymize_table(
        self,
        table_name: str,
        column_name: str,
        where_clause: str | None = None,
    ) -> dict[str, Any]:
        """Anonymize a table column in batches.

        Args:
            table_name: Table to anonymize
            column_name: Column to anonymize
            where_clause: Optional WHERE clause to filter rows

        Returns:
            Dictionary with result statistics
        """
        start_time = time.time()
        total_rows = 0
        updated_rows = 0
        failed_rows = 0

        try:
            # Get total row count
            with self.conn.cursor() as cursor:
                count_query = sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table_name))
                if where_clause:
                    # Caller is responsible for ensuring where_clause is safe (not user input)
                    count_query = sql.SQL("{} WHERE {}").format(
                        count_query,
                        sql.SQL(where_clause),  # type: ignore[arg-type]
                    )
                cursor.execute(count_query)
                row = cursor.fetchone()
                total_rows = row[0] if row else 0

            logger.info(f"Anonymizing {table_name}.{column_name}: {total_rows} rows")

            # Process in batches
            offset = 0
            while offset < total_rows:
                batch_updated = self._process_batch(
                    table_name, column_name, offset, self.batch_size, where_clause
                )
                updated_rows += batch_updated
                offset += self.batch_size

                # Log progress every 100K rows
                if offset % 100000 == 0:
                    logger.info(
                        f"Progress: {offset}/{total_rows} rows ({100.0 * offset / total_rows:.1f}%)"
                    )

        except Exception as e:
            logger.error(f"Batch anonymization failed: {e}")
            failed_rows = total_rows - updated_rows

        duration_ms = (time.time() - start_time) * 1000

        # Record performance
        self.monitor.record(
            operation="batch_anonymize",
            duration_ms=duration_ms,
            rows_processed=updated_rows,
        )

        result = {
            "table": table_name,
            "column": column_name,
            "total_rows": total_rows,
            "updated_rows": updated_rows,
            "failed_rows": failed_rows,
            "duration_ms": duration_ms,
            "throughput_rows_per_sec": (updated_rows / duration_ms * 1000)
            if duration_ms > 0
            else 0,
        }

        logger.info(f"Batch anonymization complete: {result}")
        return result

    def _process_batch(
        self,
        table_name: str,
        column_name: str,
        offset: int,
        batch_size: int,
        where_clause: str | None = None,
    ) -> int:
        """Process a single batch.

        Args:
            table_name: Table to anonymize
            column_name: Column to anonymize
            offset: Batch offset
            batch_size: Number of rows per batch
            where_clause: Optional WHERE clause

        Returns:
            Number of rows updated
        """
        # Fetch batch
        select_query = sql.SQL("SELECT id, {} FROM {}").format(
            sql.Identifier(column_name),
            sql.Identifier(table_name),
        )
        if where_clause:
            # Caller is responsible for ensuring where_clause is safe (not user input)
            select_query = sql.SQL("{} WHERE {}").format(
                select_query,
                sql.SQL(where_clause),  # type: ignore[arg-type]
            )
        select_query = sql.SQL("{} LIMIT {} OFFSET {}").format(
            select_query, sql.Literal(batch_size), sql.Literal(offset)
        )

        with self.conn.cursor() as cursor:
            cursor.execute(select_query)
            rows = cursor.fetchall()

        if not rows:
            return 0

        # Anonymize in memory
        updates = []
        for row_id, value in rows:
            try:
                anonymized = self.strategy.anonymize(value)
                updates.append((row_id, anonymized))
            except Exception as e:
                logger.error(f"Anonymization failed for row {row_id}: {e}")

        # Update database (batch update)
        if updates:
            update_query = sql.SQL("UPDATE {} SET {} = %s WHERE id = %s").format(
                sql.Identifier(table_name),
                sql.Identifier(column_name),
            )
            with self.conn.cursor() as cursor:
                for row_id, anonymized in updates:
                    cursor.execute(update_query, (anonymized, row_id))
            self.conn.commit()

        return len(updates)


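A hedged usage sketch for BatchAnonymizer. It assumes a psycopg connection string and a stand-in strategy object exposing the .anonymize(value) interface this module calls (the package's real strategies live under confiture.core.anonymization.strategies). Note from _process_batch above that the target table needs an id column, since rows are selected and updated by id, and that where_clause is interpolated as raw SQL, so it must never come from user input.

# Illustrative sketch (editorial example, not part of performance.py).
import psycopg

from confiture.core.anonymization.performance import BatchAnonymizer, PerformanceMonitor


class RedactStrategy:
    """Editorial stand-in exposing the .anonymize(value) interface this module calls."""

    def anonymize(self, value):
        return "[REDACTED]"


conn = psycopg.connect("dbname=app user=app")  # assumed connection string
batch = BatchAnonymizer(conn, RedactStrategy(), batch_size=10_000, monitor=PerformanceMonitor())

# The target table needs an `id` column; where_clause is raw SQL, never user input.
result = batch.anonymize_table("users", "email", where_clause="created_at < '2024-01-01'")
print(
    f"{result['updated_rows']}/{result['total_rows']} rows, "
    f"{result['throughput_rows_per_sec']:.0f} rows/sec"
)
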
class ConcurrentAnonymizer:
    """Concurrent processing using thread pool.

    Parallelizes anonymization across multiple workers:
    - Multiple worker threads
    - Shared connection pool
    - Work queue distribution
    - Thread-safe operation tracking

    Performance:
    - 2-4x speedup with 4 workers (I/O bound)
    - Achieves 20K-35K rows/sec with tuning
    - Uses connection pooling to avoid connection limits
    - Suitable for multi-core systems

    Limitations:
    - GIL limits CPU-intensive strategies (use multiprocessing instead)
    - Connection pool must support concurrent access
    - Requires careful synchronization for shared state

    Example:
        >>> anonymizer = ConcurrentAnonymizer(conn, strategy, num_workers=4)
        >>> result = anonymizer.anonymize_table("users", "email")
        >>> print(f"Processed {result['throughput_rows_per_sec']:.0f} rows/sec")
    """

    def __init__(
        self,
        conn: psycopg.Connection,
        strategy: AnonymizationStrategy,
        num_workers: int = 4,
        batch_size: int = 5000,
        monitor: PerformanceMonitor | None = None,
    ):
        """Initialize concurrent anonymizer.

        Args:
            conn: Database connection (must support concurrent access)
            strategy: Anonymization strategy
            num_workers: Number of worker threads (default: 4)
            batch_size: Rows per batch per worker
            monitor: Performance monitor (optional)
        """
        self.conn = conn
        self.strategy = strategy
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.monitor = monitor or PerformanceMonitor()

    def anonymize_table(
        self,
        table_name: str,
        column_name: str,
        where_clause: str | None = None,
    ) -> dict[str, Any]:
        """Anonymize table with concurrent workers.

        Args:
            table_name: Table to anonymize
            column_name: Column to anonymize
            where_clause: Optional WHERE clause

        Returns:
            Dictionary with result statistics
        """
        start_time = time.time()
        total_rows = 0
        updated_rows = 0
        failed_rows = 0

        try:
            # Get total row count
            with self.conn.cursor() as cursor:
                count_query = sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table_name))
                if where_clause:
                    # Caller is responsible for ensuring where_clause is safe (not user input)
                    count_query = sql.SQL("{} WHERE {}").format(
                        count_query,
                        sql.SQL(where_clause),  # type: ignore[arg-type]
                    )
                cursor.execute(count_query)
                row = cursor.fetchone()
                total_rows = row[0] if row else 0

            logger.info(
                f"Anonymizing {table_name}.{column_name} "
                f"with {self.num_workers} workers: {total_rows} rows"
            )

            # Create work queue (batch offsets)
            work_queue = []
            for offset in range(0, total_rows, self.batch_size):
                work_queue.append((table_name, column_name, offset, where_clause))

            # Process with thread pool
            with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
                futures = [
                    executor.submit(self._process_batch_concurrent, *task) for task in work_queue
                ]

                for future in as_completed(futures):
                    try:
                        batch_updated = future.result()
                        updated_rows += batch_updated
                    except Exception as e:
                        logger.error(f"Worker failed: {e}")
                        failed_rows += 1

        except Exception as e:
            logger.error(f"Concurrent anonymization failed: {e}")
            failed_rows = total_rows - updated_rows

        duration_ms = (time.time() - start_time) * 1000

        # Record performance
        self.monitor.record(
            operation="concurrent_anonymize",
            duration_ms=duration_ms,
            rows_processed=updated_rows,
        )

        result = {
            "table": table_name,
            "column": column_name,
            "total_rows": total_rows,
            "updated_rows": updated_rows,
            "failed_rows": failed_rows,
            "workers": self.num_workers,
            "duration_ms": duration_ms,
            "throughput_rows_per_sec": (updated_rows / duration_ms * 1000)
            if duration_ms > 0
            else 0,
        }

        logger.info(f"Concurrent anonymization complete: {result}")
        return result

    def _process_batch_concurrent(
        self,
        table_name: str,
        column_name: str,
        offset: int,
        where_clause: str | None = None,
    ) -> int:
        """Process a batch in a worker thread.

        Args:
            table_name: Table to anonymize
            column_name: Column to anonymize
            offset: Batch offset
            where_clause: Optional WHERE clause

        Returns:
            Number of rows updated
        """
        # Each worker gets its own connection
        try:
            worker_conn = self.conn.copy()
        except Exception:
            # Fallback: reuse main connection (less ideal)
            worker_conn = self.conn

        try:
            # Fetch batch
            select_query = sql.SQL("SELECT id, {} FROM {}").format(
                sql.Identifier(column_name),
                sql.Identifier(table_name),
            )
            if where_clause:
                # Caller is responsible for ensuring where_clause is safe (not user input)
                select_query = sql.SQL("{} WHERE {}").format(
                    select_query,
                    sql.SQL(where_clause),  # type: ignore[arg-type]
                )
            select_query = sql.SQL("{} LIMIT {} OFFSET {}").format(
                select_query, sql.Literal(self.batch_size), sql.Literal(offset)
            )

            with worker_conn.cursor() as cursor:
                cursor.execute(select_query)
                rows = cursor.fetchall()

            if not rows:
                return 0

            # Anonymize in memory
            updates = []
            for row_id, value in rows:
                try:
                    anonymized = self.strategy.anonymize(value)
                    updates.append((row_id, anonymized))
                except Exception as e:
                    logger.error(f"Anonymization failed for row {row_id}: {e}")

            # Update database (batch update)
            if updates:
                update_query = sql.SQL("UPDATE {} SET {} = %s WHERE id = %s").format(
                    sql.Identifier(table_name),
                    sql.Identifier(column_name),
                )
                with worker_conn.cursor() as cursor:
                    for row_id, anonymized in updates:
                        cursor.execute(update_query, (anonymized, row_id))
                worker_conn.commit()

            return len(updates)

        except Exception as e:
            logger.error(f"Worker batch processing failed: {e}")
            return 0


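A usage sketch for ConcurrentAnonymizer under the same assumptions as the BatchAnonymizer sketch above. Each worker attempts to duplicate the connection and otherwise falls back to the shared one, so the constructor's requirement that the connection tolerate concurrent access matters in practice; batches are handed to the thread pool as offset ranges.

# Illustrative sketch (editorial example, not part of performance.py).
from confiture.core.anonymization.performance import ConcurrentAnonymizer

# `conn` and RedactStrategy as in the BatchAnonymizer sketch above.
concurrent = ConcurrentAnonymizer(conn, RedactStrategy(), num_workers=4, batch_size=5_000)
result = concurrent.anonymize_table("users", "email")

print(
    f"{result['updated_rows']} rows with {result['workers']} workers "
    f"at {result['throughput_rows_per_sec']:.0f} rows/sec"
)
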
class CacheEntry:
    """Single cache entry with expiration and stats."""

    def __init__(self, original_value: Any, anonymized_value: Any, ttl_seconds: int = 3600):
        """Initialize cache entry.

        Args:
            original_value: Original value
            anonymized_value: Anonymized value
            ttl_seconds: Time-to-live (default: 1 hour)
        """
        self.original_value = original_value
        self.anonymized_value = anonymized_value
        self.created_at = datetime.now()
        self.expires_at = datetime.now() + timedelta(seconds=ttl_seconds)
        self.access_count = 0
        self.last_accessed = datetime.now()

    def is_expired(self) -> bool:
        """Check if entry has expired."""
        return datetime.now() > self.expires_at

    def record_access(self) -> None:
        """Record access for LRU tracking."""
        self.access_count += 1
        self.last_accessed = datetime.now()


@dataclass
class CacheStatistics:
    """Cache performance statistics."""

    hits: int = 0
    """Number of cache hits."""

    misses: int = 0
    """Number of cache misses."""

    evictions: int = 0
    """Number of evictions."""

    avg_lookup_time_us: float = 0.0
    """Average lookup time in microseconds."""

    total_entries: int = 0
    """Current entries in cache."""

    max_entries: int = 0
    """Maximum cache size."""

    @property
    def hit_rate(self) -> float:
        """Calculate hit rate percentage."""
        total = self.hits + self.misses
        return (100.0 * self.hits / total) if total > 0 else 0.0


class AnonymizationCache:
    """In-memory cache for anonymization results.

    Caches mapping of original→anonymized values to avoid re-computing
    identical values. Uses LRU eviction when cache grows too large.

    Features:
    - Deterministic caching (same input → same output)
    - TTL-based expiration
    - LRU eviction policy
    - Thread-safe access
    - Performance tracking

    Example:
        >>> cache = AnonymizationCache(max_entries=10000)
        >>> cache.set("john@example.com", "TOKEN_abc123")
        >>> result = cache.get("john@example.com")
        >>> stats = cache.get_statistics()
    """

    def __init__(self, max_entries: int = 10000, ttl_seconds: int = 3600):
        """Initialize cache.

        Args:
            max_entries: Maximum cache size
            ttl_seconds: Entry time-to-live
        """
        self.max_entries = max_entries
        self.ttl_seconds = ttl_seconds
        self._cache: dict[str, CacheEntry] = {}
        self._lock = threading.Lock()
        self._hits = 0
        self._misses = 0
        self._evictions = 0
        self._lookup_times: list[float] = []

    def get(self, original_value: Any) -> Any | None:
        """Get anonymized value from cache.

        Args:
            original_value: Value to look up

        Returns:
            Anonymized value if found and not expired, None otherwise
        """
        start_time = time.time() * 1e6  # microseconds

        key = str(original_value)

        with self._lock:
            if key not in self._cache:
                self._misses += 1
                self._record_lookup_time(start_time)
                return None

            entry = self._cache[key]

            if entry.is_expired():
                del self._cache[key]
                self._misses += 1
                self._record_lookup_time(start_time)
                return None

            entry.record_access()
            self._hits += 1
            self._record_lookup_time(start_time)
            return entry.anonymized_value

    def set(self, original_value: Any, anonymized_value: Any) -> None:
        """Set cached anonymization result.

        Args:
            original_value: Original value
            anonymized_value: Anonymized value
        """
        key = str(original_value)

        with self._lock:
            # Check if cache is full
            if len(self._cache) >= self.max_entries:
                self._evict_lru()

            self._cache[key] = CacheEntry(original_value, anonymized_value, self.ttl_seconds)

    def clear(self) -> None:
        """Clear entire cache."""
        with self._lock:
            self._cache.clear()

    def get_statistics(self) -> CacheStatistics:
        """Get cache statistics."""
        with self._lock:
            avg_lookup = (
                sum(self._lookup_times) / len(self._lookup_times) if self._lookup_times else 0.0
            )

            return CacheStatistics(
                hits=self._hits,
                misses=self._misses,
                evictions=self._evictions,
                avg_lookup_time_us=avg_lookup,
                total_entries=len(self._cache),
                max_entries=self.max_entries,
            )

    def _evict_lru(self) -> None:
        """Evict least-recently-used entry."""
        if not self._cache:
            return

        lru_key = min(self._cache.keys(), key=lambda k: self._cache[k].last_accessed)
        del self._cache[lru_key]
        self._evictions += 1

    def _record_lookup_time(self, start_time_us: float) -> None:
        """Record lookup time."""
        duration_us = time.time() * 1e6 - start_time_us
        self._lookup_times.append(duration_us)

        # Keep only last 1000 lookups to avoid unbounded list
        if len(self._lookup_times) > 1000:
            self._lookup_times = self._lookup_times[-1000:]


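A sketch of wrapping a deterministic transformation with AnonymizationCache. Keys are the str() of the original value, and the cache is only appropriate when identical input must always map to identical output; the hash-based pseudonymizer below is an editorial stand-in, not a strategy shipped by the package.

# Illustrative sketch (editorial example, not part of performance.py).
import hashlib

from confiture.core.anonymization.performance import AnonymizationCache

cache = AnonymizationCache(max_entries=50_000, ttl_seconds=900)


def _pseudonymize(value: str) -> str:
    # Deterministic stand-in for a real AnonymizationStrategy.anonymize().
    return "TOKEN_" + hashlib.sha256(value.encode()).hexdigest()[:12]


def anonymize_cached(value: str) -> str:
    hit = cache.get(value)
    if hit is not None:
        return hit
    anonymized = _pseudonymize(value)
    cache.set(value, anonymized)
    return anonymized


anonymize_cached("john@example.com")
anonymize_cached("john@example.com")  # served from cache
stats = cache.get_statistics()
print(f"hit rate {stats.hit_rate:.1f}% ({stats.hits} hits, {stats.misses} misses)")
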
class ConnectionPoolManager:
    """Manage database connection pooling.

    Provides efficient connection reuse for concurrent operations:
    - Connection pool with configurable size
    - Automatic connection recycling
    - Health checking
    - Connection borrowing and returning
    - Thread-safe access

    Example:
        >>> pool = ConnectionPoolManager(min_size=5, max_size=20)
        >>> pool.initialize(conn_params)
        >>> conn = pool.borrow()
        >>> try:
        ...     ...  # use the connection
        ... finally:
        ...     pool.return_connection(conn)
    """

    def __init__(self, min_size: int = 5, max_size: int = 20):
        """Initialize connection pool manager.

        Args:
            min_size: Minimum pool size
            max_size: Maximum pool size
        """
        self.min_size = min_size
        self.max_size = max_size
        self._connections: list[psycopg.Connection] = []
        self._available: list[psycopg.Connection] = []
        self._in_use: set[psycopg.Connection] = set()
        self._lock = threading.Lock()
        self._initialized = False

    def initialize(self, conn_params: dict[str, Any]) -> None:
        """Initialize connection pool.

        Args:
            conn_params: Connection parameters (host, dbname, user, password, etc.)
        """
        with self._lock:
            for _ in range(self.min_size):
                try:
                    conn = psycopg.connect(**conn_params)
                    self._connections.append(conn)
                    self._available.append(conn)
                except psycopg.Error as e:
                    logger.error(f"Failed to create connection: {e}")

            self._initialized = True
            logger.info(f"Connection pool initialized: {len(self._available)}/{self.min_size}")

    def borrow(self, timeout_seconds: int = 30) -> psycopg.Connection | None:
        """Borrow connection from pool.

        Args:
            timeout_seconds: Max wait time for available connection

        Returns:
            Connection or None if timeout
        """
        start_time = time.time()

        while time.time() - start_time < timeout_seconds:
            with self._lock:
                if self._available:
                    conn = self._available.pop()
                    if self._check_connection_health(conn):
                        self._in_use.add(conn)
                        return conn

                # Create new connection if under max_size
                if len(self._connections) < self.max_size:
                    try:
                        conn = psycopg.connect()  # Use cached params
                        self._connections.append(conn)
                        self._in_use.add(conn)
                        return conn
                    except psycopg.Error:
                        pass

            time.sleep(0.1)

        logger.warning("Connection pool timeout - no available connections")
        return None

    def return_connection(self, conn: psycopg.Connection) -> None:
        """Return connection to pool.

        Args:
            conn: Connection to return
        """
        with self._lock:
            if conn in self._in_use:
                self._in_use.remove(conn)

            if self._check_connection_health(conn):
                self._available.append(conn)
            else:
                # Remove unhealthy connection
                if conn in self._connections:
                    self._connections.remove(conn)
                with contextlib.suppress(psycopg.Error):
                    conn.close()

    def close_all(self) -> None:
        """Close all connections in pool."""
        with self._lock:
            for conn in self._connections:
                with contextlib.suppress(psycopg.Error):
                    conn.close()

            self._connections.clear()
            self._available.clear()
            self._in_use.clear()

    def _check_connection_health(self, conn: psycopg.Connection) -> bool:
        """Check if connection is healthy."""
        try:
            with conn.cursor() as cursor:
                cursor.execute("SELECT 1")
            return True
        except psycopg.Error:
            return False


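A borrow/return sketch for ConnectionPoolManager. borrow() can return None on timeout, so check for it; also note that when the pool grows beyond min_size, the code above calls psycopg.connect() with no arguments, so growth past the initial connections relies on environment-provided connection defaults. The connection parameters below are assumptions.

# Illustrative sketch (editorial example, not part of performance.py).
from confiture.core.anonymization.performance import ConnectionPoolManager

pool = ConnectionPoolManager(min_size=5, max_size=20)
pool.initialize({"host": "localhost", "dbname": "app", "user": "app"})  # assumed params

conn = pool.borrow(timeout_seconds=10)
if conn is None:
    raise RuntimeError("no pooled connection available")
try:
    with conn.cursor() as cur:
        cur.execute("SELECT count(*) FROM users")
        print(cur.fetchone())
finally:
    pool.return_connection(conn)

pool.close_all()
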
class QueryOptimizer:
    """Optimize queries for anonymization operations.

    Analyzes and optimizes SQL queries:
    - EXPLAIN ANALYZE integration
    - Index recommendations
    - Slow query detection
    - Query plan analysis
    - Cost estimation

    Example:
        >>> optimizer = QueryOptimizer(conn)
        >>> plan = optimizer.analyze_query("SELECT * FROM users WHERE email = %s", ("test@example.com",))
        >>> stats = optimizer.get_statistics()
    """

    def __init__(self, conn: psycopg.Connection):
        """Initialize query optimizer.

        Args:
            conn: Database connection
        """
        self.conn = conn
        self._query_stats: dict[str, dict[str, Any]] = {}

    def analyze_query(self, query: str, params: tuple[Any, ...] | None = None) -> dict[str, Any]:
        """Analyze query execution plan.

        Args:
            query: SQL query
            params: Query parameters

        Returns:
            Execution plan analysis
        """
        try:
            with self.conn.cursor() as cursor:
                # Get EXPLAIN ANALYZE output
                explain_query = f"EXPLAIN ANALYZE {query}"
                cursor.execute(explain_query, params or ())
                plan = cursor.fetchall()

            analysis = {
                "query": query,
                "plan": plan,
                "indexed": self._check_indexes(query),
                "estimated_rows": self._extract_rows(plan),
                "is_slow": self._is_slow_query(plan),
                "recommendations": self._get_recommendations(query, plan),
            }

            # Cache for statistics
            query_hash = hash(query)
            self._query_stats[str(query_hash)] = analysis

            return analysis

        except psycopg.Error as e:
            logger.error(f"Query analysis failed: {e}")
            return {"error": str(e)}

    def recommend_indexes(self, table_name: str, column_names: list[str]) -> list[str]:
        """Recommend indexes for table.

        Args:
            table_name: Table to analyze
            column_names: Columns to index

        Returns:
            List of recommended index creation statements
        """
        recommendations = []

        for column_name in column_names:
            index_name = f"idx_{table_name}_{column_name}"
            recommendations.append(f"CREATE INDEX {index_name} ON {table_name}({column_name})")

        return recommendations

    def _check_indexes(self, _query: str) -> bool:
        """Check if query uses indexes."""
        return "Index" in str(self._query_stats)

    def _extract_rows(self, plan: list[tuple]) -> int:
        """Extract estimated rows from plan."""
        for row in plan:
            if "rows=" in str(row):
                return int(str(row).split("rows=")[1].split(" ")[0])
        return 0

    def _is_slow_query(self, plan: list[tuple]) -> bool:
        """Detect if query is slow."""
        plan_str = str(plan)
        return "Seq Scan" in plan_str or "Sort" in plan_str and "Sequential" in plan_str

    def _get_recommendations(self, _query: str, plan: list[tuple]) -> list[str]:
        """Get recommendations for query optimization."""
        recommendations = []

        if "Seq Scan" in str(plan):
            recommendations.append("Add index on WHERE clause columns")

        if "Sort" in str(plan):
            recommendations.append("Consider index on ORDER BY columns")

        return recommendations

    def get_statistics(self) -> dict[str, Any]:
        """Get query optimization statistics."""
        return {
            "total_queries_analyzed": len(self._query_stats),
            "slow_queries": sum(1 for stats in self._query_stats.values() if stats.get("is_slow")),
            "queries_with_recommendations": sum(
                1 for stats in self._query_stats.values() if stats.get("recommendations")
            ),
        }
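A closing sketch for QueryOptimizer, assuming conn as in the earlier sketches. Because analyze_query wraps the statement in EXPLAIN ANALYZE, PostgreSQL actually executes it, so point it at read-only SELECTs (or run it inside a transaction you roll back) rather than at destructive statements.

# Illustrative sketch (editorial example, not part of performance.py).
from confiture.core.anonymization.performance import QueryOptimizer

optimizer = QueryOptimizer(conn)  # `conn` assumed as in the earlier sketches

analysis = optimizer.analyze_query(
    "SELECT id, email FROM users WHERE email = %s", ("test@example.com",)
)
if analysis.get("is_slow"):
    for hint in analysis["recommendations"]:
        print("hint:", hint)

for ddl in optimizer.recommend_indexes("users", ["email"]):
    print(ddl)  # e.g. CREATE INDEX idx_users_email ON users(email)

print(optimizer.get_statistics())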