fraiseql-confiture 0.3.7__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cpython-311-darwin.so +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1893 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +184 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +882 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +24 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +265 -0
- confiture/models/schema.py +203 -0
- confiture/models/sql_file_migration.py +225 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +100 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/loader.py +225 -0
- confiture/testing/pytest/__init__.py +38 -0
- confiture/testing/pytest_plugin.py +190 -0
- confiture/testing/sandbox.py +304 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.7.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.7.dist-info/RECORD +124 -0
- fraiseql_confiture-0.3.7.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.7.dist-info/entry_points.txt +4 -0
- fraiseql_confiture-0.3.7.dist-info/licenses/LICENSE +21 -0
confiture/core/syncer.py
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
"""Production data synchronization.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to sync data from production databases to
|
|
4
|
+
local/staging environments with PII anonymization support.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import random
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import psycopg
|
|
17
|
+
from rich.progress import BarColumn, Progress, TextColumn, TimeRemainingColumn
|
|
18
|
+
|
|
19
|
+
from confiture.config.environment import DatabaseConfig
|
|
20
|
+
from confiture.core.connection import create_connection
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class TableSelection:
    """Configuration for selecting which tables to sync.

    A ``None`` (or empty) ``include`` means "all tables"; ``exclude`` is
    applied after ``include``.

    NOTE(review): despite the "patterns" wording below, selection in
    ``ProductionSyncer.select_tables`` is by exact table name — no wildcard
    expansion is performed there; confirm before relying on patterns.
    """

    include: list[str] | None = None  # Explicit table list or patterns
    exclude: list[str] | None = None  # Tables/patterns to exclude
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class AnonymizationRule:
    """Rule for anonymizing a specific column.

    Applied row-by-row during sync; rules naming columns that do not exist
    in the table are silently skipped, and unknown strategies fall back to
    redaction (see ``ProductionSyncer._anonymize_value``).
    """

    column: str
    strategy: str  # 'email', 'phone', 'name', 'redact', 'hash'
    seed: int | None = None  # For reproducible anonymization
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class SyncConfig:
    """Configuration for a data sync operation (consumed by ``ProductionSyncer.sync``)."""

    tables: TableSelection  # Which tables to copy
    anonymization: dict[str, list[AnonymizationRule]] | None = None  # table -> rules
    batch_size: int = 5000  # Optimized based on benchmarks; used on the anonymization path
    resume: bool = False  # Skip tables recorded as completed in the checkpoint
    show_progress: bool = False  # Render a rich progress bar during sync
    checkpoint_file: Path | None = None  # Where sync state is saved/loaded
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class TableMetrics:
    """Performance metrics for a single table sync."""

    rows_synced: int  # Rows copied into the target table
    elapsed_seconds: float  # Wall-clock duration of the table sync
    rows_per_second: float  # rows_synced / elapsed_seconds (0 when elapsed is 0)
    synced_at: str  # ISO-8601 timestamp of completion
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ProductionSyncer:
|
|
63
|
+
"""Synchronize data from production to target database.
|
|
64
|
+
|
|
65
|
+
Features:
|
|
66
|
+
- Table selection with include/exclude patterns
|
|
67
|
+
- Schema-aware data copying
|
|
68
|
+
- PII anonymization
|
|
69
|
+
- Progress reporting
|
|
70
|
+
- Resume support for interrupted syncs
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
def __init__(
|
|
74
|
+
self,
|
|
75
|
+
source: DatabaseConfig | str,
|
|
76
|
+
target: DatabaseConfig | str,
|
|
77
|
+
):
|
|
78
|
+
"""Initialize syncer with source and target databases.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
source: Source database config or environment name
|
|
82
|
+
target: Target database config or environment name
|
|
83
|
+
"""
|
|
84
|
+
from confiture.config.environment import Environment
|
|
85
|
+
|
|
86
|
+
# Load configs if strings provided
|
|
87
|
+
if isinstance(source, str):
|
|
88
|
+
source = Environment.load(source).database
|
|
89
|
+
|
|
90
|
+
if isinstance(target, str):
|
|
91
|
+
target = Environment.load(target).database
|
|
92
|
+
|
|
93
|
+
self.source_config = source
|
|
94
|
+
self.target_config = target
|
|
95
|
+
|
|
96
|
+
self._source_conn: psycopg.Connection[Any] | None = None
|
|
97
|
+
self._target_conn: psycopg.Connection[Any] | None = None
|
|
98
|
+
|
|
99
|
+
# Progress tracking and metrics
|
|
100
|
+
self._metrics: dict[str, TableMetrics] = {}
|
|
101
|
+
self._completed_tables: set[str] = set()
|
|
102
|
+
self._checkpoint_data: dict[str, Any] = {}
|
|
103
|
+
|
|
104
|
+
    def __enter__(self) -> "ProductionSyncer":
        """Open connections to both the source and target databases on entry."""
        self._source_conn = create_connection(self.source_config)
        self._target_conn = create_connection(self.target_config)
        return self
|
|
109
|
+
|
|
110
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
|
111
|
+
"""Context manager exit."""
|
|
112
|
+
if self._source_conn:
|
|
113
|
+
self._source_conn.close()
|
|
114
|
+
if self._target_conn:
|
|
115
|
+
self._target_conn.close()
|
|
116
|
+
|
|
117
|
+
def get_all_tables(self) -> list[str]:
|
|
118
|
+
"""Get list of all user tables in source database.
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
List of table names in public schema
|
|
122
|
+
"""
|
|
123
|
+
if not self._source_conn:
|
|
124
|
+
raise RuntimeError("Not connected. Use context manager.")
|
|
125
|
+
|
|
126
|
+
with self._source_conn.cursor() as cursor:
|
|
127
|
+
cursor.execute("""
|
|
128
|
+
SELECT tablename
|
|
129
|
+
FROM pg_tables
|
|
130
|
+
WHERE schemaname = 'public'
|
|
131
|
+
ORDER BY tablename
|
|
132
|
+
""")
|
|
133
|
+
return [row[0] for row in cursor.fetchall()]
|
|
134
|
+
|
|
135
|
+
def select_tables(self, selection: TableSelection) -> list[str]:
|
|
136
|
+
"""Select tables based on include/exclude patterns.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
selection: Table selection configuration
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
List of table names to sync
|
|
143
|
+
"""
|
|
144
|
+
all_tables = self.get_all_tables()
|
|
145
|
+
|
|
146
|
+
# If explicit include list, start with those
|
|
147
|
+
if selection.include:
|
|
148
|
+
tables = [t for t in all_tables if t in selection.include]
|
|
149
|
+
else:
|
|
150
|
+
tables = all_tables
|
|
151
|
+
|
|
152
|
+
# Apply exclusions
|
|
153
|
+
if selection.exclude:
|
|
154
|
+
tables = [t for t in tables if t not in selection.exclude]
|
|
155
|
+
|
|
156
|
+
return tables
|
|
157
|
+
|
|
158
|
+
def _anonymize_value(self, value: Any, strategy: str, seed: int | None = None) -> Any:
|
|
159
|
+
"""Anonymize a single value based on strategy.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
value: Original value to anonymize
|
|
163
|
+
strategy: Anonymization strategy ('email', 'phone', 'name', 'redact', 'hash')
|
|
164
|
+
seed: Optional seed for deterministic anonymization
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Anonymized value
|
|
168
|
+
"""
|
|
169
|
+
if value is None:
|
|
170
|
+
return None
|
|
171
|
+
|
|
172
|
+
# Set random seed for deterministic anonymization
|
|
173
|
+
if seed is not None:
|
|
174
|
+
random.seed(f"{seed}:{value}")
|
|
175
|
+
|
|
176
|
+
if strategy == "email":
|
|
177
|
+
# Generate deterministic fake email
|
|
178
|
+
hash_value = hashlib.sha256(str(value).encode()).hexdigest()[:8]
|
|
179
|
+
return f"user_{hash_value}@example.com"
|
|
180
|
+
|
|
181
|
+
elif strategy == "phone":
|
|
182
|
+
# Generate fake phone number
|
|
183
|
+
if seed is not None:
|
|
184
|
+
# Deterministic based on seed
|
|
185
|
+
hash_int = int(hashlib.sha256(str(value).encode()).hexdigest()[:8], 16)
|
|
186
|
+
number = hash_int % 10000
|
|
187
|
+
else:
|
|
188
|
+
number = random.randint(1000, 9999)
|
|
189
|
+
return f"+1-555-{number}"
|
|
190
|
+
|
|
191
|
+
elif strategy == "name":
|
|
192
|
+
# Generate fake name
|
|
193
|
+
hash_str = hashlib.sha256(str(value).encode()).hexdigest()[:8]
|
|
194
|
+
return f"User {hash_str[:4].upper()}"
|
|
195
|
+
|
|
196
|
+
elif strategy == "redact":
|
|
197
|
+
# Simply redact the value
|
|
198
|
+
return "[REDACTED]"
|
|
199
|
+
|
|
200
|
+
elif strategy == "hash":
|
|
201
|
+
# One-way hash (preserves uniqueness)
|
|
202
|
+
return hashlib.sha256(str(value).encode()).hexdigest()[:16]
|
|
203
|
+
|
|
204
|
+
else:
|
|
205
|
+
# Unknown strategy, redact by default
|
|
206
|
+
return "[REDACTED]"
|
|
207
|
+
|
|
208
|
+
    def sync_table(
        self,
        table_name: str,
        anonymization_rules: list[AnonymizationRule] | None = None,
        batch_size: int = 5000,  # Optimized based on benchmarks
        progress_task: Any = None,
        progress: Progress | None = None,
    ) -> int:
        """Sync a single table from source to target.

        The target table is truncated first (with CASCADE), its triggers are
        disabled for the duration of the load, and the row count is verified
        against the source afterwards.

        Args:
            table_name: Name of table to sync
            anonymization_rules: Optional anonymization rules for PII
            batch_size: Number of rows per batch (default 5000, optimized via benchmarks)
            progress_task: Rich progress task ID for updating progress
            progress: Progress instance

        Returns:
            Number of rows synced

        Raises:
            RuntimeError: If not connected, or if the post-copy row count
                does not match the source count.
        """
        if not self._source_conn or not self._target_conn:
            raise RuntimeError("Not connected. Use context manager.")

        start_time = time.time()

        with self._source_conn.cursor() as src_cursor, self._target_conn.cursor() as dst_cursor:
            # Truncate target table first. Destructive: CASCADE also empties
            # tables referencing this one. NOTE: table_name is interpolated
            # directly into SQL; it is expected to come from the database
            # catalog (get_all_tables), not from untrusted input.
            dst_cursor.execute(f"TRUNCATE TABLE {table_name} CASCADE")

            # Get row count for verification.
            # NOTE(review): the count is taken before the copy; concurrent
            # writes on the source between these statements could make the
            # verification below fail spuriously — confirm isolation level.
            src_cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            expected_row = src_cursor.fetchone()
            expected_count: int = expected_row[0] if expected_row else 0

            # Update progress with total
            if progress and progress_task is not None:
                progress.update(progress_task, total=expected_count)

            # Temporarily disable triggers to allow FK constraint violations
            # (tables load one at a time, so referenced rows may not exist
            # yet). Presumably requires owner/superuser rights on the target
            # — TODO confirm.
            dst_cursor.execute(f"ALTER TABLE {table_name} DISABLE TRIGGER ALL")

            try:
                if anonymization_rules:
                    # Anonymization path: fetch, anonymize, insert
                    actual_count = self._sync_with_anonymization(
                        src_cursor,
                        dst_cursor,
                        table_name,
                        anonymization_rules,
                        batch_size,
                        progress_task,
                        progress,
                    )
                else:
                    # Fast path: direct COPY
                    actual_count = self._sync_with_copy(
                        src_cursor,
                        dst_cursor,
                        table_name,
                        progress_task,
                        progress,
                    )
            finally:
                # Re-enable triggers even if the copy raised
                dst_cursor.execute(f"ALTER TABLE {table_name} ENABLE TRIGGER ALL")

            # Commit target transaction
            self._target_conn.commit()

            # Verify row count (raises after commit — data is already in place)
            if actual_count != expected_count:
                raise RuntimeError(
                    f"Row count mismatch for {table_name}: "
                    f"expected {expected_count}, got {actual_count}"
                )

            # Track metrics for get_metrics()/checkpointing
            elapsed = time.time() - start_time
            rows_per_second = actual_count / elapsed if elapsed > 0 else 0
            self._metrics[table_name] = TableMetrics(
                rows_synced=actual_count,
                elapsed_seconds=elapsed,
                rows_per_second=rows_per_second,
                synced_at=datetime.now().isoformat(),
            )
            self._completed_tables.add(table_name)

            return actual_count
|
|
296
|
+
|
|
297
|
+
def _sync_with_copy(
|
|
298
|
+
self,
|
|
299
|
+
src_cursor: Any,
|
|
300
|
+
dst_cursor: Any,
|
|
301
|
+
table_name: str,
|
|
302
|
+
progress_task: Any = None,
|
|
303
|
+
progress: Progress | None = None,
|
|
304
|
+
) -> int:
|
|
305
|
+
"""Fast sync using COPY (no anonymization).
|
|
306
|
+
|
|
307
|
+
Args:
|
|
308
|
+
src_cursor: Source database cursor
|
|
309
|
+
dst_cursor: Target database cursor
|
|
310
|
+
table_name: Name of table to sync
|
|
311
|
+
progress_task: Progress task ID
|
|
312
|
+
progress: Progress instance
|
|
313
|
+
|
|
314
|
+
Returns:
|
|
315
|
+
Number of rows synced
|
|
316
|
+
"""
|
|
317
|
+
with (
|
|
318
|
+
src_cursor.copy(f"COPY {table_name} TO STDOUT") as copy_out,
|
|
319
|
+
dst_cursor.copy(f"COPY {table_name} FROM STDIN") as copy_in,
|
|
320
|
+
):
|
|
321
|
+
for data in copy_out:
|
|
322
|
+
copy_in.write(data)
|
|
323
|
+
if progress and progress_task is not None:
|
|
324
|
+
progress.update(progress_task, advance=1)
|
|
325
|
+
|
|
326
|
+
# Get final count
|
|
327
|
+
dst_cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
|
|
328
|
+
result = dst_cursor.fetchone()
|
|
329
|
+
return int(result[0]) if result else 0
|
|
330
|
+
|
|
331
|
+
    def _sync_with_anonymization(
        self,
        src_cursor: Any,
        dst_cursor: Any,
        table_name: str,
        anonymization_rules: list[AnonymizationRule],
        batch_size: int,
        progress_task: Any = None,
        progress: Progress | None = None,
    ) -> int:
        """Sync with anonymization (slower, row-by-row).

        Rows are read from the source cursor, the configured columns are
        rewritten via ``_anonymize_value``, and results are inserted in
        batches of ``batch_size``.

        Args:
            src_cursor: Source database cursor
            dst_cursor: Target database cursor
            table_name: Name of table to sync
            anonymization_rules: List of anonymization rules
            batch_size: Batch size for inserts
            progress_task: Progress task ID
            progress: Progress instance

        Returns:
            Number of rows synced
        """
        # Get column names via a zero-row select (populates cursor.description)
        src_cursor.execute(f"SELECT * FROM {table_name} LIMIT 0")
        column_names = [desc[0] for desc in src_cursor.description]

        # Build column index -> rule map for fast per-row lookup.
        # NOTE(review): rules referencing columns that don't exist in this
        # table are silently ignored — consider surfacing a warning.
        anonymize_map: dict[int, AnonymizationRule] = {}
        for rule in anonymization_rules:
            if rule.column in column_names:
                col_idx = column_names.index(rule.column)
                anonymize_map[col_idx] = rule

        # Fetch all rows.
        # NOTE(review): a client-side cursor buffers the full result set in
        # memory for large tables — confirm whether a server-side (named)
        # cursor is needed here.
        src_cursor.execute(f"SELECT * FROM {table_name}")

        # Process in batches
        rows_synced = 0
        batch = []

        for row in src_cursor:
            # Anonymize only the mapped columns; others pass through untouched
            anonymized_row = list(row)
            for col_idx, rule in anonymize_map.items():
                anonymized_row[col_idx] = self._anonymize_value(
                    row[col_idx], rule.strategy, rule.seed
                )

            batch.append(tuple(anonymized_row))

            # Insert batch when full
            if len(batch) >= batch_size:
                self._insert_batch(dst_cursor, table_name, column_names, batch)
                rows_synced += len(batch)
                if progress and progress_task is not None:
                    progress.update(progress_task, advance=len(batch))
                batch = []

        # Insert remaining rows (final partial batch)
        if batch:
            self._insert_batch(dst_cursor, table_name, column_names, batch)
            rows_synced += len(batch)
            if progress and progress_task is not None:
                progress.update(progress_task, advance=len(batch))

        return rows_synced
|
|
399
|
+
|
|
400
|
+
def _insert_batch(
|
|
401
|
+
self,
|
|
402
|
+
cursor: Any,
|
|
403
|
+
table_name: str,
|
|
404
|
+
column_names: list[str],
|
|
405
|
+
rows: list[tuple[Any, ...]],
|
|
406
|
+
) -> None:
|
|
407
|
+
"""Insert a batch of rows into target table.
|
|
408
|
+
|
|
409
|
+
Args:
|
|
410
|
+
cursor: Database cursor
|
|
411
|
+
table_name: Name of table
|
|
412
|
+
column_names: List of column names
|
|
413
|
+
rows: List of row tuples to insert
|
|
414
|
+
"""
|
|
415
|
+
if not rows:
|
|
416
|
+
return
|
|
417
|
+
|
|
418
|
+
columns_str = ", ".join(column_names)
|
|
419
|
+
placeholders = ", ".join(["%s"] * len(column_names))
|
|
420
|
+
query = f"INSERT INTO {table_name} ({columns_str}) VALUES ({placeholders})"
|
|
421
|
+
|
|
422
|
+
cursor.executemany(query, rows)
|
|
423
|
+
|
|
424
|
+
def sync(self, config: SyncConfig) -> dict[str, int]:
|
|
425
|
+
"""Sync multiple tables based on configuration.
|
|
426
|
+
|
|
427
|
+
Args:
|
|
428
|
+
config: Sync configuration
|
|
429
|
+
|
|
430
|
+
Returns:
|
|
431
|
+
Dictionary mapping table names to row counts synced
|
|
432
|
+
"""
|
|
433
|
+
# Load checkpoint if requested
|
|
434
|
+
if config.resume and config.checkpoint_file and config.checkpoint_file.exists():
|
|
435
|
+
self.load_checkpoint(config.checkpoint_file)
|
|
436
|
+
|
|
437
|
+
tables = self.select_tables(config.tables)
|
|
438
|
+
results = {}
|
|
439
|
+
|
|
440
|
+
# Filter out completed tables if resuming
|
|
441
|
+
if config.resume:
|
|
442
|
+
tables = [t for t in tables if t not in self._completed_tables]
|
|
443
|
+
|
|
444
|
+
if config.show_progress:
|
|
445
|
+
# Use rich progress bar
|
|
446
|
+
with Progress(
|
|
447
|
+
TextColumn("[bold blue]{task.description}"),
|
|
448
|
+
BarColumn(),
|
|
449
|
+
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
|
|
450
|
+
TextColumn("•"),
|
|
451
|
+
TextColumn("{task.completed}/{task.total} rows"),
|
|
452
|
+
TimeRemainingColumn(),
|
|
453
|
+
) as progress:
|
|
454
|
+
for table in tables:
|
|
455
|
+
task = progress.add_task(f"Syncing {table}", total=0)
|
|
456
|
+
|
|
457
|
+
anonymization_rules = None
|
|
458
|
+
if config.anonymization and table in config.anonymization:
|
|
459
|
+
anonymization_rules = config.anonymization[table]
|
|
460
|
+
|
|
461
|
+
rows_synced = self.sync_table(
|
|
462
|
+
table,
|
|
463
|
+
anonymization_rules=anonymization_rules,
|
|
464
|
+
batch_size=config.batch_size,
|
|
465
|
+
progress_task=task,
|
|
466
|
+
progress=progress,
|
|
467
|
+
)
|
|
468
|
+
results[table] = rows_synced
|
|
469
|
+
else:
|
|
470
|
+
# No progress bar
|
|
471
|
+
for table in tables:
|
|
472
|
+
anonymization_rules = None
|
|
473
|
+
if config.anonymization and table in config.anonymization:
|
|
474
|
+
anonymization_rules = config.anonymization[table]
|
|
475
|
+
|
|
476
|
+
rows_synced = self.sync_table(
|
|
477
|
+
table,
|
|
478
|
+
anonymization_rules=anonymization_rules,
|
|
479
|
+
batch_size=config.batch_size,
|
|
480
|
+
)
|
|
481
|
+
results[table] = rows_synced
|
|
482
|
+
|
|
483
|
+
# Save checkpoint if requested
|
|
484
|
+
if config.checkpoint_file:
|
|
485
|
+
self.save_checkpoint(config.checkpoint_file)
|
|
486
|
+
|
|
487
|
+
return results
|
|
488
|
+
|
|
489
|
+
def get_metrics(self) -> dict[str, dict[str, Any]]:
|
|
490
|
+
"""Get performance metrics for all synced tables.
|
|
491
|
+
|
|
492
|
+
Returns:
|
|
493
|
+
Dictionary mapping table names to metrics
|
|
494
|
+
"""
|
|
495
|
+
return {
|
|
496
|
+
table: {
|
|
497
|
+
"rows_synced": metrics.rows_synced,
|
|
498
|
+
"elapsed_seconds": metrics.elapsed_seconds,
|
|
499
|
+
"rows_per_second": metrics.rows_per_second,
|
|
500
|
+
"synced_at": metrics.synced_at,
|
|
501
|
+
}
|
|
502
|
+
for table, metrics in self._metrics.items()
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
def save_checkpoint(self, checkpoint_file: Path) -> None:
|
|
506
|
+
"""Save sync checkpoint to file.
|
|
507
|
+
|
|
508
|
+
Args:
|
|
509
|
+
checkpoint_file: Path to checkpoint file
|
|
510
|
+
"""
|
|
511
|
+
checkpoint_data = {
|
|
512
|
+
"version": "1.0",
|
|
513
|
+
"timestamp": datetime.now().isoformat(),
|
|
514
|
+
"source_database": f"{self.source_config.host}:{self.source_config.port}/{self.source_config.database}",
|
|
515
|
+
"target_database": f"{self.target_config.host}:{self.target_config.port}/{self.target_config.database}",
|
|
516
|
+
"completed_tables": {
|
|
517
|
+
table: {
|
|
518
|
+
"rows_synced": metrics.rows_synced,
|
|
519
|
+
"synced_at": metrics.synced_at,
|
|
520
|
+
}
|
|
521
|
+
for table, metrics in self._metrics.items()
|
|
522
|
+
},
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
|
|
526
|
+
with open(checkpoint_file, "w") as f:
|
|
527
|
+
json.dump(checkpoint_data, f, indent=2)
|
|
528
|
+
|
|
529
|
+
def load_checkpoint(self, checkpoint_file: Path) -> None:
|
|
530
|
+
"""Load sync checkpoint from file.
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
checkpoint_file: Path to checkpoint file
|
|
534
|
+
"""
|
|
535
|
+
with open(checkpoint_file) as f:
|
|
536
|
+
self._checkpoint_data = json.load(f)
|
|
537
|
+
|
|
538
|
+
# Restore completed tables
|
|
539
|
+
if "completed_tables" in self._checkpoint_data:
|
|
540
|
+
self._completed_tables = set(self._checkpoint_data["completed_tables"].keys())
|