fraiseql_confiture-0.3.7-cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confiture/__init__.py +48 -0
- confiture/_core.cpython-311-darwin.so +0 -0
- confiture/cli/__init__.py +0 -0
- confiture/cli/dry_run.py +116 -0
- confiture/cli/lint_formatter.py +193 -0
- confiture/cli/main.py +1893 -0
- confiture/config/__init__.py +0 -0
- confiture/config/environment.py +263 -0
- confiture/core/__init__.py +51 -0
- confiture/core/anonymization/__init__.py +0 -0
- confiture/core/anonymization/audit.py +485 -0
- confiture/core/anonymization/benchmarking.py +372 -0
- confiture/core/anonymization/breach_notification.py +652 -0
- confiture/core/anonymization/compliance.py +617 -0
- confiture/core/anonymization/composer.py +298 -0
- confiture/core/anonymization/data_subject_rights.py +669 -0
- confiture/core/anonymization/factory.py +319 -0
- confiture/core/anonymization/governance.py +737 -0
- confiture/core/anonymization/performance.py +1092 -0
- confiture/core/anonymization/profile.py +284 -0
- confiture/core/anonymization/registry.py +195 -0
- confiture/core/anonymization/security/kms_manager.py +547 -0
- confiture/core/anonymization/security/lineage.py +888 -0
- confiture/core/anonymization/security/token_store.py +686 -0
- confiture/core/anonymization/strategies/__init__.py +41 -0
- confiture/core/anonymization/strategies/address.py +359 -0
- confiture/core/anonymization/strategies/credit_card.py +374 -0
- confiture/core/anonymization/strategies/custom.py +161 -0
- confiture/core/anonymization/strategies/date.py +218 -0
- confiture/core/anonymization/strategies/differential_privacy.py +398 -0
- confiture/core/anonymization/strategies/email.py +141 -0
- confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
- confiture/core/anonymization/strategies/hash.py +150 -0
- confiture/core/anonymization/strategies/ip_address.py +235 -0
- confiture/core/anonymization/strategies/masking_retention.py +252 -0
- confiture/core/anonymization/strategies/name.py +298 -0
- confiture/core/anonymization/strategies/phone.py +119 -0
- confiture/core/anonymization/strategies/preserve.py +85 -0
- confiture/core/anonymization/strategies/redact.py +101 -0
- confiture/core/anonymization/strategies/salted_hashing.py +322 -0
- confiture/core/anonymization/strategies/text_redaction.py +183 -0
- confiture/core/anonymization/strategies/tokenization.py +334 -0
- confiture/core/anonymization/strategy.py +241 -0
- confiture/core/anonymization/syncer_audit.py +357 -0
- confiture/core/blue_green.py +683 -0
- confiture/core/builder.py +500 -0
- confiture/core/checksum.py +358 -0
- confiture/core/connection.py +184 -0
- confiture/core/differ.py +522 -0
- confiture/core/drift.py +564 -0
- confiture/core/dry_run.py +182 -0
- confiture/core/health.py +313 -0
- confiture/core/hooks/__init__.py +87 -0
- confiture/core/hooks/base.py +232 -0
- confiture/core/hooks/context.py +146 -0
- confiture/core/hooks/execution_strategies.py +57 -0
- confiture/core/hooks/observability.py +220 -0
- confiture/core/hooks/phases.py +53 -0
- confiture/core/hooks/registry.py +295 -0
- confiture/core/large_tables.py +775 -0
- confiture/core/linting/__init__.py +70 -0
- confiture/core/linting/composer.py +192 -0
- confiture/core/linting/libraries/__init__.py +17 -0
- confiture/core/linting/libraries/gdpr.py +168 -0
- confiture/core/linting/libraries/general.py +184 -0
- confiture/core/linting/libraries/hipaa.py +144 -0
- confiture/core/linting/libraries/pci_dss.py +104 -0
- confiture/core/linting/libraries/sox.py +120 -0
- confiture/core/linting/schema_linter.py +491 -0
- confiture/core/linting/versioning.py +151 -0
- confiture/core/locking.py +389 -0
- confiture/core/migration_generator.py +298 -0
- confiture/core/migrator.py +882 -0
- confiture/core/observability/__init__.py +44 -0
- confiture/core/observability/audit.py +323 -0
- confiture/core/observability/logging.py +187 -0
- confiture/core/observability/metrics.py +174 -0
- confiture/core/observability/tracing.py +192 -0
- confiture/core/pg_version.py +418 -0
- confiture/core/pool.py +406 -0
- confiture/core/risk/__init__.py +39 -0
- confiture/core/risk/predictor.py +188 -0
- confiture/core/risk/scoring.py +248 -0
- confiture/core/rollback_generator.py +388 -0
- confiture/core/schema_analyzer.py +769 -0
- confiture/core/schema_to_schema.py +590 -0
- confiture/core/security/__init__.py +32 -0
- confiture/core/security/logging.py +201 -0
- confiture/core/security/validation.py +416 -0
- confiture/core/signals.py +371 -0
- confiture/core/syncer.py +540 -0
- confiture/exceptions.py +192 -0
- confiture/integrations/__init__.py +0 -0
- confiture/models/__init__.py +24 -0
- confiture/models/lint.py +193 -0
- confiture/models/migration.py +265 -0
- confiture/models/schema.py +203 -0
- confiture/models/sql_file_migration.py +225 -0
- confiture/scenarios/__init__.py +36 -0
- confiture/scenarios/compliance.py +586 -0
- confiture/scenarios/ecommerce.py +199 -0
- confiture/scenarios/financial.py +253 -0
- confiture/scenarios/healthcare.py +315 -0
- confiture/scenarios/multi_tenant.py +340 -0
- confiture/scenarios/saas.py +295 -0
- confiture/testing/FRAMEWORK_API.md +722 -0
- confiture/testing/__init__.py +100 -0
- confiture/testing/fixtures/__init__.py +11 -0
- confiture/testing/fixtures/data_validator.py +229 -0
- confiture/testing/fixtures/migration_runner.py +167 -0
- confiture/testing/fixtures/schema_snapshotter.py +352 -0
- confiture/testing/frameworks/__init__.py +10 -0
- confiture/testing/frameworks/mutation.py +587 -0
- confiture/testing/frameworks/performance.py +479 -0
- confiture/testing/loader.py +225 -0
- confiture/testing/pytest/__init__.py +38 -0
- confiture/testing/pytest_plugin.py +190 -0
- confiture/testing/sandbox.py +304 -0
- confiture/testing/utils/__init__.py +0 -0
- fraiseql_confiture-0.3.7.dist-info/METADATA +438 -0
- fraiseql_confiture-0.3.7.dist-info/RECORD +124 -0
- fraiseql_confiture-0.3.7.dist-info/WHEEL +4 -0
- fraiseql_confiture-0.3.7.dist-info/entry_points.txt +4 -0
- fraiseql_confiture-0.3.7.dist-info/licenses/LICENSE +21 -0
confiture/core/large_tables.py

@@ -0,0 +1,775 @@

"""Large table migration patterns.

Provides utilities for migrating large tables (>1M rows) without
blocking production traffic. Includes batched operations, progress
reporting, and resumable patterns.
"""

import logging
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

logger = logging.getLogger(__name__)


@dataclass
class BatchConfig:
    """Configuration for batched operations.

    Attributes:
        batch_size: Number of rows per batch
        sleep_between_batches: Seconds to wait between batches
        max_retries: Maximum retries per batch on failure
        progress_callback: Optional callback for progress updates
        checkpoint_callback: Optional callback for checkpointing
    """

    batch_size: int = 10000
    sleep_between_batches: float = 0.1
    max_retries: int = 3
    progress_callback: Callable[[int, int], None] | None = None
    checkpoint_callback: Callable[[int], None] | None = None


@dataclass
class BatchProgress:
    """Progress of a batched operation.

    Tracks rows processed, batches completed, and timing information.
    """

    total_rows: int
    processed_rows: int = 0
    current_batch: int = 0
    total_batches: int = 0
    elapsed_seconds: float = 0.0
    errors: list[str] = field(default_factory=list)

    @property
    def percent_complete(self) -> float:
        """Get completion percentage."""
        if self.total_rows == 0:
            return 100.0
        return (self.processed_rows / self.total_rows) * 100

    @property
    def is_complete(self) -> bool:
        """Check if operation is complete."""
        return self.processed_rows >= self.total_rows

    @property
    def rows_per_second(self) -> float:
        """Calculate processing rate."""
        if self.elapsed_seconds == 0:
            return 0.0
        return self.processed_rows / self.elapsed_seconds

    @property
    def estimated_remaining_seconds(self) -> float:
        """Estimate remaining time."""
        if self.rows_per_second == 0:
            return 0.0
        remaining_rows = self.total_rows - self.processed_rows
        return remaining_rows / self.rows_per_second

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return {
            "total_rows": self.total_rows,
            "processed_rows": self.processed_rows,
            "percent_complete": round(self.percent_complete, 2),
            "current_batch": self.current_batch,
            "total_batches": self.total_batches,
            "elapsed_seconds": round(self.elapsed_seconds, 2),
            "rows_per_second": round(self.rows_per_second, 2),
            "estimated_remaining_seconds": round(self.estimated_remaining_seconds, 2),
            "errors": self.errors,
        }

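
# Illustrative sketch only (not part of the packaged module): how the derived
# BatchProgress properties combine into a report. The figures are arbitrary.
def _example_progress_report() -> dict[str, Any]:
    progress = BatchProgress(total_rows=1_000_000, total_batches=100)
    progress.processed_rows = 250_000
    progress.current_batch = 25
    progress.elapsed_seconds = 50.0
    # percent_complete -> 25.0, rows_per_second -> 5000.0,
    # estimated_remaining_seconds -> 750_000 / 5000.0 = 150.0
    return progress.to_dict()
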
class BatchedMigration:
    """Execute migrations in batches for large tables.

    Provides methods for common large table operations that need
    to be done in batches to avoid long-running transactions.

    Example:
        >>> config = BatchConfig(batch_size=10000)
        >>> batched = BatchedMigration(conn, config)
        >>> progress = batched.add_column_with_default(
        ...     table="users",
        ...     column="status",
        ...     column_type="TEXT",
        ...     default="'active'"
        ... )
        >>> print(f"Processed {progress.processed_rows} rows")
    """

    def __init__(self, connection: Any, config: BatchConfig | None = None):
        """Initialize batched migration.

        Args:
            connection: Database connection
            config: Batch configuration (optional)
        """
        self.connection = connection
        self.config = config or BatchConfig()

    def add_column_with_default(
        self,
        table: str,
        column: str,
        column_type: str,
        default: str,
        start_from: int = 0,
    ) -> BatchProgress:
        """Add column with default value in batches.

        PostgreSQL 11+ adds columns with defaults instantly, but
        backfilling existing NULL rows can lock the table. This
        does the backfill in batches.

        Args:
            table: Table name
            column: Column name
            column_type: Column type (e.g., "TEXT", "INTEGER")
            default: Default value expression
            start_from: Resume from this row count (for resumption)

        Returns:
            BatchProgress with operation result
        """
        start_time = time.perf_counter()

        with self.connection.cursor() as cur:
            # Add column without default first (instant in PG 11+)
            cur.execute(
                f"""
                ALTER TABLE {table}
                ADD COLUMN IF NOT EXISTS {column} {column_type}
                """
            )
            self.connection.commit()

            # Get total rows needing update
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {column} IS NULL")
            total_rows = cur.fetchone()[0]

            if total_rows == 0:
                return BatchProgress(total_rows=0)

            total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
            processed = start_from
            progress = BatchProgress(
                total_rows=total_rows,
                processed_rows=processed,
                total_batches=total_batches,
            )

            batch_num = start_from // self.config.batch_size

            while processed < total_rows:
                batch_num += 1

                for attempt in range(self.config.max_retries):
                    try:
                        # Update batch using ctid for efficiency
                        cur.execute(
                            f"""
                            UPDATE {table}
                            SET {column} = {default}
                            WHERE ctid IN (
                                SELECT ctid FROM {table}
                                WHERE {column} IS NULL
                                LIMIT {self.config.batch_size}
                            )
                            """
                        )
                        rows_affected = cur.rowcount
                        self.connection.commit()
                        break
                    except Exception as e:
                        self.connection.rollback()
                        if attempt == self.config.max_retries - 1:
                            progress.errors.append(f"Batch {batch_num}: {e}")
                            raise
                        logger.warning(f"Batch {batch_num} failed, retrying: {e}")
                        time.sleep(self.config.sleep_between_batches * 2)

                processed += rows_affected
                progress.processed_rows = processed
                progress.current_batch = batch_num
                progress.elapsed_seconds = time.perf_counter() - start_time

                if self.config.progress_callback:
                    self.config.progress_callback(processed, total_rows)

                if self.config.checkpoint_callback:
                    self.config.checkpoint_callback(processed)

                logger.info(
                    f"Batch {batch_num}/{total_batches}: "
                    f"{progress.percent_complete:.1f}% complete "
                    f"({progress.rows_per_second:.0f} rows/sec)"
                )

                if rows_affected == 0:
                    break

                if self.config.sleep_between_batches > 0:
                    time.sleep(self.config.sleep_between_batches)

            # Set default for future inserts
            cur.execute(
                f"""
                ALTER TABLE {table}
                ALTER COLUMN {column} SET DEFAULT {default}
                """
            )
            self.connection.commit()

        progress.elapsed_seconds = time.perf_counter() - start_time
        return progress

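    # Usage sketch for the method above (illustrative; `psycopg.connect`, the
    # DSN, and `save_checkpoint` are assumptions for the example, not part of
    # this module):
    #
    #     conn = psycopg.connect("dbname=app")
    #     config = BatchConfig(
    #         batch_size=5_000,
    #         checkpoint_callback=lambda n: save_checkpoint("users.status", n),
    #     )
    #     progress = BatchedMigration(conn, config).add_column_with_default(
    #         table="users", column="status", column_type="TEXT", default="'active'"
    #     )
    #     # After a crash, pass start_from=<last checkpoint> to resume.
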
    def backfill_column(
        self,
        table: str,
        column: str,
        expression: str,
        where_clause: str = "TRUE",
        start_from: int = 0,
    ) -> BatchProgress:
        """Backfill column values in batches.

        Example:
            >>> progress = batched.backfill_column(
            ...     table="orders",
            ...     column="total_cents",
            ...     expression="(subtotal + tax) * 100",
            ...     where_clause="total_cents IS NULL"
            ... )
        """
        start_time = time.perf_counter()

        with self.connection.cursor() as cur:
            # Get total rows
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
            total_rows = cur.fetchone()[0]

            if total_rows == 0:
                return BatchProgress(total_rows=0)

            total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
            processed = start_from
            progress = BatchProgress(
                total_rows=total_rows,
                processed_rows=processed,
                total_batches=total_batches,
            )

            batch_num = start_from // self.config.batch_size

            while True:
                batch_num += 1

                cur.execute(
                    f"""
                    UPDATE {table}
                    SET {column} = {expression}
                    WHERE ctid IN (
                        SELECT ctid FROM {table}
                        WHERE {where_clause}
                        LIMIT {self.config.batch_size}
                    )
                    """
                )

                rows_affected = cur.rowcount
                if rows_affected == 0:
                    break

                self.connection.commit()
                processed += rows_affected
                progress.processed_rows = processed
                progress.current_batch = batch_num
                progress.elapsed_seconds = time.perf_counter() - start_time

                if self.config.progress_callback:
                    self.config.progress_callback(processed, total_rows)

                if self.config.checkpoint_callback:
                    self.config.checkpoint_callback(processed)

                logger.info(
                    f"Backfill batch {batch_num}: {progress.percent_complete:.1f}% complete"
                )

                if self.config.sleep_between_batches > 0:
                    time.sleep(self.config.sleep_between_batches)

        progress.elapsed_seconds = time.perf_counter() - start_time
        return progress

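    # Note on the loop above: each pass re-selects rows that still match
    # `where_clause`, so the clause must become false for a row once it has
    # been updated (e.g. "total_cents IS NULL"). With the default "TRUE" the
    # same rows match on every pass and the loop never terminates.
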
    def delete_in_batches(
        self,
        table: str,
        where_clause: str,
        start_from: int = 0,
    ) -> BatchProgress:
        """Delete rows in batches to avoid long locks.

        Example:
            >>> progress = batched.delete_in_batches(
            ...     table="audit_logs",
            ...     where_clause="created_at < NOW() - INTERVAL '1 year'"
            ... )
        """
        start_time = time.perf_counter()

        with self.connection.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
            total_rows = cur.fetchone()[0]

            if total_rows == 0:
                return BatchProgress(total_rows=0)

            total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
            processed = start_from
            progress = BatchProgress(
                total_rows=total_rows,
                processed_rows=processed,
                total_batches=total_batches,
            )

            batch_num = start_from // self.config.batch_size

            while True:
                batch_num += 1

                cur.execute(
                    f"""
                    DELETE FROM {table}
                    WHERE ctid IN (
                        SELECT ctid FROM {table}
                        WHERE {where_clause}
                        LIMIT {self.config.batch_size}
                    )
                    """
                )

                rows_deleted = cur.rowcount
                if rows_deleted == 0:
                    break

                self.connection.commit()
                processed += rows_deleted
                progress.processed_rows = processed
                progress.current_batch = batch_num
                progress.elapsed_seconds = time.perf_counter() - start_time

                if self.config.progress_callback:
                    self.config.progress_callback(processed, total_rows)

                if self.config.checkpoint_callback:
                    self.config.checkpoint_callback(processed)

                logger.info(f"Delete batch {batch_num}: {progress.percent_complete:.1f}% complete")

                if self.config.sleep_between_batches > 0:
                    time.sleep(self.config.sleep_between_batches)

        progress.elapsed_seconds = time.perf_counter() - start_time
        return progress

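    # Usage sketch (illustrative): wire a progress bar into a batched purge.
    # `tqdm` here is an assumption for the example, not a dependency of this
    # module. Each batch commits independently, so autovacuum can reclaim the
    # dead tuples while the purge is still running.
    #
    #     bar = tqdm(total=0)
    #
    #     def on_progress(done: int, total: int) -> None:
    #         bar.total = total
    #         bar.n = done
    #         bar.refresh()
    #
    #     batched = BatchedMigration(conn, BatchConfig(progress_callback=on_progress))
    #     batched.delete_in_batches("audit_logs", "created_at < NOW() - INTERVAL '1 year'")
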
    def copy_to_new_table(
        self,
        source_table: str,
        target_table: str,
        columns: list[str] | None = None,
        transform: dict[str, str] | None = None,
        where_clause: str = "TRUE",
    ) -> BatchProgress:
        """Copy data to a new table in batches.

        Useful for table restructuring without blocking reads on source.

        Args:
            source_table: Source table name
            target_table: Target table name (must exist)
            columns: Columns to copy (None = all)
            transform: Column transformations {col: expression}
            where_clause: Filter condition

        Example:
            >>> progress = batched.copy_to_new_table(
            ...     source_table="users",
            ...     target_table="users_new",
            ...     transform={"email": "LOWER(email)"}
            ... )
        """
        start_time = time.perf_counter()

        with self.connection.cursor() as cur:
            # Get total rows
            cur.execute(f"SELECT COUNT(*) FROM {source_table} WHERE {where_clause}")
            total_rows = cur.fetchone()[0]

            if total_rows == 0:
                return BatchProgress(total_rows=0)

            # Get columns if not specified
            if columns is None:
                cur.execute(
                    """
                    SELECT column_name FROM information_schema.columns
                    WHERE table_name = %s AND table_schema = 'public'
                    ORDER BY ordinal_position
                    """,
                    (source_table,),
                )
                columns = [row[0] for row in cur.fetchall()]

            # Build select expressions
            transform = transform or {}
            select_exprs = [transform.get(col, col) for col in columns]
            select_str = ", ".join(select_exprs)
            columns_str = ", ".join(columns)

            # Confirm at least one matching row exists before building the
            # tracker. (Note: min(ctid) is not valid in PostgreSQL, as the tid
            # type has no min() aggregate, so an ORDER-free existence probe is
            # used instead.)
            cur.execute(f"SELECT ctid FROM {source_table} WHERE {where_clause} LIMIT 1")
            if cur.fetchone() is None:
                return BatchProgress(total_rows=0)

            total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
            processed = 0
            progress = BatchProgress(
                total_rows=total_rows,
                total_batches=total_batches,
            )
            batch_num = 0

            # Use a tracking table to give each source row a stable batch number
            cur.execute(
                f"""
                CREATE TEMP TABLE _batch_tracker AS
                SELECT ctid as row_ctid, ROW_NUMBER() OVER () as rn
                FROM {source_table}
                WHERE {where_clause}
                """
            )
            self.connection.commit()

            try:
                while processed < total_rows:
                    batch_num += 1
                    offset = processed

                    cur.execute(
                        f"""
                        INSERT INTO {target_table} ({columns_str})
                        SELECT {select_str}
                        FROM {source_table} s
                        WHERE s.ctid IN (
                            SELECT row_ctid FROM _batch_tracker
                            WHERE rn > %s AND rn <= %s
                        )
                        """,
                        (offset, offset + self.config.batch_size),
                    )

                    rows_inserted = cur.rowcount
                    if rows_inserted == 0:
                        break

                    self.connection.commit()
                    processed += rows_inserted
                    progress.processed_rows = processed
                    progress.current_batch = batch_num
                    progress.elapsed_seconds = time.perf_counter() - start_time

                    if self.config.progress_callback:
                        self.config.progress_callback(processed, total_rows)

                    logger.info(
                        f"Copy batch {batch_num}/{total_batches}: "
                        f"{progress.percent_complete:.1f}% complete"
                    )

                    if self.config.sleep_between_batches > 0:
                        time.sleep(self.config.sleep_between_batches)

            finally:
                cur.execute("DROP TABLE IF EXISTS _batch_tracker")
                self.connection.commit()

        progress.elapsed_seconds = time.perf_counter() - start_time
        return progress


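# Illustrative end-to-end sketch (assumes a DB-API connection `conn` and a
# pre-created users_new table, neither of which this module provides):
def _example_copy_with_transform(conn: Any) -> BatchProgress:
    batched = BatchedMigration(conn, BatchConfig(batch_size=20_000))
    # Lower-case emails on the way into the restructured table; all other
    # columns copy verbatim because `columns` defaults to every column.
    return batched.copy_to_new_table(
        source_table="users",
        target_table="users_new",
        transform={"email": "LOWER(email)"},
    )

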
class OnlineIndexBuilder:
    """Build indexes without blocking writes.

    Provides utilities for creating, dropping, and rebuilding indexes
    using CONCURRENTLY operations to avoid blocking writes.

    Example:
        >>> builder = OnlineIndexBuilder(conn)
        >>> index_name = builder.create_index_concurrently(
        ...     table="users",
        ...     columns=["email"],
        ...     unique=True
        ... )
    """

    def __init__(self, connection: Any):
        """Initialize index builder.

        Args:
            connection: Database connection
        """
        self.connection = connection

    def create_index_concurrently(
        self,
        table: str,
        columns: list[str],
        index_name: str | None = None,
        unique: bool = False,
        where: str | None = None,
        method: str = "btree",
        include: list[str] | None = None,
    ) -> str:
        """Create index without blocking writes.

        Note: Requires autocommit mode. This method handles that automatically.

        Args:
            table: Table name
            columns: Columns to index
            index_name: Optional index name (auto-generated if not provided)
            unique: Create unique index
            where: Partial index condition
            method: Index method (btree, hash, gin, gist, etc.)
            include: Additional columns to include (covering index)

        Returns:
            Name of created index
        """
        if index_name is None:
            col_names = "_".join(columns)
            index_name = f"idx_{table}_{col_names}"

        unique_str = "UNIQUE " if unique else ""
        columns_str = ", ".join(columns)
        where_str = f" WHERE {where}" if where else ""
        include_str = f" INCLUDE ({', '.join(include)})" if include else ""

        # Must use autocommit for CONCURRENTLY
        old_autocommit = self.connection.autocommit
        self.connection.autocommit = True

        try:
            with self.connection.cursor() as cur:
                sql = f"""
                    CREATE {unique_str}INDEX CONCURRENTLY IF NOT EXISTS
                    {index_name} ON {table} USING {method} ({columns_str})
                    {include_str}{where_str}
                """
                logger.info(f"Creating index: {index_name}")
                cur.execute(sql)
                logger.info(f"Index created: {index_name}")
        finally:
            self.connection.autocommit = old_autocommit

        return index_name

    def drop_index_concurrently(self, index_name: str) -> None:
        """Drop index without blocking writes.

        Args:
            index_name: Name of index to drop
        """
        old_autocommit = self.connection.autocommit
        self.connection.autocommit = True

        try:
            with self.connection.cursor() as cur:
                logger.info(f"Dropping index: {index_name}")
                cur.execute(f"DROP INDEX CONCURRENTLY IF EXISTS {index_name}")
                logger.info(f"Index dropped: {index_name}")
        finally:
            self.connection.autocommit = old_autocommit

    def reindex_concurrently(self, index_name: str) -> None:
        """Rebuild index without blocking writes (PG 12+).

        Args:
            index_name: Name of index to rebuild
        """
        old_autocommit = self.connection.autocommit
        self.connection.autocommit = True

        try:
            with self.connection.cursor() as cur:
                logger.info(f"Reindexing: {index_name}")
                cur.execute(f"REINDEX INDEX CONCURRENTLY {index_name}")
                logger.info(f"Reindex complete: {index_name}")
        finally:
            self.connection.autocommit = old_autocommit

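    # A CREATE INDEX CONCURRENTLY that fails part-way leaves the index behind
    # in an INVALID state (the planner never uses it). The check below reads
    # pg_index.indisvalid so callers can detect that, drop the leftover with
    # drop_index_concurrently(), and retry.
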
    def check_index_validity(self, index_name: str) -> bool:
        """Check if index is valid (not corrupted/invalid from failed creation).

        Args:
            index_name: Name of index to check

        Returns:
            True if index is valid
        """
        with self.connection.cursor() as cur:
            cur.execute(
                """
                SELECT indisvalid
                FROM pg_index
                JOIN pg_class ON pg_index.indexrelid = pg_class.oid
                WHERE pg_class.relname = %s
                """,
                (index_name,),
            )
            result = cur.fetchone()
            if result is None:
                return False
            return result[0]

    def get_index_size(self, index_name: str) -> int:
        """Get index size in bytes.

        Args:
            index_name: Name of index

        Returns:
            Size in bytes
        """
        with self.connection.cursor() as cur:
            cur.execute(
                "SELECT pg_relation_size(%s)",
                (index_name,),
            )
            result = cur.fetchone()
            return result[0] if result else 0


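# Illustrative retry sketch (hypothetical helper, not part of the module):
# build an index online and recover from the INVALID leftover that a failed
# CONCURRENTLY build leaves behind.
def _example_build_index_with_retry(conn: Any) -> str:
    builder = OnlineIndexBuilder(conn)
    name = builder.create_index_concurrently(table="users", columns=["email"], unique=True)
    if not builder.check_index_validity(name):
        # Drop the invalid leftover and try once more.
        builder.drop_index_concurrently(name)
        name = builder.create_index_concurrently(table="users", columns=["email"], unique=True)
    return name

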
class TableSizeEstimator:
    """Estimate table sizes for migration planning.

    Helps decide whether to use batched operations based on
    table size.
    """

    # Threshold in rows for using batched operations
    LARGE_TABLE_THRESHOLD = 100_000

    def __init__(self, connection: Any):
        """Initialize estimator.

        Args:
            connection: Database connection
        """
        self.connection = connection

    def get_row_count_estimate(self, table: str) -> int:
        """Get estimated row count (fast but approximate).

        Uses pg_class statistics rather than COUNT(*).

        Args:
            table: Table name

        Returns:
            Estimated row count
        """
        with self.connection.cursor() as cur:
            cur.execute(
                """
                SELECT reltuples::bigint
                FROM pg_class
                WHERE relname = %s
                """,
                (table,),
            )
            result = cur.fetchone()
            return max(0, result[0]) if result else 0

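    # Note: pg_class.reltuples is only as fresh as the last VACUUM or ANALYZE,
    # and on recent PostgreSQL releases it is -1 for a table that has never
    # been analyzed; the max(0, ...) above clamps that sentinel to zero.
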
    def get_exact_row_count(self, table: str, where_clause: str = "TRUE") -> int:
        """Get exact row count (slow but accurate).

        Args:
            table: Table name
            where_clause: Optional filter

        Returns:
            Exact row count
        """
        with self.connection.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
            return cur.fetchone()[0]

    def get_table_size(self, table: str) -> dict[str, int]:
        """Get table size information.

        Args:
            table: Table name

        Returns:
            Dictionary with size information
        """
        with self.connection.cursor() as cur:
            cur.execute(
                """
                SELECT
                    pg_table_size(%s) as table_size,
                    pg_indexes_size(%s) as index_size,
                    pg_total_relation_size(%s) as total_size
                """,
                (table, table, table),
            )
            row = cur.fetchone()
            return {
                "table_size_bytes": row[0],
                "index_size_bytes": row[1],
                "total_size_bytes": row[2],
            }

    def should_use_batched_operation(self, table: str) -> bool:
        """Determine if batched operations should be used.

        Args:
            table: Table name

        Returns:
            True if table is large enough to warrant batching
        """
        estimate = self.get_row_count_estimate(table)
        return estimate >= self.LARGE_TABLE_THRESHOLD

    def estimate_operation_time(
        self,
        table: str,
        rows_per_second: float = 10000.0,
    ) -> float:
        """Estimate time for a full-table operation.

        Args:
            table: Table name
            rows_per_second: Expected processing rate

        Returns:
            Estimated seconds
        """
        estimate = self.get_row_count_estimate(table)
        if rows_per_second <= 0:
            return 0.0
        return estimate / rows_per_second
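

# Planning sketch (illustrative): combine the estimator with the batched
# helpers to choose a migration path. The table name and rate are arbitrary
# examples, not part of the packaged module.
def _example_plan_backfill(conn: Any) -> float:
    estimator = TableSizeEstimator(conn)
    if estimator.should_use_batched_operation("events"):
        # ~10k rows/sec is this module's default planning assumption,
        # not a measured benchmark.
        return estimator.estimate_operation_time("events")
    return 0.0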