fraiseql-confiture 0.3.4__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. confiture/__init__.py +48 -0
  2. confiture/_core.cp311-win_amd64.pyd +0 -0
  3. confiture/cli/__init__.py +0 -0
  4. confiture/cli/dry_run.py +116 -0
  5. confiture/cli/lint_formatter.py +193 -0
  6. confiture/cli/main.py +1656 -0
  7. confiture/config/__init__.py +0 -0
  8. confiture/config/environment.py +263 -0
  9. confiture/core/__init__.py +51 -0
  10. confiture/core/anonymization/__init__.py +0 -0
  11. confiture/core/anonymization/audit.py +485 -0
  12. confiture/core/anonymization/benchmarking.py +372 -0
  13. confiture/core/anonymization/breach_notification.py +652 -0
  14. confiture/core/anonymization/compliance.py +617 -0
  15. confiture/core/anonymization/composer.py +298 -0
  16. confiture/core/anonymization/data_subject_rights.py +669 -0
  17. confiture/core/anonymization/factory.py +319 -0
  18. confiture/core/anonymization/governance.py +737 -0
  19. confiture/core/anonymization/performance.py +1092 -0
  20. confiture/core/anonymization/profile.py +284 -0
  21. confiture/core/anonymization/registry.py +195 -0
  22. confiture/core/anonymization/security/kms_manager.py +547 -0
  23. confiture/core/anonymization/security/lineage.py +888 -0
  24. confiture/core/anonymization/security/token_store.py +686 -0
  25. confiture/core/anonymization/strategies/__init__.py +41 -0
  26. confiture/core/anonymization/strategies/address.py +359 -0
  27. confiture/core/anonymization/strategies/credit_card.py +374 -0
  28. confiture/core/anonymization/strategies/custom.py +161 -0
  29. confiture/core/anonymization/strategies/date.py +218 -0
  30. confiture/core/anonymization/strategies/differential_privacy.py +398 -0
  31. confiture/core/anonymization/strategies/email.py +141 -0
  32. confiture/core/anonymization/strategies/format_preserving_encryption.py +310 -0
  33. confiture/core/anonymization/strategies/hash.py +150 -0
  34. confiture/core/anonymization/strategies/ip_address.py +235 -0
  35. confiture/core/anonymization/strategies/masking_retention.py +252 -0
  36. confiture/core/anonymization/strategies/name.py +298 -0
  37. confiture/core/anonymization/strategies/phone.py +119 -0
  38. confiture/core/anonymization/strategies/preserve.py +85 -0
  39. confiture/core/anonymization/strategies/redact.py +101 -0
  40. confiture/core/anonymization/strategies/salted_hashing.py +322 -0
  41. confiture/core/anonymization/strategies/text_redaction.py +183 -0
  42. confiture/core/anonymization/strategies/tokenization.py +334 -0
  43. confiture/core/anonymization/strategy.py +241 -0
  44. confiture/core/anonymization/syncer_audit.py +357 -0
  45. confiture/core/blue_green.py +683 -0
  46. confiture/core/builder.py +500 -0
  47. confiture/core/checksum.py +358 -0
  48. confiture/core/connection.py +132 -0
  49. confiture/core/differ.py +522 -0
  50. confiture/core/drift.py +564 -0
  51. confiture/core/dry_run.py +182 -0
  52. confiture/core/health.py +313 -0
  53. confiture/core/hooks/__init__.py +87 -0
  54. confiture/core/hooks/base.py +232 -0
  55. confiture/core/hooks/context.py +146 -0
  56. confiture/core/hooks/execution_strategies.py +57 -0
  57. confiture/core/hooks/observability.py +220 -0
  58. confiture/core/hooks/phases.py +53 -0
  59. confiture/core/hooks/registry.py +295 -0
  60. confiture/core/large_tables.py +775 -0
  61. confiture/core/linting/__init__.py +70 -0
  62. confiture/core/linting/composer.py +192 -0
  63. confiture/core/linting/libraries/__init__.py +17 -0
  64. confiture/core/linting/libraries/gdpr.py +168 -0
  65. confiture/core/linting/libraries/general.py +184 -0
  66. confiture/core/linting/libraries/hipaa.py +144 -0
  67. confiture/core/linting/libraries/pci_dss.py +104 -0
  68. confiture/core/linting/libraries/sox.py +120 -0
  69. confiture/core/linting/schema_linter.py +491 -0
  70. confiture/core/linting/versioning.py +151 -0
  71. confiture/core/locking.py +389 -0
  72. confiture/core/migration_generator.py +298 -0
  73. confiture/core/migrator.py +793 -0
  74. confiture/core/observability/__init__.py +44 -0
  75. confiture/core/observability/audit.py +323 -0
  76. confiture/core/observability/logging.py +187 -0
  77. confiture/core/observability/metrics.py +174 -0
  78. confiture/core/observability/tracing.py +192 -0
  79. confiture/core/pg_version.py +418 -0
  80. confiture/core/pool.py +406 -0
  81. confiture/core/risk/__init__.py +39 -0
  82. confiture/core/risk/predictor.py +188 -0
  83. confiture/core/risk/scoring.py +248 -0
  84. confiture/core/rollback_generator.py +388 -0
  85. confiture/core/schema_analyzer.py +769 -0
  86. confiture/core/schema_to_schema.py +590 -0
  87. confiture/core/security/__init__.py +32 -0
  88. confiture/core/security/logging.py +201 -0
  89. confiture/core/security/validation.py +416 -0
  90. confiture/core/signals.py +371 -0
  91. confiture/core/syncer.py +540 -0
  92. confiture/exceptions.py +192 -0
  93. confiture/integrations/__init__.py +0 -0
  94. confiture/models/__init__.py +0 -0
  95. confiture/models/lint.py +193 -0
  96. confiture/models/migration.py +180 -0
  97. confiture/models/schema.py +203 -0
  98. confiture/scenarios/__init__.py +36 -0
  99. confiture/scenarios/compliance.py +586 -0
  100. confiture/scenarios/ecommerce.py +199 -0
  101. confiture/scenarios/financial.py +253 -0
  102. confiture/scenarios/healthcare.py +315 -0
  103. confiture/scenarios/multi_tenant.py +340 -0
  104. confiture/scenarios/saas.py +295 -0
  105. confiture/testing/FRAMEWORK_API.md +722 -0
  106. confiture/testing/__init__.py +38 -0
  107. confiture/testing/fixtures/__init__.py +11 -0
  108. confiture/testing/fixtures/data_validator.py +229 -0
  109. confiture/testing/fixtures/migration_runner.py +167 -0
  110. confiture/testing/fixtures/schema_snapshotter.py +352 -0
  111. confiture/testing/frameworks/__init__.py +10 -0
  112. confiture/testing/frameworks/mutation.py +587 -0
  113. confiture/testing/frameworks/performance.py +479 -0
  114. confiture/testing/utils/__init__.py +0 -0
  115. fraiseql_confiture-0.3.4.dist-info/METADATA +438 -0
  116. fraiseql_confiture-0.3.4.dist-info/RECORD +119 -0
  117. fraiseql_confiture-0.3.4.dist-info/WHEEL +4 -0
  118. fraiseql_confiture-0.3.4.dist-info/entry_points.txt +2 -0
  119. fraiseql_confiture-0.3.4.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,775 @@
+ """Large table migration patterns.
+
+ Provides utilities for migrating large tables (>1M rows) without
+ blocking production traffic. Includes batched operations, progress
+ reporting, and resumable patterns.
+ """
+
+ import logging
+ import time
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class BatchConfig:
+     """Configuration for batched operations.
+
+     Attributes:
+         batch_size: Number of rows per batch
+         sleep_between_batches: Seconds to wait between batches
+         max_retries: Maximum retries per batch on failure
+         progress_callback: Optional callback for progress updates
+         checkpoint_callback: Optional callback for checkpointing
+     """
+
+     batch_size: int = 10000
+     sleep_between_batches: float = 0.1
+     max_retries: int = 3
+     progress_callback: Callable[[int, int], None] | None = None
+     checkpoint_callback: Callable[[int], None] | None = None
+
+
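For illustration, a minimal sketch of wiring both callbacks. The print-based reporter and the checkpoint path are assumptions for the example, not part of the package:

    import json

    def report(done: int, total: int) -> None:
        # Matches the Callable[[int, int], None] signature expected above.
        print(f"{done}/{total} rows ({done / max(total, 1):.0%})")

    def checkpoint(done: int) -> None:
        # Hypothetical checkpoint location; any durable store works.
        with open("/tmp/backfill.ckpt", "w") as f:
            json.dump({"processed_rows": done}, f)

    config = BatchConfig(
        batch_size=5_000,
        sleep_between_batches=0.05,
        progress_callback=report,
        checkpoint_callback=checkpoint,
    )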
+ @dataclass
+ class BatchProgress:
+     """Progress of a batched operation.
+
+     Tracks rows processed, batches completed, and timing information.
+     """
+
+     total_rows: int
+     processed_rows: int = 0
+     current_batch: int = 0
+     total_batches: int = 0
+     elapsed_seconds: float = 0.0
+     errors: list[str] = field(default_factory=list)
+
+     @property
+     def percent_complete(self) -> float:
+         """Get completion percentage."""
+         if self.total_rows == 0:
+             return 100.0
+         return (self.processed_rows / self.total_rows) * 100
+
+     @property
+     def is_complete(self) -> bool:
+         """Check if operation is complete."""
+         return self.processed_rows >= self.total_rows
+
+     @property
+     def rows_per_second(self) -> float:
+         """Calculate processing rate."""
+         if self.elapsed_seconds == 0:
+             return 0.0
+         return self.processed_rows / self.elapsed_seconds
+
+     @property
+     def estimated_remaining_seconds(self) -> float:
+         """Estimate remaining time."""
+         if self.rows_per_second == 0:
+             return 0.0
+         remaining_rows = self.total_rows - self.processed_rows
+         return remaining_rows / self.rows_per_second
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "total_rows": self.total_rows,
+             "processed_rows": self.processed_rows,
+             "percent_complete": round(self.percent_complete, 2),
+             "current_batch": self.current_batch,
+             "total_batches": self.total_batches,
+             "elapsed_seconds": round(self.elapsed_seconds, 2),
+             "rows_per_second": round(self.rows_per_second, 2),
+             "estimated_remaining_seconds": round(self.estimated_remaining_seconds, 2),
+             "errors": self.errors,
+         }
+
+
+ class BatchedMigration:
+     """Execute migrations in batches for large tables.
+
+     Provides methods for common large table operations that need
+     to be done in batches to avoid long-running transactions.
+
+     Example:
+         >>> config = BatchConfig(batch_size=10000)
+         >>> batched = BatchedMigration(conn, config)
+         >>> progress = batched.add_column_with_default(
+         ...     table="users",
+         ...     column="status",
+         ...     column_type="TEXT",
+         ...     default="'active'"
+         ... )
+         >>> print(f"Processed {progress.processed_rows} rows")
+     """
+
+     def __init__(self, connection: Any, config: BatchConfig | None = None):
+         """Initialize batched migration.
+
+         Args:
+             connection: Database connection
+             config: Batch configuration (optional)
+         """
+         self.connection = connection
+         self.config = config or BatchConfig()
+
+     def add_column_with_default(
+         self,
+         table: str,
+         column: str,
+         column_type: str,
+         default: str,
+         start_from: int = 0,
+     ) -> BatchProgress:
+         """Add column with default value in batches.
+
+         PostgreSQL 11+ adds columns with defaults instantly, but
+         backfilling existing NULL rows can lock the table. This
+         does the backfill in batches.
+
+         Args:
+             table: Table name
+             column: Column name
+             column_type: Column type (e.g., "TEXT", "INTEGER")
+             default: Default value expression
+             start_from: Resume from this row count (for resumption)
+
+         Returns:
+             BatchProgress with operation result
+         """
+         start_time = time.perf_counter()
+
+         with self.connection.cursor() as cur:
+             # Add column without default first (instant in PG 11+)
+             cur.execute(
+                 f"""
+                 ALTER TABLE {table}
+                 ADD COLUMN IF NOT EXISTS {column} {column_type}
+                 """
+             )
+             self.connection.commit()
+
+             # Get total rows needing update
+             cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {column} IS NULL")
+             total_rows = cur.fetchone()[0]
+
+             if total_rows == 0:
+                 return BatchProgress(total_rows=0)
+
+             total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
+             processed = start_from
+             progress = BatchProgress(
+                 total_rows=total_rows,
+                 processed_rows=processed,
+                 total_batches=total_batches,
+             )
+
+             batch_num = start_from // self.config.batch_size
+
+             while processed < total_rows:
+                 batch_num += 1
+
+                 for attempt in range(self.config.max_retries):
+                     try:
+                         # Update batch using ctid for efficiency
+                         cur.execute(
+                             f"""
+                             UPDATE {table}
+                             SET {column} = {default}
+                             WHERE ctid IN (
+                                 SELECT ctid FROM {table}
+                                 WHERE {column} IS NULL
+                                 LIMIT {self.config.batch_size}
+                             )
+                             """
+                         )
+                         rows_affected = cur.rowcount
+                         self.connection.commit()
+                         break
+                     except Exception as e:
+                         self.connection.rollback()
+                         if attempt == self.config.max_retries - 1:
+                             progress.errors.append(f"Batch {batch_num}: {e}")
+                             raise
+                         logger.warning(f"Batch {batch_num} failed, retrying: {e}")
+                         time.sleep(self.config.sleep_between_batches * 2)
+
+                 processed += rows_affected
+                 progress.processed_rows = processed
+                 progress.current_batch = batch_num
+                 progress.elapsed_seconds = time.perf_counter() - start_time
+
+                 if self.config.progress_callback:
+                     self.config.progress_callback(processed, total_rows)
+
+                 if self.config.checkpoint_callback:
+                     self.config.checkpoint_callback(processed)
+
+                 logger.info(
+                     f"Batch {batch_num}/{total_batches}: "
+                     f"{progress.percent_complete:.1f}% complete "
+                     f"({progress.rows_per_second:.0f} rows/sec)"
+                 )
+
+                 if rows_affected == 0:
+                     break
+
+                 if self.config.sleep_between_batches > 0:
+                     time.sleep(self.config.sleep_between_batches)
+
+             # Set default for future inserts
+             cur.execute(
+                 f"""
+                 ALTER TABLE {table}
+                 ALTER COLUMN {column} SET DEFAULT {default}
+                 """
+             )
+             self.connection.commit()
+
+         progress.elapsed_seconds = time.perf_counter() - start_time
+         return progress
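Building on the checkpoint sketch above, a resumable run might look like this (assuming a psycopg-style connection `conn`; the names are illustrative):

    import json
    import os

    start_from = 0
    if os.path.exists("/tmp/backfill.ckpt"):
        with open("/tmp/backfill.ckpt") as f:
            start_from = json.load(f)["processed_rows"]

    batched = BatchedMigration(conn, config)
    progress = batched.add_column_with_default(
        table="users",
        column="status",
        column_type="TEXT",
        default="'active'",
        start_from=start_from,  # restores progress accounting; the IS NULL
                                # predicate skips rows already backfilled
    )
    print(progress.to_dict())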
+
+     def backfill_column(
+         self,
+         table: str,
+         column: str,
+         expression: str,
+         where_clause: str = "TRUE",
+         start_from: int = 0,
+     ) -> BatchProgress:
+         """Backfill column values in batches.
+
+         Example:
+             >>> progress = batched.backfill_column(
+             ...     table="orders",
+             ...     column="total_cents",
+             ...     expression="(subtotal + tax) * 100",
+             ...     where_clause="total_cents IS NULL"
+             ... )
+         """
+         start_time = time.perf_counter()
+
+         with self.connection.cursor() as cur:
+             # Get total rows
+             cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
+             total_rows = cur.fetchone()[0]
+
+             if total_rows == 0:
+                 return BatchProgress(total_rows=0)
+
+             total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
+             processed = start_from
+             progress = BatchProgress(
+                 total_rows=total_rows,
+                 processed_rows=processed,
+                 total_batches=total_batches,
+             )
+
+             batch_num = start_from // self.config.batch_size
+
+             while True:
+                 batch_num += 1
+
+                 cur.execute(
+                     f"""
+                     UPDATE {table}
+                     SET {column} = {expression}
+                     WHERE ctid IN (
+                         SELECT ctid FROM {table}
+                         WHERE {where_clause}
+                         LIMIT {self.config.batch_size}
+                     )
+                     """
+                 )
+
+                 rows_affected = cur.rowcount
+                 if rows_affected == 0:
+                     break
+
+                 self.connection.commit()
+                 processed += rows_affected
+                 progress.processed_rows = processed
+                 progress.current_batch = batch_num
+                 progress.elapsed_seconds = time.perf_counter() - start_time
+
+                 if self.config.progress_callback:
+                     self.config.progress_callback(processed, total_rows)
+
+                 if self.config.checkpoint_callback:
+                     self.config.checkpoint_callback(processed)
+
+                 logger.info(
+                     f"Backfill batch {batch_num}: {progress.percent_complete:.1f}% complete"
+                 )
+
+                 if self.config.sleep_between_batches > 0:
+                     time.sleep(self.config.sleep_between_batches)
+
+         progress.elapsed_seconds = time.perf_counter() - start_time
+         return progress
+
+     def delete_in_batches(
+         self,
+         table: str,
+         where_clause: str,
+         start_from: int = 0,
+     ) -> BatchProgress:
+         """Delete rows in batches to avoid long locks.
+
+         Example:
+             >>> progress = batched.delete_in_batches(
+             ...     table="audit_logs",
+             ...     where_clause="created_at < NOW() - INTERVAL '1 year'"
+             ... )
+         """
+         start_time = time.perf_counter()
+
+         with self.connection.cursor() as cur:
+             cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
+             total_rows = cur.fetchone()[0]
+
+             if total_rows == 0:
+                 return BatchProgress(total_rows=0)
+
+             total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
+             processed = start_from
+             progress = BatchProgress(
+                 total_rows=total_rows,
+                 processed_rows=processed,
+                 total_batches=total_batches,
+             )
+
+             batch_num = start_from // self.config.batch_size
+
+             while True:
+                 batch_num += 1
+
+                 cur.execute(
+                     f"""
+                     DELETE FROM {table}
+                     WHERE ctid IN (
+                         SELECT ctid FROM {table}
+                         WHERE {where_clause}
+                         LIMIT {self.config.batch_size}
+                     )
+                     """
+                 )
+
+                 rows_deleted = cur.rowcount
+                 if rows_deleted == 0:
+                     break
+
+                 self.connection.commit()
+                 processed += rows_deleted
+                 progress.processed_rows = processed
+                 progress.current_batch = batch_num
+                 progress.elapsed_seconds = time.perf_counter() - start_time
+
+                 if self.config.progress_callback:
+                     self.config.progress_callback(processed, total_rows)
+
+                 if self.config.checkpoint_callback:
+                     self.config.checkpoint_callback(processed)
+
+                 logger.info(f"Delete batch {batch_num}: {progress.percent_complete:.1f}% complete")
+
+                 if self.config.sleep_between_batches > 0:
+                     time.sleep(self.config.sleep_between_batches)
+
+         progress.elapsed_seconds = time.perf_counter() - start_time
+         return progress
+
+     def copy_to_new_table(
+         self,
+         source_table: str,
+         target_table: str,
+         columns: list[str] | None = None,
+         transform: dict[str, str] | None = None,
+         where_clause: str = "TRUE",
+     ) -> BatchProgress:
+         """Copy data to a new table in batches.
+
+         Useful for table restructuring without blocking reads on source.
+
+         Args:
+             source_table: Source table name
+             target_table: Target table name (must exist)
+             columns: Columns to copy (None = all)
+             transform: Column transformations {col: expression}
+             where_clause: Filter condition
+
+         Example:
+             >>> progress = batched.copy_to_new_table(
+             ...     source_table="users",
+             ...     target_table="users_new",
+             ...     transform={"email": "LOWER(email)"}
+             ... )
+         """
+         start_time = time.perf_counter()
+
+         with self.connection.cursor() as cur:
+             # Get total rows
+             cur.execute(f"SELECT COUNT(*) FROM {source_table} WHERE {where_clause}")
+             total_rows = cur.fetchone()[0]
+
+             if total_rows == 0:
+                 return BatchProgress(total_rows=0)
+
+             # Get columns if not specified
+             if columns is None:
+                 cur.execute(
+                     """
+                     SELECT column_name FROM information_schema.columns
+                     WHERE table_name = %s AND table_schema = 'public'
+                     ORDER BY ordinal_position
+                     """,
+                     (source_table,),
+                 )
+                 columns = [row[0] for row in cur.fetchall()]
+
+             # Build select expressions
+             transform = transform or {}
+             select_exprs = [transform.get(col, col) for col in columns]
+             select_str = ", ".join(select_exprs)
+             columns_str = ", ".join(columns)
+
+             # Confirm at least one matching row exists before batching
+             cur.execute(f"SELECT MIN(ctid) FROM {source_table} WHERE {where_clause}")
+             result = cur.fetchone()
+             if result[0] is None:
+                 return BatchProgress(total_rows=0)
+
+             total_batches = (total_rows + self.config.batch_size - 1) // self.config.batch_size
+             processed = 0
+             progress = BatchProgress(
+                 total_rows=total_rows,
+                 total_batches=total_batches,
+             )
+             batch_num = 0
+
+             # Use a temporary tracking table for batching
+             cur.execute(
+                 f"""
+                 CREATE TEMP TABLE _batch_tracker AS
+                 SELECT ctid as row_ctid, ROW_NUMBER() OVER () as rn
+                 FROM {source_table}
+                 WHERE {where_clause}
+                 """
+             )
+             self.connection.commit()
+
+             try:
+                 while processed < total_rows:
+                     batch_num += 1
+                     offset = processed
+
+                     cur.execute(
+                         f"""
+                         INSERT INTO {target_table} ({columns_str})
+                         SELECT {select_str}
+                         FROM {source_table} s
+                         WHERE s.ctid IN (
+                             SELECT row_ctid FROM _batch_tracker
+                             WHERE rn > %s AND rn <= %s
+                         )
+                         """,
+                         (offset, offset + self.config.batch_size),
+                     )
+
+                     rows_inserted = cur.rowcount
+                     if rows_inserted == 0:
+                         break
+
+                     self.connection.commit()
+                     processed += rows_inserted
+                     progress.processed_rows = processed
+                     progress.current_batch = batch_num
+                     progress.elapsed_seconds = time.perf_counter() - start_time
+
+                     if self.config.progress_callback:
+                         self.config.progress_callback(processed, total_rows)
+
+                     logger.info(
+                         f"Copy batch {batch_num}/{total_batches}: "
+                         f"{progress.percent_complete:.1f}% complete"
+                     )
+
+                     if self.config.sleep_between_batches > 0:
+                         time.sleep(self.config.sleep_between_batches)
+
+             finally:
+                 cur.execute("DROP TABLE IF EXISTS _batch_tracker")
+                 self.connection.commit()
+
+         progress.elapsed_seconds = time.perf_counter() - start_time
+         return progress
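Once the copy completes, the usual follow-up is a rename swap in one short transaction. A hedged sketch with example table names (the swap itself is standard PostgreSQL, not a method of this class):

    progress = batched.copy_to_new_table(
        source_table="users",
        target_table="users_new",
        transform={"email": "LOWER(email)"},
    )
    assert progress.is_complete  # verify before swapping

    with conn.cursor() as cur:
        cur.execute("ALTER TABLE users RENAME TO users_old")
        cur.execute("ALTER TABLE users_new RENAME TO users")
    conn.commit()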
+
+
+ class OnlineIndexBuilder:
+     """Build indexes without blocking writes.
+
+     Provides utilities for creating, dropping, and rebuilding indexes
+     using CONCURRENTLY operations to avoid blocking writes.
+
+     Example:
+         >>> builder = OnlineIndexBuilder(conn)
+         >>> index_name = builder.create_index_concurrently(
+         ...     table="users",
+         ...     columns=["email"],
+         ...     unique=True
+         ... )
+     """
+
+     def __init__(self, connection: Any):
+         """Initialize index builder.
+
+         Args:
+             connection: Database connection
+         """
+         self.connection = connection
+
+     def create_index_concurrently(
+         self,
+         table: str,
+         columns: list[str],
+         index_name: str | None = None,
+         unique: bool = False,
+         where: str | None = None,
+         method: str = "btree",
+         include: list[str] | None = None,
+     ) -> str:
+         """Create index without blocking writes.
+
+         Note: Requires autocommit mode. This method handles that automatically.
+
+         Args:
+             table: Table name
+             columns: Columns to index
+             index_name: Optional index name (auto-generated if not provided)
+             unique: Create unique index
+             where: Partial index condition
+             method: Index method (btree, hash, gin, gist, etc.)
+             include: Additional columns to include (covering index)
+
+         Returns:
+             Name of created index
+         """
+         if index_name is None:
+             col_names = "_".join(columns)
+             index_name = f"idx_{table}_{col_names}"
+
+         unique_str = "UNIQUE " if unique else ""
+         columns_str = ", ".join(columns)
+         where_str = f" WHERE {where}" if where else ""
+         include_str = f" INCLUDE ({', '.join(include)})" if include else ""
+
+         # Must use autocommit for CONCURRENTLY
+         old_autocommit = self.connection.autocommit
+         self.connection.autocommit = True
+
+         try:
+             with self.connection.cursor() as cur:
+                 sql = f"""
+                     CREATE {unique_str}INDEX CONCURRENTLY IF NOT EXISTS
+                     {index_name} ON {table} USING {method} ({columns_str})
+                     {include_str}{where_str}
+                 """
+                 logger.info(f"Creating index: {index_name}")
+                 cur.execute(sql)
+                 logger.info(f"Index created: {index_name}")
+         finally:
+             self.connection.autocommit = old_autocommit
+
+         return index_name
+
+     def drop_index_concurrently(self, index_name: str) -> None:
+         """Drop index without blocking writes.
+
+         Args:
+             index_name: Name of index to drop
+         """
+         old_autocommit = self.connection.autocommit
+         self.connection.autocommit = True
+
+         try:
+             with self.connection.cursor() as cur:
+                 logger.info(f"Dropping index: {index_name}")
+                 cur.execute(f"DROP INDEX CONCURRENTLY IF EXISTS {index_name}")
+                 logger.info(f"Index dropped: {index_name}")
+         finally:
+             self.connection.autocommit = old_autocommit
+
+     def reindex_concurrently(self, index_name: str) -> None:
+         """Rebuild index without blocking writes (PG 12+).
+
+         Args:
+             index_name: Name of index to rebuild
+         """
+         old_autocommit = self.connection.autocommit
+         self.connection.autocommit = True
+
+         try:
+             with self.connection.cursor() as cur:
+                 logger.info(f"Reindexing: {index_name}")
+                 cur.execute(f"REINDEX INDEX CONCURRENTLY {index_name}")
+                 logger.info(f"Reindex complete: {index_name}")
+         finally:
+             self.connection.autocommit = old_autocommit
+
+     def check_index_validity(self, index_name: str) -> bool:
+         """Check if index is valid (not corrupted/invalid from failed creation).
+
+         Args:
+             index_name: Name of index to check
+
+         Returns:
+             True if index is valid
+         """
+         with self.connection.cursor() as cur:
+             cur.execute(
+                 """
+                 SELECT indisvalid
+                 FROM pg_index
+                 JOIN pg_class ON pg_index.indexrelid = pg_class.oid
+                 WHERE pg_class.relname = %s
+                 """,
+                 (index_name,),
+             )
+             result = cur.fetchone()
+             if result is None:
+                 return False
+             return result[0]
+
+     def get_index_size(self, index_name: str) -> int:
+         """Get index size in bytes.
+
+         Args:
+             index_name: Name of index
+
+         Returns:
+             Size in bytes
+         """
+         with self.connection.cursor() as cur:
+             cur.execute(
+                 "SELECT pg_relation_size(%s)",
+                 (index_name,),
+             )
+             result = cur.fetchone()
+             return result[0] if result else 0
+
+
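A failed CONCURRENTLY build leaves an INVALID index behind, which `check_index_validity` above detects. A sensible calling pattern, sketched here rather than provided by the package, is verify-then-retry:

    builder = OnlineIndexBuilder(conn)
    name = builder.create_index_concurrently(table="users", columns=["email"])
    if not builder.check_index_validity(name):
        builder.drop_index_concurrently(name)  # clear the invalid leftover
        name = builder.create_index_concurrently(table="users", columns=["email"])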
+ class TableSizeEstimator:
+     """Estimate table sizes for migration planning.
+
+     Helps decide whether to use batched operations based on
+     table size.
+     """
+
+     # Threshold in rows for using batched operations
+     LARGE_TABLE_THRESHOLD = 100_000
+
+     def __init__(self, connection: Any):
+         """Initialize estimator.
+
+         Args:
+             connection: Database connection
+         """
+         self.connection = connection
+
+     def get_row_count_estimate(self, table: str) -> int:
+         """Get estimated row count (fast but approximate).
+
+         Uses pg_class statistics rather than COUNT(*).
+
+         Args:
+             table: Table name
+
+         Returns:
+             Estimated row count
+         """
+         with self.connection.cursor() as cur:
+             cur.execute(
+                 """
+                 SELECT reltuples::bigint
+                 FROM pg_class
+                 WHERE relname = %s
+                 """,
+                 (table,),
+             )
+             result = cur.fetchone()
+             return max(0, result[0]) if result else 0
+
+     def get_exact_row_count(self, table: str, where_clause: str = "TRUE") -> int:
+         """Get exact row count (slow but accurate).
+
+         Args:
+             table: Table name
+             where_clause: Optional filter
+
+         Returns:
+             Exact row count
+         """
+         with self.connection.cursor() as cur:
+             cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {where_clause}")
+             return cur.fetchone()[0]
+
+     def get_table_size(self, table: str) -> dict[str, int]:
+         """Get table size information.
+
+         Args:
+             table: Table name
+
+         Returns:
+             Dictionary with size information
+         """
+         with self.connection.cursor() as cur:
+             cur.execute(
+                 """
+                 SELECT
+                     pg_table_size(%s) as table_size,
+                     pg_indexes_size(%s) as index_size,
+                     pg_total_relation_size(%s) as total_size
+                 """,
+                 (table, table, table),
+             )
+             row = cur.fetchone()
+             return {
+                 "table_size_bytes": row[0],
+                 "index_size_bytes": row[1],
+                 "total_size_bytes": row[2],
+             }
+
+     def should_use_batched_operation(self, table: str) -> bool:
+         """Determine if batched operations should be used.
+
+         Args:
+             table: Table name
+
+         Returns:
+             True if table is large enough to warrant batching
+         """
+         estimate = self.get_row_count_estimate(table)
+         return estimate >= self.LARGE_TABLE_THRESHOLD
+
+     def estimate_operation_time(
+         self,
+         table: str,
+         rows_per_second: float = 10000.0,
+     ) -> float:
+         """Estimate time for a full-table operation.
+
+         Args:
+             table: Table name
+             rows_per_second: Expected processing rate
+
+         Returns:
+             Estimated seconds
+         """
+         estimate = self.get_row_count_estimate(table)
+         if rows_per_second <= 0:
+             return 0.0
+         return estimate / rows_per_second
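Tying the estimator to the batching machinery, a hypothetical planning snippet (again assuming a connection `conn`; the table name is an example):

    estimator = TableSizeEstimator(conn)
    if estimator.should_use_batched_operation("events"):
        eta = estimator.estimate_operation_time("events", rows_per_second=10_000.0)
        print(f"Large table: batching, ~{eta:.0f}s estimated")
        batched = BatchedMigration(conn, BatchConfig(batch_size=10_000))
    else:
        print("Small table: a single-transaction migration is fine")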