starrocks-br 0.5.2__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
starrocks_br/prune.py ADDED
@@ -0,0 +1,208 @@
1
+ # Copyright 2025 deep-bi
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from datetime import datetime
16
+
17
+ from . import logger
18
+
19
+
20
+ def get_successful_backups(
21
+ db, repository: str, group: str = None, ops_database: str = "ops"
22
+ ) -> list[dict]:
23
+ """Get all successful backups from backup_history, optionally filtered by group.
24
+
25
+ Args:
26
+ db: Database connection
27
+ repository: Repository name to filter by
28
+ group: Optional inventory group to filter by
29
+ ops_database: Name of the ops database (defaults to "ops")
30
+
31
+ Returns:
32
+ List of backup records as dicts with keys: label, finished_at, inventory_group (if group filtering is used)
33
+ """
34
+ if group:
35
+ sql = f"""
36
+ SELECT DISTINCT
37
+ bh.label,
38
+ bh.finished_at,
39
+ ti.inventory_group
40
+ FROM {ops_database}.backup_history bh
41
+ INNER JOIN {ops_database}.backup_partitions bp ON bh.label = bp.label
42
+ INNER JOIN {ops_database}.table_inventory ti
43
+ ON bp.database_name = ti.database_name
44
+ AND (bp.table_name = ti.table_name OR ti.table_name = '*')
45
+ WHERE bh.repository = '{repository}'
46
+ AND bh.status = 'FINISHED'
47
+ AND ti.inventory_group = '{group}'
48
+ ORDER BY bh.finished_at ASC
49
+ """
50
+ else:
51
+ sql = f"""
52
+ SELECT
53
+ label,
54
+ finished_at
55
+ FROM {ops_database}.backup_history
56
+ WHERE repository = '{repository}'
57
+ AND status = 'FINISHED'
58
+ ORDER BY finished_at ASC
59
+ """
60
+
61
+ rows = db.query(sql)
62
+ results = []
63
+
64
+ for row in rows:
65
+ if group:
66
+ results.append({"label": row[0], "finished_at": str(row[1]), "inventory_group": row[2]})
67
+ else:
68
+ results.append({"label": row[0], "finished_at": str(row[1])})
69
+
70
+ return results
71
+
72
+
73
def filter_snapshots_to_delete(
    all_snapshots: list[dict], strategy: str, **kwargs
) -> list[dict]:
    """Filter snapshots based on pruning strategy.

    Args:
        all_snapshots: List of snapshot dicts (must be sorted by finished_at ASC;
            the 'keep_last' strategy relies on that order). Each dict needs a
            'label' key, and a 'finished_at' key for the 'older_than' strategy.
        strategy: Pruning strategy - 'keep_last', 'older_than', 'specific', or 'multiple'
        **kwargs: Strategy-specific parameters:
            - keep_last: 'count' (int) - number of backups to keep
            - older_than: 'timestamp' (str) - timestamp in 'YYYY-MM-DD HH:MM:SS' format
            - specific: 'snapshot' (str) - specific snapshot name
            - multiple: 'snapshots' (list) - list of snapshot names

    Returns:
        List of snapshots to delete (possibly empty), in their original order.

    Raises:
        ValueError: If the strategy is unknown, its required parameter is
            missing or invalid, or a timestamp cannot be parsed.
    """
    if strategy == "keep_last":
        count = kwargs.get("count")
        if count is None or count <= 0:
            raise ValueError("keep_last strategy requires a positive count")

        # Keep the last N, delete the rest
        if len(all_snapshots) <= count:
            return []
        return all_snapshots[:-count]

    if strategy == "older_than":
        timestamp_str = kwargs.get("timestamp")
        if not timestamp_str:
            raise ValueError("older_than strategy requires a timestamp")

        try:
            cutoff = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        except ValueError as e:
            raise ValueError(
                f"Invalid timestamp format '{timestamp_str}'. Expected 'YYYY-MM-DD HH:MM:SS'"
            ) from e

        # Strictly older than the cutoff; a snapshot finished exactly at the
        # cutoff is kept.
        return [
            snapshot
            for snapshot in all_snapshots
            if _parse_finished_at(snapshot["finished_at"]) < cutoff
        ]

    if strategy == "specific":
        snapshot_name = kwargs.get("snapshot")
        if not snapshot_name:
            raise ValueError("specific strategy requires a snapshot name")

        # Only the first snapshot carrying this label is returned.
        for snapshot in all_snapshots:
            if snapshot["label"] == snapshot_name:
                return [snapshot]
        return []

    if strategy == "multiple":
        snapshot_names = kwargs.get("snapshots")
        if not snapshot_names:
            raise ValueError("multiple strategy requires a list of snapshot names")

        return [snapshot for snapshot in all_snapshots if snapshot["label"] in snapshot_names]

    raise ValueError(f"Unknown pruning strategy: {strategy}")


def _parse_finished_at(value: str) -> datetime:
    """Parse a snapshot's finished_at string, with or without fractional seconds."""
    try:
        return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # str() of a datetime with non-zero microseconds appends ".ffffff",
        # which the seconds-only format above rejects.
        return datetime.strptime(value, "%Y-%m-%d %H:%M:%S.%f")
145
+
146
+
147
def verify_snapshot_exists(db, repository: str, snapshot_name: str) -> bool:
    """Verify that a snapshot exists in the repository.

    Args:
        db: Database connection; ``db.query(sql)`` is used to run SHOW SNAPSHOT.
        repository: Repository name (interpolated into the SQL as-is).
        snapshot_name: Snapshot name to verify.

    Returns:
        True if the query returned at least one row. This function never
        returns False: a missing snapshot is reported by raising.

    Raises:
        Exception: If the snapshot is not found, or if the query itself fails.
            Either way the failure is logged before being re-raised.
    """
    # Escape backslashes first, then quotes, so a name containing either
    # cannot terminate the string literal early.
    escaped_name = snapshot_name.replace("\\", "\\\\").replace("'", "\\'")
    sql = f"SHOW SNAPSHOT ON {repository} WHERE SNAPSHOT = '{escaped_name}'"

    try:
        rows = db.query(sql)
        if not rows:
            raise Exception(f"Snapshot '{snapshot_name}' not found in repository '{repository}'")
        return True
    except Exception as e:
        # Also catches the "not found" exception raised just above, so that
        # case is logged too.
        logger.error(f"Failed to verify snapshot '{snapshot_name}': {e}")
        raise
171
+
172
+
173
def execute_drop_snapshot(db, repository: str, snapshot_name: str) -> None:
    """Execute DROP SNAPSHOT command for a single snapshot.

    Args:
        db: Database connection; ``db.execute(sql)`` runs the statement.
        repository: Repository name (interpolated into the SQL as-is).
        snapshot_name: Snapshot name to delete.

    Raises:
        Exception: Whatever the deletion raised; it is logged and re-raised
            unchanged.
    """
    # This statement is destructive, so make sure the name cannot break out of
    # its string literal: escape backslashes first, then single quotes.
    escaped_name = snapshot_name.replace("\\", "\\\\").replace("'", "\\'")
    sql = f"DROP SNAPSHOT ON {repository} WHERE SNAPSHOT = '{escaped_name}'"

    try:
        logger.info(f"Deleting snapshot: {snapshot_name}")
        db.execute(sql)
        logger.success(f"Successfully deleted snapshot: {snapshot_name}")
    except Exception as e:
        logger.error(f"Failed to delete snapshot '{snapshot_name}': {e}")
        raise
193
+
194
+
195
def cleanup_backup_history(db, snapshot_label: str, ops_database: str = "ops") -> None:
    """Remove backup history entry after snapshot deletion.

    Deletes the label's rows from backup_partitions first, then from
    backup_history. This is best-effort: any failure is logged as a warning
    and is not re-raised.

    Args:
        db: Database connection; ``db.execute(sql)`` runs each DELETE.
        snapshot_label: Snapshot label to remove from history.
        ops_database: Name of the ops database (defaults to "ops"). It is
            interpolated into the SQL as-is, without quoting.
    """
    try:
        # Escape backslashes first, then quotes, so the label stays inside its
        # string literal in both DELETE statements.
        escaped_label = snapshot_label.replace("\\", "\\\\").replace("'", "\\'")
        db.execute(f"DELETE FROM {ops_database}.backup_partitions WHERE label = '{escaped_label}'")
        db.execute(f"DELETE FROM {ops_database}.backup_history WHERE label = '{escaped_label}'")
        logger.debug(f"Cleaned up backup history for: {snapshot_label}")
    except Exception as e:
        logger.warning(f"Failed to cleanup backup history for '{snapshot_label}': {e}")
@@ -30,7 +30,7 @@ def ensure_repository(db, name: str) -> None:
30
30
  raise RuntimeError(
31
31
  f"Repository '{name}' not found. Please create it first using:\n"
32
32
  f" CREATE REPOSITORY {name} WITH BROKER ON LOCATION '...' PROPERTIES(...)\n"
33
- f"For examples, see: https://docs.starrocks.io/docs/sql-reference/sql-statements/data-definition/backup_restore/CREATE_REPOSITORY/"
33
+ f"For examples, see: https://docs.starrocks.io/docs/sql-reference/sql-statements/backup_restore/CREATE_REPOSITORY/"
34
34
  )
35
35
 
36
36
  # SHOW REPOSITORIES returns: RepoId, RepoName, CreateTime, IsReadOnly, Location, Broker, ErrMsg
starrocks_br/restore.py CHANGED
@@ -201,9 +201,22 @@ def execute_restore(
201
201
  max_polls: int = MAX_POLLS,
202
202
  poll_interval: float = 1.0,
203
203
  scope: str = "restore",
204
+ ops_database: str = "ops",
204
205
  ) -> dict:
205
206
  """Execute a complete restore workflow: submit command and monitor progress.
206
207
 
208
+ Args:
209
+ db: Database connection
210
+ restore_command: Restore SQL command to execute
211
+ backup_label: Label of the backup being restored
212
+ restore_type: Type of restore operation
213
+ repository: Repository name
214
+ database: Database name
215
+ max_polls: Maximum polling attempts
216
+ poll_interval: Seconds between polls
217
+ scope: Job scope (for concurrency control)
218
+ ops_database: Name of ops database (default: "ops")
219
+
207
220
  Returns dictionary with keys: success, final_status, error_message
208
221
  """
209
222
  cluster_tz = db.timezone
@@ -240,13 +253,18 @@ def execute_restore(
240
253
  "finished_at": finished_at,
241
254
  "error_message": None if success else final_status["state"],
242
255
  },
256
+ ops_database=ops_database,
243
257
  )
244
258
  except Exception as e:
245
259
  logger.error(f"Failed to log restore history: {str(e)}")
246
260
 
247
261
  try:
248
262
  concurrency.complete_job_slot(
249
- db, scope=scope, label=label, final_state=final_status["state"]
263
+ db,
264
+ scope=scope,
265
+ label=label,
266
+ final_state=final_status["state"],
267
+ ops_database=ops_database,
250
268
  )
251
269
  except Exception as e:
252
270
  logger.error(f"Failed to complete job slot: {str(e)}")
@@ -264,7 +282,7 @@ def execute_restore(
264
282
  return {"success": False, "final_status": None, "error_message": str(e)}
265
283
 
266
284
 
267
- def find_restore_pair(db, target_label: str) -> list[str]:
285
+ def find_restore_pair(db, target_label: str, ops_database: str = "ops") -> list[str]:
268
286
  """Find the correct sequence of backups needed for restore.
269
287
 
270
288
  Args:
@@ -280,7 +298,7 @@ def find_restore_pair(db, target_label: str) -> list[str]:
280
298
  """
281
299
  query = f"""
282
300
  SELECT label, backup_type, finished_at
283
- FROM ops.backup_history
301
+ FROM {ops_database}.backup_history
284
302
  WHERE label = {utils.quote_value(target_label)}
285
303
  AND status = 'FINISHED'
286
304
  """
@@ -299,7 +317,7 @@ def find_restore_pair(db, target_label: str) -> list[str]:
299
317
 
300
318
  full_backup_query = f"""
301
319
  SELECT label, backup_type, finished_at
302
- FROM ops.backup_history
320
+ FROM {ops_database}.backup_history
303
321
  WHERE backup_type = 'full'
304
322
  AND status = 'FINISHED'
305
323
  AND label LIKE {utils.quote_value(f"{database_name}_%")}
@@ -326,6 +344,7 @@ def get_tables_from_backup(
326
344
  group: str | None = None,
327
345
  table: str | None = None,
328
346
  database: str | None = None,
347
+ ops_database: str = "ops",
329
348
  ) -> list[str]:
330
349
  """Get list of tables to restore from backup manifest.
331
350
 
@@ -354,7 +373,7 @@ def get_tables_from_backup(
354
373
 
355
374
  query = f"""
356
375
  SELECT DISTINCT database_name, table_name
357
- FROM ops.backup_partitions
376
+ FROM {ops_database}.backup_partitions
358
377
  WHERE label = {utils.quote_value(label)}
359
378
  ORDER BY database_name, table_name
360
379
  """
@@ -377,7 +396,7 @@ def get_tables_from_backup(
377
396
  if group:
378
397
  group_query = f"""
379
398
  SELECT database_name, table_name
380
- FROM ops.table_inventory
399
+ FROM {ops_database}.table_inventory
381
400
  WHERE inventory_group = {utils.quote_value(group)}
382
401
  """
383
402
 
@@ -404,6 +423,35 @@ def get_tables_from_backup(
404
423
  return tables
405
424
 
406
425
 
426
def get_partitions_from_backup(
    db, label: str, table: str, ops_database: str = "ops"
) -> list[str]:
    """Look up the partitions recorded for one table under a backup label.

    Args:
        db: Database connection; ``db.query(sql)`` is iterated row by row.
        label: Backup label
        table: Table name in format 'database.table' (split on the first dot)
        ops_database: Operations database name

    Returns:
        Partition names for the table in this backup, ordered by
        partition_name (first column of each returned row).
    """
    source_db, source_table = table.split(".", 1)

    sql = f"""
    SELECT partition_name
    FROM {ops_database}.backup_partitions
    WHERE label = {utils.quote_value(label)}
    AND database_name = {utils.quote_value(source_db)}
    AND table_name = {utils.quote_value(source_table)}
    ORDER BY partition_name
    """

    partition_names = []
    for record in db.query(sql):
        partition_names.append(record[0])
    return partition_names
453
+
454
+
407
455
  def execute_restore_flow(
408
456
  db,
409
457
  repo_name: str,
@@ -411,6 +459,7 @@ def execute_restore_flow(
411
459
  tables_to_restore: list[str],
412
460
  rename_suffix: str = "_restored",
413
461
  skip_confirmation: bool = False,
462
+ ops_database: str = "ops",
414
463
  ) -> dict:
415
464
  """Execute the complete restore flow with safety measures.
416
465
 
@@ -421,6 +470,7 @@ def execute_restore_flow(
421
470
  tables_to_restore: List of tables to restore (format: database.table)
422
471
  rename_suffix: Suffix for temporary tables
423
472
  skip_confirmation: If True, skip interactive confirmation prompt
473
+ ops_database: Name of ops database (default: "ops")
424
474
 
425
475
  Returns:
426
476
  Dictionary with success status and details
@@ -452,59 +502,122 @@ def execute_restore_flow(
452
502
  database_name = tables_to_restore[0].split(".")[0]
453
503
 
454
504
  base_label = restore_pair[0]
455
- logger.info("")
456
- logger.info(f"Step 1: Restoring base backup '{base_label}'...")
457
-
458
- base_timestamp = get_snapshot_timestamp(db, repo_name, base_label)
459
-
460
- base_restore_command = _build_restore_command_with_rename(
461
- base_label, repo_name, tables_to_restore, rename_suffix, database_name, base_timestamp
462
- )
463
505
 
464
- base_result = execute_restore(
465
- db, base_restore_command, base_label, "full", repo_name, database_name, scope="restore"
466
- )
467
-
468
- if not base_result["success"]:
469
- return {
470
- "success": False,
471
- "error_message": f"Base restore failed: {base_result['error_message']}",
472
- }
473
-
474
- logger.success("Base restore completed successfully")
506
+ tables_in_base = get_tables_from_backup(db, base_label, ops_database=ops_database)
507
+ tables_to_restore_from_base = [t for t in tables_to_restore if t in tables_in_base]
475
508
 
476
- if len(restore_pair) > 1:
477
- incremental_label = restore_pair[1]
509
+ if tables_to_restore_from_base:
478
510
  logger.info("")
479
- logger.info(f"Step 2: Applying incremental backup '{incremental_label}'...")
511
+ logger.info(f"Step 1: Restoring base backup '{base_label}'...")
480
512
 
481
- incremental_timestamp = get_snapshot_timestamp(db, repo_name, incremental_label)
513
+ base_timestamp = get_snapshot_timestamp(db, repo_name, base_label)
482
514
 
483
- incremental_restore_command = _build_restore_command_without_rename(
484
- incremental_label,
515
+ base_restore_command = _build_restore_command_with_rename(
516
+ base_label,
485
517
  repo_name,
486
- tables_to_restore,
518
+ tables_to_restore_from_base,
519
+ rename_suffix,
487
520
  database_name,
488
- incremental_timestamp,
521
+ base_timestamp,
489
522
  )
490
523
 
491
- incremental_result = execute_restore(
524
+ base_result = execute_restore(
492
525
  db,
493
- incremental_restore_command,
494
- incremental_label,
495
- "incremental",
526
+ base_restore_command,
527
+ base_label,
528
+ "full",
496
529
  repo_name,
497
530
  database_name,
498
531
  scope="restore",
532
+ ops_database=ops_database,
499
533
  )
500
534
 
501
- if not incremental_result["success"]:
535
+ if not base_result["success"]:
502
536
  return {
503
537
  "success": False,
504
- "error_message": f"Incremental restore failed: {incremental_result['error_message']}",
538
+ "error_message": f"Base restore failed: {base_result['error_message']}",
505
539
  }
506
540
 
507
- logger.success("Incremental restore completed successfully")
541
+ logger.success("Base restore completed successfully")
542
+ else:
543
+ logger.info("")
544
+ logger.info(
545
+ f"Step 1: Skipping base backup '{base_label}' (no requested tables in this backup)"
546
+ )
547
+
548
+ if len(restore_pair) > 1:
549
+ incremental_label = restore_pair[1]
550
+
551
+ tables_in_incremental = get_tables_from_backup(
552
+ db, incremental_label, ops_database=ops_database
553
+ )
554
+ tables_to_restore_from_incremental = [
555
+ t for t in tables_to_restore if t in tables_in_incremental
556
+ ]
557
+
558
+ if not tables_to_restore_from_incremental:
559
+ logger.info("")
560
+ logger.info(
561
+ f"Step 2: Skipping incremental backup '{incremental_label}' (no requested tables in this backup)"
562
+ )
563
+ else:
564
+ logger.info("")
565
+ logger.info(f"Step 2: Applying incremental backup '{incremental_label}'...")
566
+
567
+ incremental_timestamp = get_snapshot_timestamp(db, repo_name, incremental_label)
568
+
569
+ for table in tables_to_restore_from_incremental:
570
+ partitions = get_partitions_from_backup(
571
+ db, incremental_label, table, ops_database=ops_database
572
+ )
573
+
574
+ if not partitions:
575
+ logger.warning(f"No partitions found for {table} in {incremental_label}, skipping")
576
+ continue
577
+
578
+ table_was_in_base = table in tables_to_restore_from_base
579
+
580
+ if table_was_in_base:
581
+ _, table_name = table.split(".", 1)
582
+ target_table_name = f"{table_name}{rename_suffix}"
583
+ incremental_restore_command = _build_partition_restore_command(
584
+ incremental_label,
585
+ repo_name,
586
+ f"{database_name}.{target_table_name}",
587
+ partitions,
588
+ database_name,
589
+ incremental_timestamp,
590
+ rename_suffix=None,
591
+ )
592
+ else:
593
+ incremental_restore_command = _build_partition_restore_command(
594
+ incremental_label,
595
+ repo_name,
596
+ table,
597
+ partitions,
598
+ database_name,
599
+ incremental_timestamp,
600
+ rename_suffix=rename_suffix,
601
+ )
602
+
603
+ incremental_result = execute_restore(
604
+ db,
605
+ incremental_restore_command,
606
+ incremental_label,
607
+ "incremental",
608
+ repo_name,
609
+ database_name,
610
+ scope="restore",
611
+ ops_database=ops_database,
612
+ )
613
+
614
+ if not incremental_result["success"]:
615
+ return {
616
+ "success": False,
617
+ "error_message": f"Incremental restore failed for {table}: {incremental_result['error_message']}",
618
+ }
619
+
620
+ logger.success("Incremental restore completed successfully")
508
621
 
509
622
  logger.info("")
510
623
  logger.info("Step 3: Performing atomic rename...")
@@ -571,6 +684,50 @@ def _build_restore_command_without_rename(
571
684
  PROPERTIES ("backup_timestamp" = "{backup_timestamp}")"""
572
685
 
573
686
 
687
def _build_partition_restore_command(
    backup_label: str,
    repo_name: str,
    table: str,
    partitions: list[str],
    database: str,
    backup_timestamp: str,
    rename_suffix: str | None = None,
) -> str:
    """Assemble a RESTORE SNAPSHOT statement limited to specific partitions.

    Args:
        backup_label: Backup snapshot label
        repo_name: Repository name
        table: Table name in format 'database.table'; only the part after the
            first dot is used, the DATABASE clause comes from ``database``
        partitions: List of partition names to restore
        database: Database name
        backup_timestamp: Backup timestamp
        rename_suffix: Optional suffix for AS clause (e.g., '_restored'); a
            falsy value (None or "") produces no AS clause

    Returns:
        SQL RESTORE command string
    """
    _db_part, source_table = table.split(".", 1)

    quoted_partitions = ", ".join(utils.quote_identifier(name) for name in partitions)

    # The TABLE ... PARTITION (...) part is shared by both variants.
    table_clause = f"TABLE {utils.quote_identifier(source_table)} PARTITION ({quoted_partitions})"
    if rename_suffix:
        # Restore under a suffixed name instead of the original table name.
        alias = f"{source_table}{rename_suffix}"
        table_clause = f"{table_clause} AS {utils.quote_identifier(alias)}"

    return f"""RESTORE SNAPSHOT {utils.quote_identifier(backup_label)}
    FROM {utils.quote_identifier(repo_name)}
    DATABASE {utils.quote_identifier(database)}
    ON ({table_clause})
    PROPERTIES ("backup_timestamp" = "{backup_timestamp}")"""
+
730
+
574
731
  def _generate_timestamped_backup_name(table_name: str) -> str:
575
732
  """Generate a timestamped backup table name.
576
733