starrocks-br 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
starrocks_br/planner.py CHANGED
@@ -1,16 +1,17 @@
1
- from typing import List, Dict, Optional
2
1
  import datetime
2
+ import hashlib
3
+ from typing import Optional
3
4
 
4
- from starrocks_br import logger, timezone
5
+ from starrocks_br import logger, timezone, utils
5
6
 
6
7
 
7
- def find_latest_full_backup(db, database: str) -> Optional[Dict[str, str]]:
8
+ def find_latest_full_backup(db, database: str) -> Optional[dict[str, str]]:
8
9
  """Find the latest successful full backup for a database.
9
-
10
+
10
11
  Args:
11
12
  db: Database connection
12
13
  database: Database name to search for
13
-
14
+
14
15
  Returns:
15
16
  Dictionary with keys: label, backup_type, finished_at, or None if no full backup found.
16
17
  The finished_at value is returned as a string in the cluster timezone format.
@@ -20,125 +21,126 @@ def find_latest_full_backup(db, database: str) -> Optional[Dict[str, str]]:
20
21
  FROM ops.backup_history
21
22
  WHERE backup_type = 'full'
22
23
  AND status = 'FINISHED'
23
- AND label LIKE '{database}_%'
24
+ AND label LIKE {utils.quote_value(f"{database}_%")}
24
25
  ORDER BY finished_at DESC
25
26
  LIMIT 1
26
27
  """
27
-
28
+
28
29
  rows = db.query(query)
29
-
30
+
30
31
  if not rows:
31
32
  return None
32
-
33
+
33
34
  row = rows[0]
34
35
  finished_at = row[2]
35
-
36
+
36
37
  if isinstance(finished_at, datetime.datetime):
37
- cluster_tz = db.timezone
38
- finished_at = finished_at.strftime("%Y-%m-%d %H:%M:%S")
38
+ finished_at_normalized = timezone.normalize_datetime_to_tz(finished_at, db.timezone)
39
+ finished_at = finished_at_normalized.strftime("%Y-%m-%d %H:%M:%S")
39
40
  elif not isinstance(finished_at, str):
40
41
  finished_at = str(finished_at)
41
-
42
- return {
43
- "label": row[0],
44
- "backup_type": row[1],
45
- "finished_at": finished_at
46
- }
47
42
 
43
+ return {"label": row[0], "backup_type": row[1], "finished_at": finished_at}
48
44
 
49
- def find_tables_by_group(db, group_name: str) -> List[Dict[str, str]]:
45
+
46
+ def find_tables_by_group(db, group_name: str) -> list[dict[str, str]]:
50
47
  """Find tables belonging to a specific inventory group.
51
-
48
+
52
49
  Returns list of dictionaries with keys: database, table.
53
50
  Supports '*' table wildcard which signifies all tables in a database.
54
51
  """
55
52
  query = f"""
56
53
  SELECT database_name, table_name
57
54
  FROM ops.table_inventory
58
- WHERE inventory_group = '{group_name}'
55
+ WHERE inventory_group = {utils.quote_value(group_name)}
59
56
  ORDER BY database_name, table_name
60
57
  """
61
58
  rows = db.query(query)
62
- return [
63
- {"database": row[0], "table": row[1]} for row in rows
64
- ]
59
+ return [{"database": row[0], "table": row[1]} for row in rows]
65
60
 
66
61
 
67
- def find_recent_partitions(db, database: str, baseline_backup_label: Optional[str] = None, *, group_name: str) -> List[Dict[str, str]]:
62
+ def find_recent_partitions(
63
+ db, database: str, baseline_backup_label: Optional[str] = None, *, group_name: str
64
+ ) -> list[dict[str, str]]:
68
65
  """Find partitions updated since baseline for tables in the given inventory group.
69
-
66
+
70
67
  Args:
71
68
  db: Database connection
72
69
  database: Database name (StarRocks database scope for backup)
73
70
  baseline_backup_label: Optional specific backup label to use as baseline.
74
71
  group_name: Inventory group whose tables will be considered
75
-
72
+
76
73
  Returns list of dictionaries with keys: database, table, partition_name.
77
74
  Only partitions of tables within the specified database are returned.
78
75
  """
79
76
  cluster_tz = db.timezone
80
-
77
+
81
78
  if baseline_backup_label:
82
79
  baseline_query = f"""
83
80
  SELECT finished_at
84
81
  FROM ops.backup_history
85
- WHERE label = '{baseline_backup_label}'
82
+ WHERE label = {utils.quote_value(baseline_backup_label)}
86
83
  AND status = 'FINISHED'
87
84
  """
88
85
  baseline_rows = db.query(baseline_query)
89
86
  if not baseline_rows:
90
- raise ValueError(f"Baseline backup '{baseline_backup_label}' not found or not successful")
87
+ raise ValueError(
88
+ f"Baseline backup '{baseline_backup_label}' not found or not successful"
89
+ )
91
90
  baseline_time_raw = baseline_rows[0][0]
92
91
  else:
93
92
  latest_backup = find_latest_full_backup(db, database)
94
93
  if not latest_backup:
95
- raise ValueError(f"No successful full backup found for database '{database}'. Run a full database backup first.")
96
- baseline_time_raw = latest_backup['finished_at']
97
-
94
+ raise ValueError(
95
+ f"No successful full backup found for database '{database}'. Run a full database backup first."
96
+ )
97
+ baseline_time_raw = latest_backup["finished_at"]
98
+
98
99
  if isinstance(baseline_time_raw, datetime.datetime):
99
100
  baseline_time_str = baseline_time_raw.strftime("%Y-%m-%d %H:%M:%S")
100
101
  elif isinstance(baseline_time_raw, str):
101
102
  baseline_time_str = baseline_time_raw
102
103
  else:
103
104
  baseline_time_str = str(baseline_time_raw)
104
-
105
+
105
106
  baseline_dt = timezone.parse_datetime_with_tz(baseline_time_str, cluster_tz)
106
-
107
+
107
108
  group_tables = find_tables_by_group(db, group_name)
108
109
 
109
110
  if not group_tables:
110
111
  return []
111
112
 
112
- db_group_tables = [t for t in group_tables if t['database'] == database]
113
+ db_group_tables = [t for t in group_tables if t["database"] == database]
113
114
 
114
115
  if not db_group_tables:
115
116
  return []
116
-
117
+
117
118
  concrete_tables = []
118
119
  for table_entry in db_group_tables:
119
- if table_entry['table'] == '*':
120
- show_tables_query = f"SHOW TABLES FROM {table_entry['database']}"
120
+ if table_entry["table"] == "*":
121
+ show_tables_query = (
122
+ f"SHOW TABLES FROM {utils.quote_identifier(table_entry['database'])}"
123
+ )
121
124
  tables_rows = db.query(show_tables_query)
122
125
  for row in tables_rows:
123
- concrete_tables.append({
124
- 'database': table_entry['database'],
125
- 'table': row[0]
126
- })
126
+ concrete_tables.append({"database": table_entry["database"], "table": row[0]})
127
127
  else:
128
128
  concrete_tables.append(table_entry)
129
-
129
+
130
130
  recent_partitions = []
131
131
  for table_entry in concrete_tables:
132
- db_name = table_entry['database']
133
- table_name = table_entry['table']
134
-
135
- show_partitions_query = f"SHOW PARTITIONS FROM {db_name}.{table_name}"
132
+ db_name = table_entry["database"]
133
+ table_name = table_entry["table"]
134
+
135
+ show_partitions_query = (
136
+ f"SHOW PARTITIONS FROM {utils.build_qualified_table_name(db_name, table_name)}"
137
+ )
136
138
  try:
137
139
  partition_rows = db.query(show_partitions_query)
138
140
  except Exception as e:
139
141
  logger.error(f"Error showing partitions for table {db_name}.{table_name}: {e}")
140
142
  continue
141
-
143
+
142
144
  for row in partition_rows:
143
145
  # FOR SHARED NOTHING CLUSTER:
144
146
  # PartitionId, PartitionName, VisibleVersion, VisibleVersionTime, VisibleVersionHash, State, PartitionKey, Range, DistributionKey, Buckets, ReplicationNum, StorageMedium, CooldownTime, LastConsistencyCheckTime, DataSize, StorageSize, IsInMemory, RowCount, DataVersion, VersionEpoch, VersionTxnType
@@ -151,88 +153,92 @@ def find_recent_partitions(db, database: str, baseline_backup_label: Optional[st
151
153
  visible_version_time_str = visible_version_time
152
154
  else:
153
155
  visible_version_time_str = str(visible_version_time)
154
-
155
- visible_version_dt = timezone.parse_datetime_with_tz(visible_version_time_str, cluster_tz)
156
-
156
+
157
+ visible_version_dt = timezone.parse_datetime_with_tz(
158
+ visible_version_time_str, cluster_tz
159
+ )
160
+
157
161
  if visible_version_dt > baseline_dt:
158
- recent_partitions.append({
159
- 'database': db_name,
160
- 'table': table_name,
161
- 'partition_name': partition_name
162
- })
163
-
162
+ recent_partitions.append(
163
+ {"database": db_name, "table": table_name, "partition_name": partition_name}
164
+ )
165
+
164
166
  return recent_partitions
165
167
 
166
168
 
167
- def build_incremental_backup_command(partitions: List[Dict[str, str]], repository: str, label: str, database: str) -> str:
169
+ def build_incremental_backup_command(
170
+ partitions: list[dict[str, str]], repository: str, label: str, database: str
171
+ ) -> str:
168
172
  """Build BACKUP command for incremental backup of specific partitions.
169
-
173
+
170
174
  Args:
171
175
  partitions: List of partitions to backup
172
176
  repository: Repository name
173
177
  label: Backup label
174
178
  database: Database name (StarRocks requires BACKUP to be database-specific)
175
-
179
+
176
180
  Note: Filters partitions to only include those from the specified database.
177
181
  """
178
182
  if not partitions:
179
183
  return ""
180
-
181
- db_partitions = [p for p in partitions if p['database'] == database]
182
-
184
+
185
+ db_partitions = [p for p in partitions if p["database"] == database]
186
+
183
187
  if not db_partitions:
184
188
  return ""
185
-
189
+
186
190
  table_partitions = {}
187
191
  for partition in db_partitions:
188
- table_name = partition['table']
192
+ table_name = partition["table"]
189
193
  if table_name not in table_partitions:
190
194
  table_partitions[table_name] = []
191
- table_partitions[table_name].append(partition['partition_name'])
192
-
195
+ table_partitions[table_name].append(partition["partition_name"])
196
+
193
197
  on_clauses = []
194
198
  for table, parts in table_partitions.items():
195
- partitions_str = ", ".join(parts)
196
- on_clauses.append(f"TABLE {table} PARTITION ({partitions_str})")
197
-
199
+ partitions_str = ", ".join(utils.quote_identifier(p) for p in parts)
200
+ on_clauses.append(f"TABLE {utils.quote_identifier(table)} PARTITION ({partitions_str})")
201
+
198
202
  on_clause = ",\n ".join(on_clauses)
199
-
200
- command = f"""BACKUP DATABASE {database} SNAPSHOT {label}
201
- TO {repository}
203
+
204
+ command = f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
205
+ TO {utils.quote_identifier(repository)}
202
206
  ON ({on_clause})"""
203
-
207
+
204
208
  return command
205
209
 
206
210
 
207
- def build_full_backup_command(db, group_name: str, repository: str, label: str, database: str) -> str:
211
+ def build_full_backup_command(
212
+ db, group_name: str, repository: str, label: str, database: str
213
+ ) -> str:
208
214
  """Build BACKUP command for an inventory group.
209
-
215
+
210
216
  If the group contains '*' for any entry in the target database, generate a
211
217
  simple BACKUP DATABASE command. Otherwise, generate ON (TABLE ...) list for
212
218
  the specific tables within the database.
213
219
  """
214
220
  tables = find_tables_by_group(db, group_name)
215
221
 
216
- db_entries = [t for t in tables if t['database'] == database]
222
+ db_entries = [t for t in tables if t["database"] == database]
217
223
  if not db_entries:
218
224
  return ""
219
225
 
220
- if any(t['table'] == '*' for t in db_entries):
221
- return f"""BACKUP DATABASE {database} SNAPSHOT {label}
222
- TO {repository}"""
226
+ if any(t["table"] == "*" for t in db_entries):
227
+ return f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
228
+ TO {utils.quote_identifier(repository)}"""
223
229
 
224
230
  on_clauses = []
225
231
  for t in db_entries:
226
- on_clauses.append(f"TABLE {t['table']}")
232
+ on_clauses.append(f"TABLE {utils.quote_identifier(t['table'])}")
227
233
  on_clause = ",\n ".join(on_clauses)
228
- return f"""BACKUP DATABASE {database} SNAPSHOT {label}
229
- TO {repository}
234
+ return f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
235
+ TO {utils.quote_identifier(repository)}
230
236
  ON ({on_clause})"""
231
237
 
232
238
 
233
- def record_backup_partitions(db, label: str, partitions: List[Dict[str, str]]) -> None:
239
+ def record_backup_partitions(db, label: str, partitions: list[dict[str, str]]) -> None:
234
240
  """Record partition metadata for a backup in ops.backup_partitions table.
235
-
241
+
236
242
  Args:
237
243
  db: Database connection
238
244
  label: Backup label
@@ -240,61 +246,61 @@ def record_backup_partitions(db, label: str, partitions: List[Dict[str, str]]) -
240
246
  """
241
247
  if not partitions:
242
248
  return
243
-
249
+
244
250
  for partition in partitions:
251
+ composite_key = (
252
+ f"{label}|{partition['database']}|{partition['table']}|{partition['partition_name']}"
253
+ )
254
+ key_hash = hashlib.md5(composite_key.encode("utf-8")).hexdigest()
255
+
245
256
  db.execute(f"""
246
- INSERT INTO ops.backup_partitions
247
- (label, database_name, table_name, partition_name)
248
- VALUES ('{label}', '{partition['database']}', '{partition['table']}', '{partition['partition_name']}')
257
+ INSERT INTO ops.backup_partitions
258
+ (key_hash, label, database_name, table_name, partition_name)
259
+ VALUES ({utils.quote_value(key_hash)}, {utils.quote_value(label)}, {utils.quote_value(partition["database"])}, {utils.quote_value(partition["table"])}, {utils.quote_value(partition["partition_name"])})
249
260
  """)
250
261
 
251
262
 
252
- def get_all_partitions_for_tables(db, database: str, tables: List[Dict[str, str]]) -> List[Dict[str, str]]:
263
+ def get_all_partitions_for_tables(
264
+ db, database: str, tables: list[dict[str, str]]
265
+ ) -> list[dict[str, str]]:
253
266
  """Get all existing partitions for the specified tables.
254
-
267
+
255
268
  Args:
256
269
  db: Database connection
257
270
  database: Database name
258
271
  tables: List of tables with keys: database, table
259
-
272
+
260
273
  Returns:
261
274
  List of partitions with keys: database, table, partition_name
262
275
  """
263
276
  if not tables:
264
277
  return []
265
-
266
- db_tables = [t for t in tables if t['database'] == database]
278
+
279
+ db_tables = [t for t in tables if t["database"] == database]
267
280
  if not db_tables:
268
281
  return []
269
-
270
- where_conditions = [f"DB_NAME = '{database}'", "PARTITION_NAME IS NOT NULL"]
271
-
282
+
283
+ where_conditions = [f"DB_NAME = {utils.quote_value(database)}", "PARTITION_NAME IS NOT NULL"]
284
+
272
285
  table_conditions = []
273
286
  for table in db_tables:
274
- if table['table'] == '*':
287
+ if table["table"] == "*":
275
288
  pass
276
289
  else:
277
- table_conditions.append(f"TABLE_NAME = '{table['table']}'")
278
-
290
+ table_conditions.append(f"TABLE_NAME = {utils.quote_value(table['table'])}")
291
+
279
292
  if table_conditions:
280
293
  where_conditions.append("(" + " OR ".join(table_conditions) + ")")
281
-
294
+
282
295
  where_clause = " AND ".join(where_conditions)
283
-
296
+
284
297
  query = f"""
285
298
  SELECT DB_NAME, TABLE_NAME, PARTITION_NAME
286
299
  FROM information_schema.partitions_meta
287
300
  WHERE {where_clause}
288
301
  ORDER BY TABLE_NAME, PARTITION_NAME
289
302
  """
290
-
303
+
291
304
  rows = db.query(query)
292
-
293
- return [
294
- {
295
- "database": row[0],
296
- "table": row[1],
297
- "partition_name": row[2]
298
- }
299
- for row in rows
300
- ]
305
+
306
+ return [{"database": row[0], "table": row[1], "partition_name": row[2]} for row in rows]
@@ -3,11 +3,11 @@ from __future__ import annotations
3
3
 
4
4
  def ensure_repository(db, name: str) -> None:
5
5
  """Verify that the specified repository exists and is accessible.
6
-
6
+
7
7
  Args:
8
8
  db: Database connection
9
9
  name: Repository name to verify
10
-
10
+
11
11
  Raises:
12
12
  RuntimeError: If repository doesn't exist or has errors
13
13
  """
@@ -18,7 +18,7 @@ def ensure_repository(db, name: str) -> None:
18
18
  f" CREATE REPOSITORY {name} WITH BROKER ON LOCATION '...' PROPERTIES(...)\n"
19
19
  f"For examples, see: https://docs.starrocks.io/docs/sql-reference/sql-statements/data-definition/backup_restore/CREATE_REPOSITORY/"
20
20
  )
21
-
21
+
22
22
  # SHOW REPOSITORIES returns: RepoId, RepoName, CreateTime, IsReadOnly, Location, Broker, ErrMsg
23
23
  err_msg = existing[6]
24
24
  if err_msg and str(err_msg).strip().upper() not in {"", "NULL", "NONE"}:
@@ -32,5 +32,3 @@ def _find_repository(db, name: str):
32
32
  if row and row[1] == name:
33
33
  return row
34
34
  return None
35
-
36
-