starrocks-br 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- starrocks_br/cli.py +257 -193
- starrocks_br/concurrency.py +50 -50
- starrocks_br/config.py +31 -23
- starrocks_br/db.py +37 -37
- starrocks_br/executor.py +100 -71
- starrocks_br/health.py +1 -6
- starrocks_br/history.py +5 -6
- starrocks_br/labels.py +14 -10
- starrocks_br/planner.py +119 -113
- starrocks_br/repository.py +3 -5
- starrocks_br/restore.py +240 -187
- starrocks_br/schema.py +20 -16
- starrocks_br/timezone.py +28 -29
- starrocks_br/utils.py +86 -0
- starrocks_br-0.4.0.dist-info/METADATA +152 -0
- starrocks_br-0.4.0.dist-info/RECORD +21 -0
- starrocks_br-0.2.0.dist-info/METADATA +0 -12
- starrocks_br-0.2.0.dist-info/RECORD +0 -20
- {starrocks_br-0.2.0.dist-info → starrocks_br-0.4.0.dist-info}/WHEEL +0 -0
- {starrocks_br-0.2.0.dist-info → starrocks_br-0.4.0.dist-info}/entry_points.txt +0 -0
- {starrocks_br-0.2.0.dist-info → starrocks_br-0.4.0.dist-info}/top_level.txt +0 -0
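The planner.py diff below repeatedly calls three helpers from the new starrocks_br/utils.py module (+86 lines), whose contents are not included in this section. As a rough sketch only — the function names are taken from the call sites in planner.py, and the bodies here are assumptions rather than the published code — the helpers presumably behave along these lines:

def quote_value(value: str) -> str:
    # Assumed: render a value as a single-quoted SQL string literal,
    # doubling embedded single quotes so labels and names cannot break the statement.
    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"

def quote_identifier(name: str) -> str:
    # Assumed: wrap a database, table, partition, or repository name in backticks,
    # escaping embedded backticks, for use in BACKUP / SHOW statements.
    escaped = str(name).replace("`", "``")
    return f"`{escaped}`"

def build_qualified_table_name(database: str, table: str) -> str:
    # Assumed: produce `db`.`table` for statements such as SHOW PARTITIONS FROM ...
    return f"{quote_identifier(database)}.{quote_identifier(table)}"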
starrocks_br/planner.py
CHANGED
@@ -1,16 +1,17 @@
-from typing import List, Dict, Optional
 import datetime
+import hashlib
+from typing import Optional
 
-from starrocks_br import logger, timezone
+from starrocks_br import logger, timezone, utils
 
 
-def find_latest_full_backup(db, database: str) -> Optional[Dict[str, str]]:
+def find_latest_full_backup(db, database: str) -> Optional[dict[str, str]]:
     """Find the latest successful full backup for a database.
-
+
     Args:
         db: Database connection
        database: Database name to search for
-
+
     Returns:
        Dictionary with keys: label, backup_type, finished_at, or None if no full backup found.
        The finished_at value is returned as a string in the cluster timezone format.
@@ -20,125 +21,126 @@ def find_latest_full_backup(db, database: str) -> Optional[Dict[str, str]]:
         FROM ops.backup_history
         WHERE backup_type = 'full'
         AND status = 'FINISHED'
-        AND label LIKE
+        AND label LIKE {utils.quote_value(f"{database}_%")}
         ORDER BY finished_at DESC
         LIMIT 1
     """
-
+
     rows = db.query(query)
-
+
     if not rows:
         return None
-
+
     row = rows[0]
     finished_at = row[2]
-
+
     if isinstance(finished_at, datetime.datetime):
-
-        finished_at =
+        finished_at_normalized = timezone.normalize_datetime_to_tz(finished_at, db.timezone)
+        finished_at = finished_at_normalized.strftime("%Y-%m-%d %H:%M:%S")
     elif not isinstance(finished_at, str):
         finished_at = str(finished_at)
-
-    return {
-        "label": row[0],
-        "backup_type": row[1],
-        "finished_at": finished_at
-    }
 
+    return {"label": row[0], "backup_type": row[1], "finished_at": finished_at}
 
-
+
+def find_tables_by_group(db, group_name: str) -> list[dict[str, str]]:
     """Find tables belonging to a specific inventory group.
-
+
     Returns list of dictionaries with keys: database, table.
     Supports '*' table wildcard which signifies all tables in a database.
     """
     query = f"""
         SELECT database_name, table_name
         FROM ops.table_inventory
-        WHERE inventory_group =
+        WHERE inventory_group = {utils.quote_value(group_name)}
         ORDER BY database_name, table_name
     """
     rows = db.query(query)
-    return [
-        {"database": row[0], "table": row[1]} for row in rows
-    ]
+    return [{"database": row[0], "table": row[1]} for row in rows]
 
 
-def find_recent_partitions(
+def find_recent_partitions(
+    db, database: str, baseline_backup_label: Optional[str] = None, *, group_name: str
+) -> list[dict[str, str]]:
     """Find partitions updated since baseline for tables in the given inventory group.
-
+
     Args:
         db: Database connection
         database: Database name (StarRocks database scope for backup)
         baseline_backup_label: Optional specific backup label to use as baseline.
         group_name: Inventory group whose tables will be considered
-
+
     Returns list of dictionaries with keys: database, table, partition_name.
     Only partitions of tables within the specified database are returned.
     """
     cluster_tz = db.timezone
-
+
     if baseline_backup_label:
         baseline_query = f"""
             SELECT finished_at
             FROM ops.backup_history
-            WHERE label =
+            WHERE label = {utils.quote_value(baseline_backup_label)}
            AND status = 'FINISHED'
         """
         baseline_rows = db.query(baseline_query)
         if not baseline_rows:
-            raise ValueError(
+            raise ValueError(
+                f"Baseline backup '{baseline_backup_label}' not found or not successful"
+            )
         baseline_time_raw = baseline_rows[0][0]
     else:
         latest_backup = find_latest_full_backup(db, database)
         if not latest_backup:
-            raise ValueError(
-
-
+            raise ValueError(
+                f"No successful full backup found for database '{database}'. Run a full database backup first."
+            )
+        baseline_time_raw = latest_backup["finished_at"]
+
     if isinstance(baseline_time_raw, datetime.datetime):
         baseline_time_str = baseline_time_raw.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(baseline_time_raw, str):
         baseline_time_str = baseline_time_raw
     else:
         baseline_time_str = str(baseline_time_raw)
-
+
     baseline_dt = timezone.parse_datetime_with_tz(baseline_time_str, cluster_tz)
-
+
     group_tables = find_tables_by_group(db, group_name)
 
     if not group_tables:
         return []
 
-    db_group_tables = [t for t in group_tables if t[
+    db_group_tables = [t for t in group_tables if t["database"] == database]
 
     if not db_group_tables:
         return []
-
+
     concrete_tables = []
     for table_entry in db_group_tables:
-        if table_entry[
-            show_tables_query =
+        if table_entry["table"] == "*":
+            show_tables_query = (
+                f"SHOW TABLES FROM {utils.quote_identifier(table_entry['database'])}"
+            )
             tables_rows = db.query(show_tables_query)
             for row in tables_rows:
-                concrete_tables.append({
-                    'database': table_entry['database'],
-                    'table': row[0]
-                })
+                concrete_tables.append({"database": table_entry["database"], "table": row[0]})
         else:
             concrete_tables.append(table_entry)
-
+
     recent_partitions = []
     for table_entry in concrete_tables:
-        db_name = table_entry[
-        table_name = table_entry[
-
-        show_partitions_query =
+        db_name = table_entry["database"]
+        table_name = table_entry["table"]
+
+        show_partitions_query = (
+            f"SHOW PARTITIONS FROM {utils.build_qualified_table_name(db_name, table_name)}"
+        )
         try:
             partition_rows = db.query(show_partitions_query)
         except Exception as e:
             logger.error(f"Error showing partitions for table {db_name}.{table_name}: {e}")
             continue
-
+
         for row in partition_rows:
             # FOR SHARED NOTHING CLUSTER:
             # PartitionId, PartitionName, VisibleVersion, VisibleVersionTime, VisibleVersionHash, State, PartitionKey, Range, DistributionKey, Buckets, ReplicationNum, StorageMedium, CooldownTime, LastConsistencyCheckTime, DataSize, StorageSize, IsInMemory, RowCount, DataVersion, VersionEpoch, VersionTxnType
@@ -151,88 +153,92 @@ def find_recent_partitions(db, database: str, baseline_backup_label: Optional[st
                 visible_version_time_str = visible_version_time
             else:
                 visible_version_time_str = str(visible_version_time)
-
-            visible_version_dt = timezone.parse_datetime_with_tz(
-
+
+            visible_version_dt = timezone.parse_datetime_with_tz(
+                visible_version_time_str, cluster_tz
+            )
+
             if visible_version_dt > baseline_dt:
-                recent_partitions.append(
-
-
-
-                })
-
+                recent_partitions.append(
+                    {"database": db_name, "table": table_name, "partition_name": partition_name}
+                )
+
     return recent_partitions
 
 
-def build_incremental_backup_command(
+def build_incremental_backup_command(
+    partitions: list[dict[str, str]], repository: str, label: str, database: str
+) -> str:
     """Build BACKUP command for incremental backup of specific partitions.
-
+
     Args:
         partitions: List of partitions to backup
         repository: Repository name
         label: Backup label
         database: Database name (StarRocks requires BACKUP to be database-specific)
-
+
     Note: Filters partitions to only include those from the specified database.
     """
     if not partitions:
         return ""
-
-    db_partitions = [p for p in partitions if p[
-
+
+    db_partitions = [p for p in partitions if p["database"] == database]
+
     if not db_partitions:
         return ""
-
+
     table_partitions = {}
     for partition in db_partitions:
-        table_name = partition[
+        table_name = partition["table"]
         if table_name not in table_partitions:
             table_partitions[table_name] = []
-        table_partitions[table_name].append(partition[
-
+        table_partitions[table_name].append(partition["partition_name"])
+
     on_clauses = []
     for table, parts in table_partitions.items():
-        partitions_str = ", ".join(parts)
-        on_clauses.append(f"TABLE {table} PARTITION ({partitions_str})")
-
+        partitions_str = ", ".join(utils.quote_identifier(p) for p in parts)
+        on_clauses.append(f"TABLE {utils.quote_identifier(table)} PARTITION ({partitions_str})")
+
     on_clause = ",\n ".join(on_clauses)
-
-    command = f"""BACKUP DATABASE {database} SNAPSHOT {label}
-    TO {repository}
+
+    command = f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
+    TO {utils.quote_identifier(repository)}
     ON ({on_clause})"""
-
+
     return command
 
 
-def build_full_backup_command(
+def build_full_backup_command(
+    db, group_name: str, repository: str, label: str, database: str
+) -> str:
     """Build BACKUP command for an inventory group.
-
+
     If the group contains '*' for any entry in the target database, generate a
     simple BACKUP DATABASE command. Otherwise, generate ON (TABLE ...) list for
     the specific tables within the database.
     """
     tables = find_tables_by_group(db, group_name)
 
-    db_entries = [t for t in tables if t[
+    db_entries = [t for t in tables if t["database"] == database]
     if not db_entries:
         return ""
 
-    if any(t[
-        return f"""BACKUP DATABASE {database} SNAPSHOT {label}
-        TO {repository}"""
+    if any(t["table"] == "*" for t in db_entries):
+        return f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
+        TO {utils.quote_identifier(repository)}"""
 
     on_clauses = []
     for t in db_entries:
-        on_clauses.append(f"TABLE {t['table']}")
+        on_clauses.append(f"TABLE {utils.quote_identifier(t['table'])}")
     on_clause = ",\n ".join(on_clauses)
-    return f"""BACKUP DATABASE {database} SNAPSHOT {label}
-    TO {repository}
+    return f"""BACKUP DATABASE {utils.quote_identifier(database)} SNAPSHOT {utils.quote_identifier(label)}
+    TO {utils.quote_identifier(repository)}
     ON ({on_clause})"""
 
 
-def record_backup_partitions(db, label: str, partitions:
+def record_backup_partitions(db, label: str, partitions: list[dict[str, str]]) -> None:
     """Record partition metadata for a backup in ops.backup_partitions table.
-
+
     Args:
         db: Database connection
         label: Backup label
@@ -240,61 +246,61 @@ def record_backup_partitions(db, label: str, partitions: List[Dict[str, str]]) -
     """
     if not partitions:
         return
-
+
     for partition in partitions:
+        composite_key = (
+            f"{label}|{partition['database']}|{partition['table']}|{partition['partition_name']}"
+        )
+        key_hash = hashlib.md5(composite_key.encode("utf-8")).hexdigest()
+
         db.execute(f"""
-            INSERT INTO ops.backup_partitions
-            (label, database_name, table_name, partition_name)
-            VALUES (
+            INSERT INTO ops.backup_partitions
+            (key_hash, label, database_name, table_name, partition_name)
+            VALUES ({utils.quote_value(key_hash)}, {utils.quote_value(label)}, {utils.quote_value(partition["database"])}, {utils.quote_value(partition["table"])}, {utils.quote_value(partition["partition_name"])})
         """)
 
 
-def get_all_partitions_for_tables(
+def get_all_partitions_for_tables(
+    db, database: str, tables: list[dict[str, str]]
+) -> list[dict[str, str]]:
     """Get all existing partitions for the specified tables.
-
+
     Args:
         db: Database connection
         database: Database name
         tables: List of tables with keys: database, table
-
+
     Returns:
         List of partitions with keys: database, table, partition_name
     """
     if not tables:
         return []
-
-    db_tables = [t for t in tables if t[
+
+    db_tables = [t for t in tables if t["database"] == database]
     if not db_tables:
         return []
-
-    where_conditions = [f"DB_NAME =
-
+
+    where_conditions = [f"DB_NAME = {utils.quote_value(database)}", "PARTITION_NAME IS NOT NULL"]
+
     table_conditions = []
     for table in db_tables:
-        if table[
+        if table["table"] == "*":
             pass
         else:
-            table_conditions.append(f"TABLE_NAME =
-
+            table_conditions.append(f"TABLE_NAME = {utils.quote_value(table['table'])}")
+
     if table_conditions:
         where_conditions.append("(" + " OR ".join(table_conditions) + ")")
-
+
     where_clause = " AND ".join(where_conditions)
-
+
     query = f"""
         SELECT DB_NAME, TABLE_NAME, PARTITION_NAME
         FROM information_schema.partitions_meta
         WHERE {where_clause}
         ORDER BY TABLE_NAME, PARTITION_NAME
     """
-
+
     rows = db.query(query)
-
-    return [
-        {
-            "database": row[0],
-            "table": row[1],
-            "partition_name": row[2]
-        }
-        for row in rows
-    ]
+
+    return [{"database": row[0], "table": row[1], "partition_name": row[2]} for row in rows]
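One behavioral change worth noting from the planner.py hunks above: record_backup_partitions now writes a key_hash column into ops.backup_partitions, computed as the MD5 hex digest of "label|database|table|partition_name". The digest is deterministic for a given partition and label, presumably so repeated runs produce a stable key rather than duplicate rows (the accompanying schema change is outside this section). A small standalone snippet reproducing the value — the example arguments are purely illustrative, not taken from the package:

import hashlib

def backup_partition_key_hash(label: str, database: str, table: str, partition_name: str) -> str:
    # Mirrors the composite key built in planner.record_backup_partitions (0.4.0).
    composite_key = f"{label}|{database}|{table}|{partition_name}"
    return hashlib.md5(composite_key.encode("utf-8")).hexdigest()

# Hypothetical example values:
print(backup_partition_key_hash("sales_full_20240101", "sales", "orders", "p20240101"))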
starrocks_br/repository.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
 
 def ensure_repository(db, name: str) -> None:
     """Verify that the specified repository exists and is accessible.
-
+
     Args:
         db: Database connection
        name: Repository name to verify
-
+
     Raises:
        RuntimeError: If repository doesn't exist or has errors
    """
@@ -18,7 +18,7 @@ def ensure_repository(db, name: str) -> None:
             f" CREATE REPOSITORY {name} WITH BROKER ON LOCATION '...' PROPERTIES(...)\n"
             f"For examples, see: https://docs.starrocks.io/docs/sql-reference/sql-statements/data-definition/backup_restore/CREATE_REPOSITORY/"
         )
-
+
     # SHOW REPOSITORIES returns: RepoId, RepoName, CreateTime, IsReadOnly, Location, Broker, ErrMsg
     err_msg = existing[6]
     if err_msg and str(err_msg).strip().upper() not in {"", "NULL", "NONE"}:
@@ -32,5 +32,3 @@ def _find_repository(db, name: str):
         if row and row[1] == name:
             return row
     return None
-
-