@clickzetta/cz-cli-darwin-x64 0.3.18 → 0.3.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +160 -0
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +155 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +259 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
- package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
- package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
- package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +483 -0
- package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
- package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
- package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
- package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +199 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-overview/SKILL.md +102 -0
- package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
- package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
- package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/cz-cli/SKILL.md +1 -1
- package/bin/skills/cz-cli-inner/SKILL.md +8 -0
- package/package.json +1 -1
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-ai-vector-search
|
|
3
|
+
description: |
|
|
4
|
+
在 ClickZetta Lakehouse 中实现向量存储、向量索引(HNSW)和向量检索,
|
|
5
|
+
构建 RAG、语义搜索、图像检索等 AI 应用。覆盖 VECTOR 数据类型定义、
|
|
6
|
+
向量索引创建(cosine/l2/hamming 距离)、向量数据插入与转换、
|
|
7
|
+
ANN 近似最近邻检索、向量+倒排索引融合检索等完整工作流。
|
|
8
|
+
当用户说"向量检索"、"向量索引"、"语义搜索"、"embedding 存储"、
|
|
9
|
+
"RAG"、"ANN 搜索"、"HNSW"、"cosine_distance"、"l2_distance"、
|
|
10
|
+
"VECTOR 类型"、"向量数据库"、"相似度搜索"、"向量 + 标量融合检索"、
|
|
11
|
+
"文本向量化"时触发。
|
|
12
|
+
Keywords: vector, HNSW, embedding, RAG, semantic search, similarity, VECTOR type
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# ClickZetta 向量检索
|
|
16
|
+
|
|
17
|
+
Lakehouse 原生支持 VECTOR 数据类型和 HNSW 向量索引,无需独立向量数据库即可在同一张表中实现向量检索、全文检索和标量过滤的融合查询。
|
|
18
|
+
|
|
19
|
+
阅读 [references/vector-search.md](references/vector-search.md) 了解完整语法。
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 快速开始
|
|
24
|
+
|
|
25
|
+
### 1. 建表(含向量索引)
|
|
26
|
+
|
|
27
|
+
```sql
|
|
28
|
+
CREATE TABLE doc_embeddings (
|
|
29
|
+
id INT,
|
|
30
|
+
content STRING,
|
|
31
|
+
vec VECTOR(FLOAT, 1024),
|
|
32
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
33
|
+
"distance.function" = "cosine_distance",
|
|
34
|
+
"scalar.type" = "f32"
|
|
35
|
+
)
|
|
36
|
+
);
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### 2. 插入向量数据
|
|
40
|
+
|
|
41
|
+
```sql
|
|
42
|
+
-- 直接插入
|
|
43
|
+
INSERT INTO doc_embeddings VALUES
|
|
44
|
+
(1, '云器 Lakehouse 产品介绍', vector(0.12, 0.34, ...));
|
|
45
|
+
|
|
46
|
+
-- 从字符串转换(适合 API 返回的 JSON 格式)
|
|
47
|
+
INSERT INTO doc_embeddings (id, content, vec)
|
|
48
|
+
SELECT id, content, CAST(embedding_str AS VECTOR(1024))
|
|
49
|
+
FROM staging_table;
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 3. 向量检索
|
|
53
|
+
|
|
54
|
+
```sql
|
|
55
|
+
-- 设置探索因子(精度 vs 速度)
|
|
56
|
+
SET cz.vector.index.search.ef = 64;
|
|
57
|
+
|
|
58
|
+
-- 余弦距离 Top-10 相似文档
|
|
59
|
+
SELECT id, content, cosine_distance(vec, CAST('[0.12, 0.34, ...]' AS VECTOR(1024))) AS dist
|
|
60
|
+
FROM doc_embeddings
|
|
61
|
+
ORDER BY dist
|
|
62
|
+
LIMIT 10;
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 向量 + 标量融合检索(RAG 场景)
|
|
68
|
+
|
|
69
|
+
```sql
|
|
70
|
+
-- 先用标量过滤缩小范围,再用向量排序(假设表中另有 category、created_at 列;:query_embedding 为查询向量参数)
|
|
71
|
+
SELECT id, content, cosine_distance(vec, :query_embedding) AS dist
|
|
72
|
+
FROM doc_embeddings
|
|
73
|
+
WHERE category = 'product'
|
|
74
|
+
AND created_at >= '2024-01-01'
|
|
75
|
+
ORDER BY dist
|
|
76
|
+
LIMIT 5;
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## 向量 + 全文检索融合
|
|
82
|
+
|
|
83
|
+
```sql
|
|
84
|
+
-- 建表:同时支持向量索引和倒排索引
|
|
85
|
+
CREATE TABLE hybrid_docs (
|
|
86
|
+
id INT,
|
|
87
|
+
title STRING,
|
|
88
|
+
body STRING,
|
|
89
|
+
vec VECTOR(FLOAT, 1024),
|
|
90
|
+
INDEX body_inv_idx (body) USING INVERTED,
|
|
91
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
92
|
+
"distance.function" = "cosine_distance"
|
|
93
|
+
)
|
|
94
|
+
);
|
|
95
|
+
|
|
96
|
+
-- 融合检索:关键词过滤 + 向量排序(注意:带前导通配符的 LIKE 可能无法命中倒排索引;如需利用倒排索引,请参考倒排索引文档使用全文匹配函数,如 match_any / match_phrase)
|
|
97
|
+
SELECT id, title, cosine_distance(vec, :query_vec) AS dist
|
|
98
|
+
FROM hybrid_docs
|
|
99
|
+
WHERE body LIKE '%向量检索%'
|
|
100
|
+
ORDER BY dist
|
|
101
|
+
LIMIT 10;
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## 外部系统写入向量(ARRAY → VECTOR 转换)
|
|
107
|
+
|
|
108
|
+
外部系统(Python SDK、Kafka 等)不能直接写 VECTOR 类型,需先写 ARRAY 再转换:
|
|
109
|
+
|
|
110
|
+
```sql
|
|
111
|
+
-- 暂存表(ARRAY 类型)
|
|
112
|
+
CREATE TABLE staging (id INT, vec_array ARRAY<FLOAT>);
|
|
113
|
+
|
|
114
|
+
-- 转换写入目标表
|
|
115
|
+
INSERT INTO doc_embeddings (id, vec)
|
|
116
|
+
SELECT id, CAST(vec_array AS VECTOR(FLOAT, 1024))
|
|
117
|
+
FROM staging;
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## 距离函数速查
|
|
123
|
+
|
|
124
|
+
| 函数 | 适用场景 |
|
|
125
|
+
|---|---|
|
|
126
|
+
| `cosine_distance(v1, v2)` | 文本语义检索(最常用) |
|
|
127
|
+
| `l2_distance(v1, v2)` | 图像/通用向量检索 |
|
|
128
|
+
| `dot_product(v1, v2)` | 归一化向量的相似度 |
|
|
129
|
+
| `hamming_distance(v1, v2)` | 二值向量(高效压缩) |
|
|
130
|
+
| `binary_quantize(v)` | 将 float 向量压缩为二值向量 |
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## 性能调优
|
|
135
|
+
|
|
136
|
+
```sql
|
|
137
|
+
-- 调整探索因子(默认 64,越大精度越高但越慢)
|
|
138
|
+
SET cz.vector.index.search.ef = 128;
|
|
139
|
+
|
|
140
|
+
-- 验证向量索引是否生效
|
|
141
|
+
EXPLAIN SELECT id, cosine_distance(vec, CAST('[0.1, 0.2, ...]' AS VECTOR(1024))) AS dist
|
|
142
|
+
FROM doc_embeddings ORDER BY dist LIMIT 10;
|
|
143
|
+
-- 查看执行计划中是否有 vector_index_search_type 字样
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**最佳实践:**
|
|
147
|
+
- 向量检索建议**单独占用 VCluster**,避免与其他查询争抢缓存
|
|
148
|
+
- 大批量写入后执行 `BUILD INDEX vec_idx ON table_name` 为存量数据构建索引
|
|
149
|
+
- 外部系统写入时先写 ARRAY,再批量 CAST 转换,避免频繁小文件
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## 常见问题
|
|
154
|
+
|
|
155
|
+
| 问题 | 原因 | 解决方案 |
|
|
156
|
+
|---|---|---|
|
|
157
|
+
| 向量索引未生效 | 存量数据未构建索引 | 执行 `BUILD INDEX vec_idx ON table_name` |
|
|
158
|
+
| 检索精度低 | ef 值太小 | 增大 `cz.vector.index.search.ef` |
|
|
159
|
+
| 外部写入报错 | 不支持直接写 VECTOR | 先写 ARRAY,再 CAST 转换 |
|
|
160
|
+
| 向量检索慢 | 与其他查询共用 VCluster | 为向量检索单独分配 VCluster |
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么在 ClickZetta 里创建 VECTOR 类型的列存储 embedding?","expected_skill":"clickzetta-ai-vector-search","expected_output_contains":["VECTOR","FLOAT"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"HNSW 向量索引怎么创建?支持哪些距离函数?","expected_skill":"clickzetta-ai-vector-search","expected_output_contains":["HNSW","cosine"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"怎么做 ANN 近似最近邻检索?SQL 怎么写?","expected_skill":"clickzetta-ai-vector-search","expected_output_contains":["ANN","distance"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"向量检索和倒排索引能融合查询吗?怎么写?","expected_skill":"clickzetta-ai-vector-search","expected_output_contains":["向量","检索"]}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# 向量检索参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/vector-search 等
|
|
4
|
+
|
|
5
|
+
## VECTOR 数据类型
|
|
6
|
+
|
|
7
|
+
```sql
|
|
8
|
+
-- 语法
|
|
9
|
+
vector(scalar_type, dimension)
|
|
10
|
+
vector(dimension) -- 默认 float 类型
|
|
11
|
+
|
|
12
|
+
-- 示例
|
|
13
|
+
CREATE TABLE embeddings (
|
|
14
|
+
id INT,
|
|
15
|
+
content STRING,
|
|
16
|
+
vec VECTOR(FLOAT, 1024), -- 1024 维 float 向量
|
|
17
|
+
vec_bin VECTOR(TINYINT, 128) -- 128 维 tinyint 向量(二值化)
|
|
18
|
+
);
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
支持的元素类型:`FLOAT`(f32)、`TINYINT`(i8/b1)
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 创建向量索引
|
|
26
|
+
|
|
27
|
+
```sql
|
|
28
|
+
-- 建表时内联创建
|
|
29
|
+
CREATE TABLE doc_embeddings (
|
|
30
|
+
id INT,
|
|
31
|
+
content STRING,
|
|
32
|
+
vec VECTOR(FLOAT, 1024),
|
|
33
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
34
|
+
"distance.function" = "cosine_distance",
|
|
35
|
+
"scalar.type" = "f32",
|
|
36
|
+
"m" = "16",
|
|
37
|
+
"ef.construction" = "128"
|
|
38
|
+
)
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
-- 在已有表上添加向量索引
|
|
42
|
+
ALTER TABLE doc_embeddings ADD INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
43
|
+
"distance.function" = "cosine_distance",
|
|
44
|
+
"scalar.type" = "f32"
|
|
45
|
+
);
|
|
46
|
+
|
|
47
|
+
-- 为存量数据构建索引
|
|
48
|
+
BUILD INDEX vec_idx ON doc_embeddings;
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### 关键参数
|
|
52
|
+
|
|
53
|
+
| 参数 | 可选值 | 默认值 | 说明 |
|
|
54
|
+
|---|---|---|---|
|
|
55
|
+
| distance.function | l2_distance, cosine_distance, jaccard_distance, hamming_distance | cosine_distance | 距离函数 |
|
|
56
|
+
| scalar.type | f32, f16, i8, b1 | f32 | 索引元素类型 |
|
|
57
|
+
| m | 建议 ≤ 1000 | 16 | HNSW 最大邻居数 |
|
|
58
|
+
| ef.construction | 建议 ≤ 5000 | 128 | 构建时候选集大小 |
|
|
59
|
+
| compress.codec | uncompressed/zstd/lz4 | uncompressed | 压缩算法 |
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## 插入向量数据
|
|
64
|
+
|
|
65
|
+
```sql
|
|
66
|
+
-- 直接插入
|
|
67
|
+
INSERT INTO doc_embeddings (id, content, vec) VALUES
|
|
68
|
+
(1, 'hello world', vector(0.1, 0.2, 0.3, ...)),
|
|
69
|
+
(2, 'foo bar', vector(0.4, 0.5, 0.6, ...));
|
|
70
|
+
|
|
71
|
+
-- 从字符串转换
|
|
72
|
+
INSERT INTO doc_embeddings (id, vec)
|
|
73
|
+
SELECT id, CAST(embedding_str AS VECTOR(1024))
|
|
74
|
+
FROM source_table;
|
|
75
|
+
|
|
76
|
+
-- 从 ARRAY 列转换(外部系统写入场景);注意 INSERT OVERWRITE 会覆盖目标表已有数据,增量写入请改用 INSERT INTO
|
|
77
|
+
INSERT OVERWRITE doc_embeddings
|
|
78
|
+
SELECT id, content, CAST(vec_array AS VECTOR(FLOAT, 1024))
|
|
79
|
+
FROM staging_table;
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 向量检索
|
|
85
|
+
|
|
86
|
+
```sql
|
|
87
|
+
-- 调整探索因子(精度 vs 速度权衡)
|
|
88
|
+
SET cz.vector.index.search.ef = 64;
|
|
89
|
+
|
|
90
|
+
-- L2 距离检索(欧几里得距离,越小越相似)
|
|
91
|
+
SELECT id, content, l2_distance(vec, vector(0.1, 0.2, 0.3, ...)) AS dist
|
|
92
|
+
FROM doc_embeddings
|
|
93
|
+
ORDER BY dist
|
|
94
|
+
LIMIT 10;
|
|
95
|
+
|
|
96
|
+
-- 余弦距离检索(越小越相似)
|
|
97
|
+
SELECT id, content, cosine_distance(vec, CAST('[0.1, 0.2, 0.3, ...]' AS VECTOR(1024))) AS dist
|
|
98
|
+
FROM doc_embeddings
|
|
99
|
+
ORDER BY dist
|
|
100
|
+
LIMIT 10;
|
|
101
|
+
|
|
102
|
+
-- 带过滤条件的向量检索(向量 + 标量融合;假设表中另有 category 列,:query_vec 为查询向量参数)
|
|
103
|
+
SELECT id, content, cosine_distance(vec, :query_vec) AS dist
|
|
104
|
+
FROM doc_embeddings
|
|
105
|
+
WHERE category = 'tech'
|
|
106
|
+
AND cosine_distance(vec, :query_vec) < 0.3
|
|
107
|
+
ORDER BY dist
|
|
108
|
+
LIMIT 10;
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## 距离函数速查
|
|
114
|
+
|
|
115
|
+
| 函数 | 适用场景 | 说明 |
|
|
116
|
+
|---|---|---|
|
|
117
|
+
| `l2_distance(v1, v2)` | 通用语义检索 | 欧几里得距离,越小越相似 |
|
|
118
|
+
| `cosine_distance(v1, v2)` | 文本语义检索 | 余弦距离,越小越相似 |
|
|
119
|
+
| `dot_product(v1, v2)` | 归一化向量 | 点积,越大越相似 |
|
|
120
|
+
| `hamming_distance(v1, v2)` | 二值向量 | 汉明距离,越小越相似 |
|
|
121
|
+
| `jaccard_distance(v1, v2)` | 集合相似度 | 雅卡德距离 |
|
|
122
|
+
| `binary_quantize(v)` | 向量压缩 | 将 float 向量二值化 |
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 向量 + 倒排索引融合检索
|
|
127
|
+
|
|
128
|
+
```sql
|
|
129
|
+
-- 建表:同时支持向量索引和倒排索引
|
|
130
|
+
CREATE TABLE hybrid_search (
|
|
131
|
+
id INT,
|
|
132
|
+
content STRING,
|
|
133
|
+
vec VECTOR(FLOAT, 1024),
|
|
134
|
+
INDEX content_inv_idx (content) USING INVERTED,
|
|
135
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
136
|
+
"distance.function" = "cosine_distance"
|
|
137
|
+
)
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
-- 融合检索:先用关键词过滤,再用向量排序(注意:带前导通配符的 LIKE 可能无法命中倒排索引;如需利用倒排索引,请参考倒排索引文档使用全文匹配函数,如 match_any / match_phrase)
|
|
141
|
+
SELECT id, content, cosine_distance(vec, :query_vec) AS dist
|
|
142
|
+
FROM hybrid_search
|
|
143
|
+
WHERE content LIKE '%关键词%'
|
|
144
|
+
ORDER BY dist
|
|
145
|
+
LIMIT 10;
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 注意事项
|
|
151
|
+
|
|
152
|
+
- 向量类型不支持 `ORDER BY` 或 `GROUP BY`(只能对距离函数结果排序)
|
|
153
|
+
- 向量索引性能与内存/磁盘缓存直接相关,建议**单独占用 VCluster**
|
|
154
|
+
- 外部系统写入时不能直接写 VECTOR 类型,需先写 ARRAY 再 CAST 转换
|
|
155
|
+
- `ef` 值越大,检索精度越高但延迟越大;建议从 64 开始调优
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-data-retention
|
|
3
|
+
description: |
|
|
4
|
+
管理 ClickZetta Lakehouse 数据生命周期(TTL 自动回收)和数据恢复(Time Travel / UNDROP / RESTORE)。
|
|
5
|
+
覆盖数据生命周期设置(data_lifecycle)、Time Travel 保留周期(data_retention_days)、
|
|
6
|
+
历史数据查询(TIMESTAMP AS OF)、误删表恢复(UNDROP TABLE)、数据回滚(RESTORE TABLE)、
|
|
7
|
+
变更历史查看(DESC HISTORY)等完整数据管理工作流。
|
|
8
|
+
当用户说"设置生命周期"、"数据自动清理"、"TTL"、"data_lifecycle"、"表数据过期"、
|
|
9
|
+
"自动回收数据"、"设置数据保留"、"data_retention_days"、"Time Travel"、
|
|
10
|
+
"恢复误删的表"、"表被 DROP 了怎么办"、"回滚数据"、"查看历史版本"、
|
|
11
|
+
"UNDROP"、"RESTORE TABLE"、"误操作恢复"、"数据回滚"、"时间旅行"时触发。
|
|
12
|
+
Keywords: TTL, data retention, time travel, lifecycle, UNDROP, RESTORE, recovery, rollback
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# ClickZetta 数据生命周期与恢复
|
|
16
|
+
|
|
17
|
+
## 两个核心概念
|
|
18
|
+
|
|
19
|
+
| 概念 | 属性键 | 作用 | 默认值 | 范围 |
|
|
20
|
+
|---|---|---|---|---|
|
|
21
|
+
| 数据生命周期(TTL) | `data_lifecycle` | 自动回收超期未更新的数据 | `-1`(永不回收) | 任意正整数天 |
|
|
22
|
+
| Time Travel 保留周期 | `data_retention_days` | 历史版本保留时长,支持时间点查询和恢复 | `1`(1天) | 0-90 天 |
|
|
23
|
+
|
|
24
|
+
两者独立,可同时设置。
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## 数据生命周期(TTL)
|
|
29
|
+
|
|
30
|
+
### 设置
|
|
31
|
+
|
|
32
|
+
```sql
|
|
33
|
+
-- 建表时设置(7天未更新自动清空数据)
|
|
34
|
+
CREATE TABLE orders_archive (id BIGINT, amount DECIMAL(10,2))
|
|
35
|
+
PROPERTIES('data_lifecycle'='7');
|
|
36
|
+
|
|
37
|
+
-- 到期同时删除表结构
|
|
38
|
+
CREATE TABLE temp_staging (id INT, data STRING)
|
|
39
|
+
PROPERTIES('data_lifecycle'='30', 'data_lifecycle_delete_meta'='true');
|
|
40
|
+
|
|
41
|
+
-- 修改现有表
|
|
42
|
+
ALTER TABLE my_table SET PROPERTIES ('data_lifecycle'='90');
|
|
43
|
+
|
|
44
|
+
-- 关闭生命周期
|
|
45
|
+
ALTER TABLE my_table SET PROPERTIES ('data_lifecycle'='-1');
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 查看
|
|
49
|
+
|
|
50
|
+
```sql
|
|
51
|
+
-- 查看单表
|
|
52
|
+
SHOW CREATE TABLE my_table;
|
|
53
|
+
|
|
54
|
+
-- 批量查看已设置生命周期的表
|
|
55
|
+
SELECT table_schema, table_name, data_lifecycle, last_modify_time
|
|
56
|
+
FROM information_schema.tables
|
|
57
|
+
WHERE data_lifecycle > 0
|
|
58
|
+
ORDER BY data_lifecycle;
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 注意事项
|
|
62
|
+
- 回收不立即执行,后台每 12 小时轮询,通常 24 小时内完成
|
|
63
|
+
- 默认只清空数据不删表;加 `data_lifecycle_delete_meta='true'` 才删表
|
|
64
|
+
- 分区表按分区独立计算 `last_modified_time`
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Time Travel 与数据恢复
|
|
69
|
+
|
|
70
|
+
### 配置保留周期
|
|
71
|
+
|
|
72
|
+
```sql
|
|
73
|
+
-- 修改保留周期(默认 1 天,最长 90 天)
|
|
74
|
+
ALTER TABLE my_table SET PROPERTIES ('data_retention_days'='7');
|
|
75
|
+
|
|
76
|
+
-- 建表时指定
|
|
77
|
+
CREATE TABLE orders (id INT, amount DECIMAL(10,2))
|
|
78
|
+
PROPERTIES ('data_retention_days'='30');
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 查看变更历史
|
|
82
|
+
|
|
83
|
+
```sql
|
|
84
|
+
DESC HISTORY my_table;
|
|
85
|
+
-- 返回:version, time, total_rows, total_bytes, user, operation, job_id, stats
|
|
86
|
+
|
|
87
|
+
-- 查看已删除表的记录
|
|
88
|
+
SHOW TABLES HISTORY;
|
|
89
|
+
SHOW TABLES HISTORY LIKE 'orders%';
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Time Travel 查询历史数据
|
|
93
|
+
|
|
94
|
+
```sql
|
|
95
|
+
-- 查询指定时间点(只读)
|
|
96
|
+
SELECT * FROM orders TIMESTAMP AS OF '2026-03-18 15:00:00';
|
|
97
|
+
|
|
98
|
+
-- 相对时间
|
|
99
|
+
SELECT * FROM orders TIMESTAMP AS OF CURRENT_TIMESTAMP() - INTERVAL 12 HOURS;
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### RESTORE TABLE 回滚
|
|
103
|
+
|
|
104
|
+
```sql
|
|
105
|
+
-- 将表回滚到指定时间点(覆盖当前数据)
|
|
106
|
+
RESTORE TABLE orders TO TIMESTAMP AS OF '2026-03-18 14:59:00';
|
|
107
|
+
```
|
|
108
|
+
> 支持普通表和动态表,不支持物化视图。
|
|
109
|
+
|
|
110
|
+
### UNDROP TABLE 恢复误删表
|
|
111
|
+
|
|
112
|
+
```sql
|
|
113
|
+
-- 恢复被 DROP 的表(需在保留周期内)
|
|
114
|
+
UNDROP TABLE orders;
|
|
115
|
+
```
|
|
116
|
+
> 同名表存在时无法 UNDROP,需先 DROP 新表再 UNDROP。
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## 典型场景
|
|
121
|
+
|
|
122
|
+
### 误删表恢复
|
|
123
|
+
```sql
|
|
124
|
+
SHOW TABLES HISTORY LIKE 'orders';
|
|
125
|
+
UNDROP TABLE orders;
|
|
126
|
+
SELECT COUNT(*) FROM orders;
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 误执行 DELETE/UPDATE 回滚
|
|
130
|
+
```sql
|
|
131
|
+
DESC HISTORY analytics.events;
|
|
132
|
+
-- 全量回滚
|
|
133
|
+
RESTORE TABLE analytics.events TO TIMESTAMP AS OF '2026-03-18 14:55:00';
|
|
134
|
+
-- 或仅补回部分数据
|
|
135
|
+
INSERT INTO analytics.events
|
|
136
|
+
SELECT * FROM analytics.events TIMESTAMP AS OF '2026-03-18 14:55:00'
|
|
137
|
+
WHERE date < '2025-01-01';
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### 日志表自动清理
|
|
141
|
+
```sql
|
|
142
|
+
CREATE TABLE app_logs (log_id BIGINT, message STRING, log_time TIMESTAMP)
|
|
143
|
+
PROPERTIES('data_lifecycle'='30');
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## 决策树
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
数据丢失/损坏
|
|
152
|
+
├── 表被 DROP?
|
|
153
|
+
│ ├── 在保留周期内 → UNDROP TABLE
|
|
154
|
+
│ └── 超出保留周期 → 联系管理员
|
|
155
|
+
└── 数据被 DELETE/UPDATE/TRUNCATE?
|
|
156
|
+
├── 在保留周期内
|
|
157
|
+
│ ├── 全量回滚 → RESTORE TABLE TO TIMESTAMP AS OF
|
|
158
|
+
│ └── 补回部分 → INSERT INTO ... SELECT ... TIMESTAMP AS OF
|
|
159
|
+
└── 超出保留周期 → 联系管理员
|
|
160
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么给表设置数据生命周期?超过 30 天自动清理","expected_skill":"clickzetta-data-retention","expected_output_contains":["data_lifecycle","PROPERTIES"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"表被误删了怎么恢复?UNDROP 怎么用?","expected_skill":"clickzetta-data-retention","expected_output_contains":["UNDROP","TABLE"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"怎么用 Time Travel 查询昨天的历史数据?","expected_skill":"clickzetta-data-retention","expected_output_contains":["TIMESTAMP AS OF"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"误执行了 DELETE,怎么回滚到操作前的状态?","expected_skill":"clickzetta-data-retention","expected_output_contains":["RESTORE","TIMESTAMP"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"data_retention_days 和 data_lifecycle 有什么区别?","expected_skill":"clickzetta-data-retention","expected_output_contains":["data_retention_days","data_lifecycle"]}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# 数据生命周期管理参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/data-lifecycle
|
|
4
|
+
> 已通过实际 Lakehouse 连接验证(cn-shanghai-alicloud, f8866243, quick_start)
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 核心属性
|
|
9
|
+
|
|
10
|
+
| 属性键 | 类型 | 默认值 | 说明 |
|
|
11
|
+
|---|---|---|---|
|
|
12
|
+
| `data_lifecycle` | 正整数 / -1 | `-1` | 数据自动回收周期(天)。-1 表示永不回收 |
|
|
13
|
+
| `data_lifecycle_delete_meta` | boolean string | `'false'` | 到期时是否同时删除表结构。默认只清空数据 |
|
|
14
|
+
| `data_retention_days` | 整数 0-90 | `1` | Time Travel 历史版本保留天数 |
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## CREATE TABLE 语法
|
|
19
|
+
|
|
20
|
+
```sql
|
|
21
|
+
CREATE TABLE tname (
|
|
22
|
+
col1 datatype1,
|
|
23
|
+
col2 datatype2
|
|
24
|
+
) PROPERTIES(
|
|
25
|
+
'data_lifecycle'='<天数>',
|
|
26
|
+
'data_lifecycle_delete_meta'='true', -- 可选,到期删表结构
|
|
27
|
+
'data_retention_days'='<天数>' -- 可选,Time Travel 保留周期
|
|
28
|
+
);
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**验证结果**:`SHOW CREATE TABLE` 输出中属性显示在 `TBLPROPERTIES` 块内:
|
|
32
|
+
```sql
|
|
33
|
+
CREATE TABLE quick_start.mcp_demo.lifecycle_test_table(
|
|
34
|
+
`id` int,
|
|
35
|
+
`name` string,
|
|
36
|
+
`created_at` timestamp)
|
|
37
|
+
USING PARQUET
|
|
38
|
+
TBLPROPERTIES(
|
|
39
|
+
'data_lifecycle'='7',
|
|
40
|
+
'data_retention_days'='7');
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## ALTER TABLE 语法
|
|
46
|
+
|
|
47
|
+
```sql
|
|
48
|
+
-- 设置/修改生命周期
|
|
49
|
+
ALTER TABLE tname SET PROPERTIES ('data_lifecycle'='<天数>');
|
|
50
|
+
|
|
51
|
+
-- 关闭生命周期
|
|
52
|
+
ALTER TABLE tname SET PROPERTIES ('data_lifecycle'='-1');
|
|
53
|
+
|
|
54
|
+
-- 设置到期删除表结构
|
|
55
|
+
ALTER TABLE tname SET PROPERTIES ('data_lifecycle_delete_meta'='true');
|
|
56
|
+
|
|
57
|
+
-- 设置 Time Travel 保留周期
|
|
58
|
+
ALTER TABLE tname SET PROPERTIES ('data_retention_days'='<天数>');
|
|
59
|
+
|
|
60
|
+
-- 同时设置多个属性
|
|
61
|
+
ALTER TABLE tname SET PROPERTIES (
|
|
62
|
+
'data_lifecycle'='90',
|
|
63
|
+
'data_lifecycle_delete_meta'='true',
|
|
64
|
+
'data_retention_days'='30'
|
|
65
|
+
);
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 查看配置
|
|
71
|
+
|
|
72
|
+
### DESC EXTENDED
|
|
73
|
+
|
|
74
|
+
```sql
|
|
75
|
+
DESC EXTENDED tname;
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**实际输出结构**(验证结果):
|
|
79
|
+
|
|
80
|
+
| column_name | data_type | comment |
|
|
81
|
+
|---|---|---|
|
|
82
|
+
| id | int | |
|
|
83
|
+
| name | string | |
|
|
84
|
+
| ... | ... | |
|
|
85
|
+
| # detailed table information | | |
|
|
86
|
+
| workspace | quick_start | |
|
|
87
|
+
| schema | mcp_demo | |
|
|
88
|
+
| name | lifecycle_test_table | |
|
|
89
|
+
| creator | qiliang | |
|
|
90
|
+
| created_time | 2026-05-01 11:05:08.904 | |
|
|
91
|
+
| last_modified_time | 2026-05-01 11:05:26.442 | |
|
|
92
|
+
| comment | | |
|
|
93
|
+
| properties | (("data_lifecycle","7"),("data_retention_days","7")) | |
|
|
94
|
+
| version | 3377453148768716241 | |
|
|
95
|
+
| type | TABLE | |
|
|
96
|
+
| format | PARQUET | |
|
|
97
|
+
| statistics | 1 rows 2445 bytes | |
|
|
98
|
+
|
|
99
|
+
关键字段:
|
|
100
|
+
- `last_modified_time`:生命周期从此时间起算
|
|
101
|
+
- `properties`:显示所有 TBLPROPERTIES
|
|
102
|
+
|
|
103
|
+
### SHOW CREATE TABLE
|
|
104
|
+
|
|
105
|
+
```sql
|
|
106
|
+
SHOW CREATE TABLE tname;
|
|
107
|
+
-- 返回完整 DDL,TBLPROPERTIES 中包含 data_lifecycle 等属性
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### information_schema.tables
|
|
111
|
+
|
|
112
|
+
```sql
|
|
113
|
+
SELECT table_name, data_lifecycle, last_modify_time
|
|
114
|
+
FROM information_schema.tables
|
|
115
|
+
WHERE table_schema = 'my_schema';
|
|
116
|
+
-- data_lifecycle = -1 表示永久保留(未设置生命周期)
|
|
117
|
+
-- data_lifecycle > 0 表示已设置生命周期(单位:天)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## 分区表
|
|
123
|
+
|
|
124
|
+
分区表的生命周期按**分区**计算,每个分区独立判断 `last_modified_time`。
|
|
125
|
+
|
|
126
|
+
```sql
|
|
127
|
+
-- 查看分区的修改时间
|
|
128
|
+
SHOW PARTITIONS EXTENDED tname;
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**实际输出字段**(验证结果):
|
|
132
|
+
|
|
133
|
+
| 字段 | 说明 |
|
|
134
|
+
|---|---|
|
|
135
|
+
| partitions | 分区值(如 dt=2024-01-01) |
|
|
136
|
+
| total_rows | 分区行数 |
|
|
137
|
+
| bytes | 分区大小 |
|
|
138
|
+
| total_files | 文件数 |
|
|
139
|
+
| created_time | 分区创建时间 |
|
|
140
|
+
| last_modified_time | 分区最后修改时间(生命周期从此起算) |
|
|
141
|
+
| last_data_time | 最后数据写入时间 |
|
|
142
|
+
| last_compaction_time | 最后 compaction 时间 |
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Time Travel 语法
|
|
147
|
+
|
|
148
|
+
```sql
|
|
149
|
+
-- 查询历史时间点数据
|
|
150
|
+
SELECT * FROM tname TIMESTAMP AS OF '<timestamp>';
|
|
151
|
+
SELECT * FROM tname TIMESTAMP AS OF CURRENT_TIMESTAMP - INTERVAL 12 HOURS;
|
|
152
|
+
|
|
153
|
+
-- 查看版本历史
|
|
154
|
+
DESC HISTORY tname;
|
|
155
|
+
-- 返回:version, time, total_rows, total_bytes, user, operation, job_id, stats
|
|
156
|
+
|
|
157
|
+
-- 恢复到历史版本(注意:目标时间点必须晚于表创建时间)
|
|
158
|
+
RESTORE TABLE tname TO TIMESTAMP AS OF '<timestamp>';
|
|
159
|
+
|
|
160
|
+
-- 恢复被删除的表
|
|
161
|
+
UNDROP TABLE tname;
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**注意**:`RESTORE TABLE` 的目标时间点不能早于表创建时间,否则报错:
|
|
165
|
+
`InvalidArgument: toTimestamp is smaller than timestamp of fromTimestamp`
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## 工作原理
|
|
170
|
+
|
|
171
|
+
1. 生命周期回收依赖 `last_modified_time`(DDL/DML 操作会更新此时间)
|
|
172
|
+
2. 后台进程每 **12 小时**轮询一次,到期数据通常在 **24 小时内**被回收
|
|
173
|
+
3. 到期数据不立即删除,仍可查询,直到后台进程执行
|
|
174
|
+
4. 被回收的数据仍遵守 `data_retention_days`,可用 Time Travel 查询
|
|
175
|
+
5. 默认行为:只清空数据,**保留表结构**;设置 `data_lifecycle_delete_meta='true'` 才删表
|