@clickzetta/cz-cli-darwin-arm64 0.3.17 → 0.3.19
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +160 -0
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +155 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +386 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +548 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +220 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +259 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +112 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +257 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +156 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +751 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +199 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +537 -0
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +276 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +379 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
- package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
- package/bin/skills/cz-cli/SKILL.md +1 -1
- package/bin/skills/cz-cli-inner/SKILL.md +8 -0
- package/package.json +1 -1
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/SKILL.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/dt-declaration-strategy.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/incremental-config-reference.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/refresh-history-guide.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/sql-limitations.md +0 -0
- /package/bin/skills/{dynamic-table-alter → clickzetta-dynamic-table/dynamic-table-alter}/SKILL.md +0 -0
package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md
@@ -0,0 +1,220 @@
---
name: clickzetta-data-ingest-pipeline
description: |
  Overview and routing for ClickZetta Lakehouse data ingestion. Recommends the most
  suitable ingestion method based on the user's data source type, latency requirements,
  data volume, and other conditions, then guides the user to the matching specialized
  Skill or directly performs simple imports. Triggered when the user says "import data
  into the Lakehouse", "warehouse ingestion", "lake ingestion", "how do I get my data in",
  "data collection", "data loading", "ingest data", "load data", or "help me choose a
  data ingestion approach".
  Keywords: data ingestion, import, routing, pipeline selection, data source
---

# Lakehouse Data Ingestion Overview and Routing

Recommends the most suitable ingestion method based on the user's data source, latency
requirements, and data volume, then routes to the matching specialized Pipeline Skill or
directly performs simple imports.

## When to Use

- The user wants to load data into ClickZetta Lakehouse but is unsure which method to use
- The user has described a data source (Kafka, MySQL, OSS, files, etc.) and needs a recommended ingestion plan
- The user wants to understand when to use each ingestion method and how they differ
- Keywords: data ingestion, warehouse ingestion, lake ingestion, data collection, data loading, pipeline selection

## Prerequisites

- A ClickZetta Lakehouse account with permission to create workspaces, schemas, tables, PIPEs, tasks, etc.
- **Execution environment (either one suffices; prefer cz-cli)**:
  - **cz-cli path**: cz-cli installed (`pip install cz-cli`) and configured via `cz-cli configure`
  - **MCP path**: clickzetta-studio-mcp or clickzetta-mcp-server tools available (`LH_execute_query`, `create_task`, `save_integration_task`, etc.)

## Environment Detection (read before executing)

Before any operation, determine the current execution environment:

**Step 1: check whether cz-cli is available**
```bash
cz-cli --version
```
- If the command exists → **take the cz-cli path** (see the "cz-cli Alternative Path" section at the end of this document, plus each specialized Skill's cz-cli alternative path)
- If the command does not exist → continue to the MCP check

**Step 2: check whether MCP is available (only when cz-cli is not)**

Try calling the `LH_execute_query` tool with a trivial SQL statement (e.g. `SELECT 1`).
- If the tool appears in the tool list → **take the MCP path** (the default path in this document)
- If the tool does not exist → stop and tell the user:
  > "This environment has neither cz-cli nor MCP tools; please install one of them and retry.
  > cz-cli install: `pip install cz-cli`, then run `cz-cli configure`
  > MCP install: see the clickzetta-studio-mcp or clickzetta-mcp-server setup docs"

## Ingestion Method Decision Tree

### Step 1: confirm the data source type and requirements

Collect the following from the user:

1. **Data source type**: Kafka / object storage (OSS/S3/COS) / relational database (MySQL/PostgreSQL/SQL Server) / local files / URL or web files / Java SDK / ZettaPark
2. **Latency requirement**: real-time (seconds) / near-real-time (minutes) / offline batch (hours to days)
3. **Sync scope**: single table / multiple tables / whole database
4. **Continuous sync needed**: one-off import / continuous incremental sync
5. **CDC (change data capture) needed**: yes / no

### Step 2: recommend a plan from the decision matrix

| Data source | Latency | Scope | Recommended method | Matching Skill |
|--------|--------|---------|---------|-----------|
| Kafka | real-time / near-real-time | single topic | Kafka PIPE continuous ingestion (SQL) | `clickzetta-kafka-ingest-pipeline` |
| Kafka | real-time | multiple topics | Studio real-time sync | `clickzetta-realtime-sync-pipeline` |
| Object storage (OSS/S3/COS) | near-real-time / batch | continuously arriving files | PIPE continuous ingestion | `clickzetta-oss-ingest-pipeline` |
| Object storage | one-off | batch of files | COPY INTO command | `clickzetta-file-import-pipeline` (COPY INTO part) |
| MySQL/PostgreSQL/SQL Server | real-time CDC | single table | Studio real-time sync | `clickzetta-realtime-sync-pipeline` |
| MySQL/PostgreSQL/SQL Server | real-time CDC | multi-table / whole DB | Studio multi-table real-time sync | `clickzetta-cdc-sync-pipeline` |
| MySQL/PostgreSQL/SQL Server | offline batch | single table | Studio offline sync | `clickzetta-batch-sync-pipeline` |
| MySQL/PostgreSQL/SQL Server | offline batch | multiple tables | Studio multi-table offline sync | `clickzetta-batch-sync-pipeline` |
| Local file / URL | one-off | single/multiple files | URL download + COPY INTO | `clickzetta-file-import-pipeline` |
| Streaming incremental compute | near-real-time | table-change driven | Dynamic Table + Stream | `clickzetta-incremental-compute-pipeline` |
| Java application | real-time / batch | programmatic writes | Java SDK | (see the SDK ingestion notes below) |
| Python/ZettaPark | batch | DataFrame | ZettaPark save_as_table | (see the SDK ingestion notes below) |

### Step 3: route to a specialized Skill or execute directly

Based on the recommendation, apply the following routing logic:

**Scenarios with a matching specialized Skill** → tell the user the recommended plan and direct them to that Skill:
- `clickzetta-kafka-ingest-pipeline`: Kafka PIPE pipeline setup
- `clickzetta-oss-ingest-pipeline`: object-storage PIPE pipeline setup
- `clickzetta-batch-sync-pipeline`: Studio offline sync tasks
- `clickzetta-realtime-sync-pipeline`: Studio real-time sync tasks
- `clickzetta-cdc-sync-pipeline`: Studio multi-table real-time sync (CDC)
- `clickzetta-incremental-compute-pipeline`: Dynamic Table + Stream incremental compute pipelines
- `clickzetta-file-import-pipeline`: URL/file download imports
- `clickzetta-table-stream-pipeline`: Table Stream change data capture

**Simple scenarios with no specialized Skill** → execute directly:

#### SQL INSERT (small data volumes)
```sql
-- Execute via LH_execute_query
INSERT INTO schema_name.table_name (col1, col2, col3)
VALUES ('val1', 'val2', 'val3');
```

#### COPY INTO quick import (from a Volume)
```sql
-- 1. Confirm the Volume contains files
SHOW VOLUME DIRECTORY volume_name;

-- 2. Run COPY INTO
COPY INTO schema_name.table_name
FROM VOLUME volume_name
USING CSV
OPTIONS('header' = 'true');
```

#### Java SDK ingestion notes
Provide the key Java SDK configuration details:
- Maven dependency coordinates
- Connection settings (endpoint, workspace, schema, vcluster)
- Bulk write API: `BulkloadWriter`
- Real-time write API: `RealtimeWriter`
- Point the user to the official doc: `comprehensive_guide_to_ingesting_javasdk_buckload_realtime`

#### ZettaPark (Python) ingestion notes
- `INSERT` style: `session.sql("INSERT INTO ...")`
- `save_as_table` style: `df.write.save_as_table("table_name")`
- Point the user to the official doc: `comprehensive_guide_to_ingesting_zettapark_save_as_table`

## Warehouse Ingestion vs. Lake Ingestion

| Dimension | Warehouse ingestion | Lake ingestion |
|------|---------|---------|
| Target | Lakehouse managed tables | user Volume (object storage) |
| Format | auto-converted to the internal columnar format | original file format preserved |
| Query performance | high (columnar storage + indexes) | lower (raw files must be scanned) |
| Use cases | analytical queries, BI reports, data warehousing | staging, raw-data archiving, cross-system sharing |
| Common methods | Studio sync, PIPE, COPY INTO, SDK | PUT files, Python upload scripts |

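To make the contrast concrete, here is a minimal sketch assembled from the two statements already used in this Skill (`SHOW VOLUME DIRECTORY` and `COPY INTO`); the volume and table names are placeholders:

```sql
-- Lake side: files staged in a user Volume keep their original format
SHOW VOLUME DIRECTORY raw_landing_vol;

-- Warehouse side: loading the same files into a managed table converts them to
-- the internal columnar format, which is what enables fast analytical queries
COPY INTO analytics.orders
FROM VOLUME raw_landing_vol
USING CSV
OPTIONS('header' = 'true');
```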
## Examples

### Example 1: user unsure which method to use

User says: "I have a MySQL database and want to sync its orders table to the Lakehouse in real time"

Routing logic:
1. Data source: MySQL (relational database)
2. Latency: real-time
3. Scope: single table
4. CDC needed: yes (real-time sync implies capturing changes)
→ Recommendation: Studio real-time sync
→ Route to the `clickzetta-realtime-sync-pipeline` Skill

### Example 2: mixed data sources

User says: "We have user-behavior logs in Kafka plus business data in MySQL, and both need to go into the Lakehouse"

Routing logic:
1. Kafka user-behavior logs → `clickzetta-kafka-ingest-pipeline` (PIPE continuous ingestion)
2. MySQL business data → confirm the latency requirement:
   - real-time → `clickzetta-realtime-sync-pipeline` or `clickzetta-cdc-sync-pipeline`
   - offline → `clickzetta-batch-sync-pipeline`
→ Direct the user to each Skill separately

### Example 3: simple one-off file import

User says: "I have a CSV file to import"

Routing logic:
1. Data source: local file
2. One-off import
→ Route to the `clickzetta-file-import-pipeline` Skill (supports file upload + COPY INTO)

## Error Handling

| Scenario | Handling |
|------|---------|
| The user cannot identify the data source type | Ask where the data currently lives (which system/service) and help classify it |
| The requirements span several ingestion methods | Split the work into independent import tasks and route each to its Skill |
| The recommended Skill does not exist yet | Provide the method's basic steps and key SQL/API calls, and point the user to the official docs |
| The user's cloud environment lacks a connection type | Check available connection types via `LH_show_object_list` (object_type=CONNECTIONS) and recommend an alternative |
| Very large data volume (TB scale) | Suggest importing in batches; prefer PIPE or Studio sync tasks (both support resumable transfers) |

## Notes

- This Skill is a routing entry point: it does not build complex pipelines itself but directs users to the specialized Skills
- Simple cases (SQL INSERT, a one-off COPY INTO) can be completed directly within this Skill
- Recommendations must account for the user's cloud environment (Alibaba Cloud / Tencent Cloud / AWS); supported connection types differ between them
- Use `LH_show_object_list` (object_type=VCLUSTERS) to confirm the available virtual clusters; sync tasks require a SYNC-type VCluster
- Warehouse ingestion is the most common case; lake ingestion is mainly for staging raw data or cross-system sharing

---

## cz-cli Alternative Path

> Use this section only when cz-cli is available and MCP is not.
> This Skill is a routing entry point; the core cz-cli logic lives in each specialized Skill's "cz-cli Alternative Path" section.

### Routing notes

When MCP is unavailable, every specialized Skill provides a cz-cli alternative path:

| Data source | Recommended method | Skill's cz-cli path |
|--------|---------|--------------------------|
| Kafka | PIPE continuous ingestion | `clickzetta-kafka-ingest-pipeline` → cz-cli alternative path |
| Object storage (OSS/S3/COS) | PIPE continuous ingestion | `clickzetta-oss-ingest-pipeline` → cz-cli alternative path |
| MySQL/PostgreSQL/SQL Server (real-time, single table) | Studio real-time sync | `clickzetta-realtime-sync-pipeline` → cz-cli alternative path |
| MySQL/PostgreSQL/SQL Server (real-time, multi-table/whole DB) | Studio multi-table real-time sync | `clickzetta-cdc-sync-pipeline` → cz-cli alternative path |
| MySQL/PostgreSQL/SQL Server (offline batch) | Studio offline sync | `clickzetta-batch-sync-pipeline` → cz-cli alternative path |

### Simple scenarios, executed directly (cz-cli edition)

For simple cases that need no specialized Skill, the cz-cli agent can handle them directly:

```bash
# SQL INSERT (small data volumes)
cz-cli agent run "Insert into table <schema_name>.<table_name>: <col1>=<val1>, <col2>=<val2>" \
  --format a2a --dangerously-skip-permissions

# COPY INTO quick import (from a Volume)
cz-cli agent run "Load data from Volume <volume_name> as CSV (with header) into table <schema_name>.<table_name>" \
  --format a2a --dangerously-skip-permissions
```
package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl
@@ -0,0 +1,5 @@
{"case_id":"001","type":"should_call","user_input":"I want to load data into the Lakehouse but am not sure which method to use","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["data source","real-time","batch"]}
{"case_id":"002","type":"should_call","user_input":"What are the options for warehouse ingestion, and how do I choose?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["Kafka","object storage","MySQL"]}
{"case_id":"003","type":"should_call","user_input":"I have both MySQL and Kafka sources to load into the Lakehouse; which method for each?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["CDC"]}
{"case_id":"004","type":"should_call","user_input":"How do I choose a data ingestion plan? What is the difference between real-time and offline?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["real-time","offline","latency"]}
{"case_id":"005","type":"should_call","user_input":"ingest data into ClickZetta Lakehouse, what options do I have?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["Kafka","OSS","SDK"]}
package/bin/skills/clickzetta-data-retention/SKILL.md
@@ -0,0 +1,160 @@
---
name: clickzetta-data-retention
description: |
  Manages the ClickZetta Lakehouse data lifecycle (automatic TTL reclamation) and data
  recovery (Time Travel / UNDROP / RESTORE). Covers the complete data-management workflow:
  setting the data lifecycle (data_lifecycle), the Time Travel retention window
  (data_retention_days), querying historical data (TIMESTAMP AS OF), recovering
  accidentally dropped tables (UNDROP TABLE), rolling data back (RESTORE TABLE), and
  viewing change history (DESC HISTORY). Triggered when the user says "set a lifecycle",
  "auto-clean data", "TTL", "data_lifecycle", "table data expiry", "auto-reclaim data",
  "set data retention", "data_retention_days", "Time Travel", "recover a dropped table",
  "my table was DROPped", "roll back data", "view historical versions", "UNDROP",
  "RESTORE TABLE", "recover from a bad operation", or "time travel".
  Keywords: TTL, data retention, time travel, lifecycle, UNDROP, RESTORE, recovery, rollback
---

# ClickZetta Data Lifecycle and Recovery

## Two Core Concepts

| Concept | Property key | Purpose | Default | Range |
|---|---|---|---|---|
| Data lifecycle (TTL) | `data_lifecycle` | automatically reclaims data that has not been updated within the window | `-1` (never reclaim) | any positive integer of days |
| Time Travel retention | `data_retention_days` | how long historical versions are kept, enabling point-in-time queries and recovery | `1` (1 day) | 0-90 days |

The two are independent and can be set together.

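For instance, both properties can be set on the same table in one statement; a minimal sketch (the key/value syntax follows the verified examples in references/lifecycle-reference.md):

```sql
-- Keep 30 days of Time Travel history and clear data left untouched for 90 days
CREATE TABLE orders_snapshots (id BIGINT, amount DECIMAL(10,2))
PROPERTIES('data_lifecycle'='90', 'data_retention_days'='30');
```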
---

## Data Lifecycle (TTL)

### Setting it

```sql
-- At creation (data cleared automatically after 7 days without updates)
CREATE TABLE orders_archive (id BIGINT, amount DECIMAL(10,2))
PROPERTIES('data_lifecycle'='7');

-- Also drop the table structure on expiry
CREATE TABLE temp_staging (id INT, data STRING)
PROPERTIES('data_lifecycle'='30', 'data_lifecycle_delete_meta'='true');

-- Change an existing table
ALTER TABLE my_table SET PROPERTIES ('data_lifecycle'='90');

-- Turn the lifecycle off
ALTER TABLE my_table SET PROPERTIES ('data_lifecycle'='-1');
```

### Inspecting it

```sql
-- A single table
SHOW CREATE TABLE my_table;

-- All tables with a lifecycle configured
SELECT table_schema, table_name, data_lifecycle, last_modify_time
FROM information_schema.tables
WHERE data_lifecycle > 0
ORDER BY data_lifecycle;
```

### Notes
- Reclamation is not immediate: a background process polls every 12 hours, and it usually completes within 24 hours
- By default only the data is cleared, not the table; add `data_lifecycle_delete_meta='true'` to drop the table too
- Partitioned tables evaluate `last_modified_time` independently per partition (see the sketch below)

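Because each partition carries its own clock, the per-partition timestamps can be inspected directly; a minimal sketch using the `SHOW PARTITIONS EXTENDED` statement documented in references/lifecycle-reference.md (the table name is a placeholder):

```sql
-- last_modified_time in the output is where each partition's lifecycle clock starts
SHOW PARTITIONS EXTENDED orders_by_day;
```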
---

## Time Travel and Data Recovery

### Configuring the retention window

```sql
-- Change the retention window (default 1 day, max 90)
ALTER TABLE my_table SET PROPERTIES ('data_retention_days'='7');

-- Or set it at creation
CREATE TABLE orders (id INT, amount DECIMAL(10,2))
PROPERTIES ('data_retention_days'='30');
```

### Viewing change history

```sql
DESC HISTORY my_table;
-- Returns: version, time, total_rows, total_bytes, user, operation, job_id

-- Records of dropped tables
SHOW TABLES HISTORY;
SHOW TABLES HISTORY LIKE 'orders%';
```

### Querying historical data with Time Travel

```sql
-- Query a specific point in time (read-only)
SELECT * FROM orders TIMESTAMP AS OF '2026-03-18 15:00:00';

-- Relative time
SELECT * FROM orders TIMESTAMP AS OF CURRENT_TIMESTAMP() - INTERVAL 12 HOURS;
```

### Rolling back with RESTORE TABLE

```sql
-- Roll the table back to a point in time (overwrites the current data)
RESTORE TABLE orders TO TIMESTAMP AS OF '2026-03-18 14:59:00';
```
> Supported for regular tables and dynamic tables, but not materialized views.

### Recovering a dropped table with UNDROP TABLE

```sql
-- Recover a DROPped table (must be within the retention window)
UNDROP TABLE orders;
```
> UNDROP fails while a table of the same name exists; DROP the new table first, then UNDROP.

---

## Typical Scenarios

### Recovering an accidentally dropped table
```sql
SHOW TABLES HISTORY LIKE 'orders';
UNDROP TABLE orders;
SELECT COUNT(*) FROM orders;
```

### Rolling back an accidental DELETE/UPDATE
```sql
DESC HISTORY analytics.events;
-- Full rollback
RESTORE TABLE analytics.events TO TIMESTAMP AS OF '2026-03-18 14:55:00';
-- Or backfill only part of the data
INSERT INTO analytics.events
SELECT * FROM analytics.events TIMESTAMP AS OF '2026-03-18 14:55:00'
WHERE date < '2025-01-01';
```

### Auto-cleaning a log table
```sql
CREATE TABLE app_logs (log_id BIGINT, message STRING, log_time TIMESTAMP)
PROPERTIES('data_lifecycle'='30');
```

---

## Decision Tree

```
Data lost or corrupted
├── Table was DROPped?
│   ├── within the retention window → UNDROP TABLE
│   └── beyond the retention window → contact an administrator
└── Data hit by DELETE/UPDATE/TRUNCATE?
    ├── within the retention window
    │   ├── full rollback → RESTORE TABLE TO TIMESTAMP AS OF
    │   └── partial backfill → INSERT INTO ... SELECT ... TIMESTAMP AS OF
    └── beyond the retention window → contact an administrator
```
package/bin/skills/clickzetta-data-retention/eval_cases.jsonl
@@ -0,0 +1,5 @@
{"case_id":"001","type":"should_call","user_input":"How do I set a data lifecycle on a table so it auto-cleans after 30 days?","expected_skill":"clickzetta-data-retention","expected_output_contains":["data_lifecycle","PROPERTIES"]}
{"case_id":"002","type":"should_call","user_input":"A table was dropped by mistake; how do I recover it? How does UNDROP work?","expected_skill":"clickzetta-data-retention","expected_output_contains":["UNDROP","TABLE"]}
{"case_id":"003","type":"should_call","user_input":"How do I use Time Travel to query yesterday's historical data?","expected_skill":"clickzetta-data-retention","expected_output_contains":["TIMESTAMP AS OF"]}
{"case_id":"004","type":"should_call","user_input":"I ran a DELETE by mistake; how do I roll back to the state before it?","expected_skill":"clickzetta-data-retention","expected_output_contains":["RESTORE","TIMESTAMP"]}
{"case_id":"005","type":"should_call","user_input":"What is the difference between data_retention_days and data_lifecycle?","expected_skill":"clickzetta-data-retention","expected_output_contains":["data_retention_days","data_lifecycle"]}
package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md
@@ -0,0 +1,175 @@
# Data Lifecycle Management Reference

> Source: https://www.yunqi.tech/documents/data-lifecycle
> Verified against a live Lakehouse connection (cn-shanghai-alicloud, f8866243, quick_start)

---

## Core Properties

| Property key | Type | Default | Description |
|---|---|---|---|
| `data_lifecycle` | positive integer / -1 | `-1` | automatic data-reclamation window in days; -1 means never reclaim |
| `data_lifecycle_delete_meta` | boolean string | `'false'` | whether to also drop the table structure on expiry; by default only the data is cleared |
| `data_retention_days` | integer 0-90 | `1` | Time Travel history retention in days |

---

## CREATE TABLE Syntax

```sql
CREATE TABLE tname (
  col1 datatype1,
  col2 datatype2
) PROPERTIES(
  'data_lifecycle'='<days>',
  'data_lifecycle_delete_meta'='true',  -- optional: drop the table structure on expiry
  'data_retention_days'='<days>'        -- optional: Time Travel retention window
);
```

**Verified**: in the `SHOW CREATE TABLE` output the properties appear inside the `TBLPROPERTIES` block:
```sql
CREATE TABLE quick_start.mcp_demo.lifecycle_test_table(
  `id` int,
  `name` string,
  `created_at` timestamp)
USING PARQUET
TBLPROPERTIES(
  'data_lifecycle'='7',
  'data_retention_days'='7');
```

---

## ALTER TABLE Syntax

```sql
-- Set/change the lifecycle
ALTER TABLE tname SET PROPERTIES ('data_lifecycle'='<days>');

-- Turn the lifecycle off
ALTER TABLE tname SET PROPERTIES ('data_lifecycle'='-1');

-- Drop the table structure on expiry
ALTER TABLE tname SET PROPERTIES ('data_lifecycle_delete_meta'='true');

-- Set the Time Travel retention window
ALTER TABLE tname SET PROPERTIES ('data_retention_days'='<days>');

-- Set several properties at once
ALTER TABLE tname SET PROPERTIES (
  'data_lifecycle'='90',
  'data_lifecycle_delete_meta'='true',
  'data_retention_days'='30'
);
```

---

## Inspecting the Configuration

### DESC EXTENDED

```sql
DESC EXTENDED tname;
```

**Actual output structure** (verified):

| column_name | data_type | comment |
|---|---|---|
| id | int | |
| name | string | |
| ... | ... | |
| # detailed table information | | |
| workspace | quick_start | |
| schema | mcp_demo | |
| name | lifecycle_test_table | |
| creator | qiliang | |
| created_time | 2026-05-01 11:05:08.904 | |
| last_modified_time | 2026-05-01 11:05:26.442 | |
| comment | | |
| properties | (("data_lifecycle","7"),("data_retention_days","7")) | |
| version | 3377453148768716241 | |
| type | TABLE | |
| format | PARQUET | |
| statistics | 1 rows 2445 bytes | |

Key fields:
- `last_modified_time`: the lifecycle clock starts from this time
- `properties`: shows all TBLPROPERTIES

### SHOW CREATE TABLE

```sql
SHOW CREATE TABLE tname;
-- Returns the full DDL; TBLPROPERTIES includes data_lifecycle and the other properties
```

### information_schema.tables

```sql
SELECT table_name, data_lifecycle, last_modify_time
FROM information_schema.tables
WHERE table_schema = 'my_schema';
-- data_lifecycle = -1 means kept forever (no lifecycle configured)
-- data_lifecycle > 0 means a lifecycle is configured (in days)
```

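Building on the query above, the configured window can be compared with each table's idle time to spot tables approaching reclamation. A minimal sketch; it assumes a Spark-style `DATEDIFF(end, start)` function, which this reference does not verify:

```sql
-- Tables whose data has been idle for at least their lifecycle window
SELECT table_name,
       data_lifecycle,
       last_modify_time,
       DATEDIFF(CURRENT_TIMESTAMP, last_modify_time) AS idle_days  -- DATEDIFF is an assumption
FROM information_schema.tables
WHERE data_lifecycle > 0
  AND DATEDIFF(CURRENT_TIMESTAMP, last_modify_time) >= data_lifecycle;
```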
---

## Partitioned Tables

For partitioned tables the lifecycle is evaluated per **partition**: each partition's `last_modified_time` is judged independently.

```sql
-- View partition modification times
SHOW PARTITIONS EXTENDED tname;
```

**Actual output fields** (verified):

| Field | Description |
|---|---|
| partitions | partition value (e.g. dt=2024-01-01) |
| total_rows | rows in the partition |
| bytes | partition size |
| total_files | file count |
| created_time | partition creation time |
| last_modified_time | partition's last modification time (the lifecycle clock starts here) |
| last_data_time | last data write time |
| last_compaction_time | last compaction time |

---

## Time Travel Syntax

```sql
-- Query data at a historical point in time
SELECT * FROM tname TIMESTAMP AS OF '<timestamp>';
SELECT * FROM tname TIMESTAMP AS OF CURRENT_TIMESTAMP - INTERVAL 12 HOURS;

-- View version history
DESC HISTORY tname;
-- Returns: version, time, total_rows, total_bytes, user, operation, job_id, stats

-- Restore to a historical version (the target time must be later than the table's creation time)
RESTORE TABLE tname TO TIMESTAMP AS OF '<timestamp>';

-- Recover a dropped table
UNDROP TABLE tname;
```

**Note**: the target time of `RESTORE TABLE` cannot be earlier than the table's creation time; otherwise it fails with:
`InvalidArgument: toTimestamp is smaller than timestamp of fromTimestamp`

---

## How It Works

1. Lifecycle reclamation keys off `last_modified_time` (DDL/DML operations update it)
2. A background process polls every **12 hours**; expired data is usually reclaimed within **24 hours**
3. Expired data is not deleted immediately and stays queryable until the background process runs
4. Reclaimed data still honors `data_retention_days` and can be queried via Time Travel (see the sketch below)
5. Default behavior: only the data is cleared and the **table structure is kept**; the table is dropped only when `data_lifecycle_delete_meta='true'`