@clickzetta/cz-cli-darwin-arm64 0.3.20 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cz-cli CHANGED
Binary file
@@ -1,3 +1,3 @@
1
- {"case_id":"002","type":"should_call","user_input":"怎么查看当前工作空间有哪些用户?","expected_skill":"clickzetta-access-control","expected_output_contains":["SHOW USERS"]}
1
+ {"case_id":"002","type":"should_call","user_input":"怎么管理工作空间的用户?新用户加入后需要授予什么角色?","expected_skill":"clickzetta-access-control","expected_output_contains":["角色","授予"]}
2
2
  {"case_id":"003","type":"should_call","user_input":"当前有哪些系统预置角色?各自有什么权限?","expected_skill":"clickzetta-access-control","expected_output_contains":["workspace_admin","workspace_dev"]}
3
3
  {"case_id":"004","type":"should_call","user_input":"怎么给某个用户授予 public schema 下所有表的只读权限?","expected_skill":"clickzetta-access-control","expected_output_contains":["GRANT","SELECT"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么创建离线同步任务把 MySQL 表定期同步到 Lakehouse?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["离线同步","Cron","调度"]}
2
+ {"case_id":"002","type":"should_call","user_input":"多表离线同步支持自动建表吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["自动创建","多表"]}
3
+ {"case_id":"003","type":"should_call","user_input":"离线同步的单表模式和多表模式怎么选?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["单表","多表","task_type"]}
4
+ {"case_id":"004","type":"should_call","user_input":"批量同步支持 Schema Evolution 吗?新增字段会自动适配吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Schema Evolution","多表"]}
5
+ {"case_id":"005","type":"should_call","user_input":"怎么配置离线同步任务的调度周期?用 Cron 表达式吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Cron","调度"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么把 MySQL 整库实时同步到 Lakehouse?","expected_skill":"clickzetta-cdc-sync-pipeline","expected_output_contains":["整库镜像","Binlog","CDC"]}
2
+ {"case_id":"002","type":"should_call","user_input":"分库分表的数据怎么合并同步到 Lakehouse 一张表?","expected_skill":"clickzetta-cdc-sync-pipeline","expected_output_contains":["多表合并","分库分表"]}
3
+ {"case_id":"003","type":"should_call","user_input":"CDC 同步任务 Binlog 位点过期了怎么办?","expected_skill":"clickzetta-cdc-sync-pipeline","expected_output_contains":["Binlog"]}
4
+ {"case_id":"004","type":"should_call","user_input":"多表实时同步怎么配置告警?支持飞书通知吗?","expected_skill":"clickzetta-cdc-sync-pipeline","expected_output_contains":["告警","webhook"]}
5
+ {"case_id":"005","type":"should_call","user_input":"PostgreSQL 整库 CDC 同步需要源端做什么准备?","expected_skill":"clickzetta-cdc-sync-pipeline","expected_output_contains":["WAL","PostgreSQL"]}
@@ -0,0 +1,542 @@
1
+ ---
2
+ name: clickzetta-dba-guide
3
+ description: |
4
+ ClickZetta Lakehouse DBA 日常运维操作手册。集中覆盖 DBA 最常用的 8 类操作:
5
+ 用户与权限管理、计算集群运维、作业监控与诊断、数据恢复与保护、
6
+ 存储优化与维护、网络策略与安全、Schema 与对象管理、成本与资源分析。
7
+ 每个操作提供可直接执行的 SQL,并标注 ClickZetta 特有限制。
8
+ 当用户说"启停集群"、"调整集群规格"、
9
+ "取消作业"、"慢查询"、"恢复误删表"、"UNDROP"、"RESTORE"、
10
+ "小文件合并"、"OPTIMIZE"、"ANALYZE TABLE"、
11
+ "成本分析"、"存储用量"、"DBA 操作"、
12
+ "创建 Schema"、"删除 Schema"、"重命名表"、"对象管理"、"Schema 管理"时触发。
13
+ 本手册仅含用户与权限、网络策略的常用操作速查;更完整的安全治理(如数据脱敏策略设计)请使用 clickzetta-access-control skill。
14
+ Keywords: DBA, operations, monitoring, troubleshooting, cluster management, cost
15
+ ---
16
+
17
+ # ClickZetta Lakehouse DBA 运维手册
18
+
19
+ ---
20
+
21
+ ## 模块 1:用户与权限管理
22
+
23
+ ### 用户管理
24
+
25
+ ```sql
26
+ -- 创建用户(设置默认集群和 Schema)
27
+ CREATE USER alice DEFAULT_VCLUSTER default_ap DEFAULT_SCHEMA my_schema;
28
+
29
+ -- 修改用户默认集群
30
+ ALTER USER alice SET DEFAULT_VCLUSTER = analytics_cluster;
31
+
32
+ -- 删除用户(从当前工作空间移除)
33
+ DROP USER alice;
34
+
35
+ -- 查看所有用户
36
+ SHOW USERS;
37
+ ```
38
+
39
+ ### 角色管理
40
+
41
+ ```sql
42
+ -- 创建自定义角色(仅工作空间级,仅 SQL)
43
+ CREATE ROLE data_engineer;
44
+
45
+ -- 将角色授予用户
46
+ GRANT ROLE data_engineer TO USER alice;
47
+
48
+ -- 撤销角色
49
+ REVOKE ROLE data_engineer FROM USER alice;
50
+
51
+ -- 查看所有角色
52
+ SHOW ROLES;
53
+
54
+ -- 查看用户权限
55
+ SHOW GRANTS TO USER alice;
56
+ SHOW GRANTS TO ROLE data_engineer;
57
+ ```
58
+
59
+ ### 权限授予
60
+
61
+ ```sql
62
+ -- 授予 Schema 下所有表的读权限
63
+ GRANT SELECT ON ALL TABLES IN SCHEMA my_schema TO ROLE data_engineer;
64
+
65
+ -- 授予单张表的读写权限
66
+ GRANT SELECT, INSERT, UPDATE, DELETE ON TABLE my_schema.orders TO USER alice;
67
+
68
+ -- 授予创建表的权限
69
+ GRANT CREATE TABLE ON SCHEMA my_schema TO ROLE data_engineer;
70
+
71
+ -- 授予使用集群的权限
72
+ GRANT USE ON VCLUSTER default_ap TO ROLE data_engineer;
73
+
74
+ -- 授予 information_schema 查询权限
75
+ GRANT ALL ON ALL VIEWS IN SCHEMA information_schema TO ROLE data_engineer;
76
+
77
+ -- 批量授权(Schema 级别)
78
+ GRANT SELECT ON ALL TABLES IN SCHEMA ods TO ROLE analyst;
79
+ GRANT SELECT ON ALL TABLES IN SCHEMA dwd TO ROLE analyst;
80
+ GRANT SELECT ON ALL TABLES IN SCHEMA dws TO ROLE analyst;
81
+ ```
82
+
83
+ ### 权限撤销
84
+
85
+ ```sql
86
+ -- 撤销表权限
87
+ REVOKE SELECT ON TABLE my_schema.orders FROM USER alice;
88
+
89
+ -- 撤销 Schema 创建权限
90
+ REVOKE CREATE TABLE ON SCHEMA my_schema FROM ROLE data_engineer;
91
+ ```
92
+
93
+ ### 动态脱敏(列级安全,邀测功能)
94
+
95
+ ```sql
96
+ -- 创建脱敏函数(基于角色)
97
+ CREATE FUNCTION my_schema.phone_masking(phone STRING)
98
+ RETURNS STRING
99
+ AS CASE
100
+ WHEN ARRAY_CONTAINS(current_roles(), 'data_admin') THEN phone
101
+ ELSE CONCAT(SUBSTR(phone, 1, 3), '****', SUBSTR(phone, 8, 4))
102
+ END;
103
+
104
+ -- 绑定脱敏策略到列
105
+ ALTER TABLE my_schema.users
106
+ CHANGE COLUMN phone SET MASK my_schema.phone_masking;
107
+
108
+ -- 解除脱敏
109
+ ALTER TABLE my_schema.users
110
+ CHANGE COLUMN phone UNSET MASK;
111
+ ```
112
+
113
+ **ClickZetta 特有限制:**
114
+ - 无超级用户,所有操作必须明确授权
115
+ - `instance_admin` 不能直接操作工作空间数据
116
+ - 自定义角色仅工作空间级,不支持实例级自定义角色
117
+
118
+ ---
119
+
120
+ ## 模块 2:计算集群运维
121
+
122
+ ### 启停与状态
123
+
124
+ ```sql
125
+ -- 启动集群
126
+ ALTER VCLUSTER my_cluster RESUME;
127
+ ALTER VCLUSTER IF EXISTS my_cluster RESUME;
128
+
129
+ -- 停止集群
130
+ ALTER VCLUSTER my_cluster SUSPEND;
131
+ ALTER VCLUSTER my_cluster SUSPEND FORCE; -- 强制停止(中断运行中的作业)
132
+
133
+ -- 取消集群所有作业
134
+ ALTER VCLUSTER my_cluster CANCEL ALL JOBS;
135
+
136
+ -- 查看集群状态
137
+ SHOW VCLUSTERS;
138
+ SHOW VCLUSTERS WHERE state = 'RUNNING';
139
+ SHOW VCLUSTERS WHERE state = 'SUSPENDED';
140
+ DESC VCLUSTER my_cluster;
141
+ DESC VCLUSTER EXTENDED my_cluster;
142
+
143
+ -- 切换当前会话使用的集群
144
+ USE VCLUSTER my_cluster;
145
+ ```
146
+
147
+ ### 调整规格
148
+
149
+ ```sql
150
+ -- 通用型(GP):固定规格
151
+ ALTER VCLUSTER my_gp SET VCLUSTER_SIZE = 8;
152
+
153
+ -- 通用型(GP):弹性规格
154
+ ALTER VCLUSTER my_gp SET MIN_VCLUSTER_SIZE = 2 MAX_VCLUSTER_SIZE = 16;
155
+
156
+ -- 分析型(AP):调整副本数
157
+ ALTER VCLUSTER my_ap SET MIN_REPLICAS = 1 MAX_REPLICAS = 4;
158
+
159
+ -- 分析型(AP):调整最大并发
160
+ ALTER VCLUSTER my_ap SET MAX_CONCURRENCY = 16;
161
+
162
+ -- 设置查询超时(秒,-1 表示无限制)
163
+ ALTER VCLUSTER my_cluster SET QUERY_RUNTIME_LIMIT_IN_SECOND = 3600;
164
+ ```
165
+
166
+ ### 自动停止与启动
167
+
168
+ ```sql
169
+ -- 设置 60 秒无作业自动停止,有作业自动启动
170
+ ALTER VCLUSTER my_cluster SET
171
+ AUTO_SUSPEND_IN_SECOND = 60
172
+ AUTO_RESUME = TRUE;
173
+
174
+ -- 关闭自动停止
175
+ ALTER VCLUSTER my_cluster SET AUTO_SUSPEND_IN_SECOND = -1;
176
+ ```
177
+
178
+ ### AP 集群预加载缓存
179
+
180
+ ```sql
181
+ -- 设置预加载表(集群启动时自动缓存最新数据)
182
+ ALTER VCLUSTER my_ap SET PRELOAD_TABLES = "sales.orders,sales.products";
183
+
184
+ -- 查看缓存状态
185
+ SHOW PRELOAD CACHED STATUS;
186
+ SHOW EXTENDED PRELOAD CACHED STATUS;
187
+ ```
188
+
189
+ **ClickZetta 特有限制:**
190
+ - OPTIMIZE(小文件合并)仅 GP 集群支持,AP 集群不生效
191
+ - 分析型集群规格步长为 2^n(1/2/4/8/16...),通用型步长为 1
192
+
193
+ ---
194
+
195
+ ## 模块 3:作业监控与诊断
196
+
197
+ ### 实时作业查看
198
+
199
+ ```sql
200
+ -- 查看最近作业(最多 7 天,10000 条)
201
+ SHOW JOBS LIMIT 20;
202
+ SHOW JOBS IN VCLUSTER default_ap LIMIT 20;
203
+
204
+ -- 取消指定作业
205
+ CANCEL JOB '2026050118342658136171272';
206
+
207
+ -- 查看执行计划
208
+ EXPLAIN SELECT * FROM orders WHERE order_date = '2024-01-01';
209
+ EXPLAIN EXTENDED SELECT * FROM orders WHERE order_date = '2024-01-01';
210
+ ```
211
+
212
+ ### 历史作业分析(information_schema)
213
+
214
+ ```sql
215
+ -- 慢查询 TOP 20(最近 7 天)
216
+ SELECT job_id, job_creator, execution_time, input_bytes, job_text
217
+ FROM information_schema.job_history
218
+ WHERE pt_date >= CAST(CURRENT_DATE - INTERVAL 7 DAY AS DATE)
219
+ AND status = 'SUCCEED'
220
+ ORDER BY execution_time DESC
221
+ LIMIT 20;
222
+
223
+ -- 失败作业(最近 24 小时)
224
+ SELECT job_id, job_creator, error_message, start_time, job_text
225
+ FROM information_schema.job_history
226
+ WHERE pt_date >= CAST(CURRENT_DATE - INTERVAL 1 DAY AS DATE)
227
+ AND status = 'FAILED'
228
+ ORDER BY start_time DESC;
229
+
230
+ -- 按用户统计 CRU 消耗(最近 30 天)
231
+ SELECT job_creator,
232
+ COUNT(*) AS job_count,
233
+ ROUND(SUM(cru), 2) AS total_cru,
234
+ ROUND(AVG(execution_time), 1) AS avg_exec_sec
235
+ FROM information_schema.job_history
236
+ WHERE pt_date >= CAST(CURRENT_DATE - INTERVAL 30 DAY AS DATE)
237
+ AND status = 'SUCCEED'
238
+ GROUP BY job_creator
239
+ ORDER BY total_cru DESC;
240
+
241
+ -- 按集群统计作业分布
242
+ SELECT virtual_cluster,
243
+ COUNT(*) AS job_count,
244
+ ROUND(SUM(cru), 2) AS total_cru
245
+ FROM information_schema.job_history
246
+ WHERE pt_date >= CAST(CURRENT_DATE - INTERVAL 7 DAY AS DATE)
247
+ GROUP BY virtual_cluster
248
+ ORDER BY total_cru DESC;
249
+ ```
250
+
251
+ ---
252
+
253
+ ## 模块 4:数据恢复与保护
254
+
255
+ ### 恢复误删对象
256
+
257
+ ```sql
258
+ -- 查看已删除的表(delete_time 不为 NULL)
259
+ SHOW TABLES HISTORY IN my_schema;
260
+ SHOW TABLES HISTORY LIKE '%orders%';
261
+
262
+ -- 恢复误删的表/动态表/物化视图
263
+ UNDROP TABLE my_schema.orders;
264
+ UNDROP TABLE my_schema.my_dynamic_table;
265
+ UNDROP TABLE my_schema.my_mv;
266
+ -- ⚠️ 恢复外部函数用 UNDROP FUNCTION,不是 UNDROP EXTERNAL FUNCTION
267
+ UNDROP FUNCTION my_schema.my_ext_function;
268
+ ```
269
+
270
+ ### 回滚到历史版本
271
+
272
+ ```sql
273
+ -- 查看表的版本历史
274
+ DESC HISTORY my_schema.orders;
275
+ -- 返回:version, time, total_rows, total_bytes, user, operation, job_id
276
+
277
+ -- 恢复到指定时间点(覆盖当前数据)
278
+ -- ⚠️ 时间戳必须用 CAST() 或完整毫秒格式,不能用简单字符串
279
+ -- ❌ 错误:RESTORE TABLE t TO TIMESTAMP AS OF '2024-01-15';
280
+ -- ✅ 正确写法:
281
+ RESTORE TABLE my_schema.orders TO TIMESTAMP AS OF CAST('2024-01-15 10:00:00' AS TIMESTAMP);
282
+ RESTORE TABLE my_schema.orders TO TIMESTAMP AS OF CURRENT_TIMESTAMP() - INTERVAL '2' HOURS;
283
+ -- 也支持完整毫秒时间戳字符串(从 DESC HISTORY 复制):
284
+ RESTORE TABLE my_schema.orders TO TIMESTAMP AS OF '2024-01-15 10:00:00.123';
285
+
286
+ -- 查询历史数据(不覆盖,仅查看)
287
+ SELECT * FROM my_schema.orders TIMESTAMP AS OF CAST('2024-01-15 10:00:00' AS TIMESTAMP);
288
+ ```
289
+
290
+ ### 设置数据保留周期
291
+
292
+ ```sql
293
+ -- 设置 Time Travel 保留 30 天(范围 0-90)
294
+ ALTER TABLE my_schema.orders SET PROPERTIES ('data_retention_days' = '30');
295
+
296
+ -- 查看当前设置
297
+ SHOW CREATE TABLE my_schema.orders;
298
+ ```
299
+
300
+ **ClickZetta 特有限制:**
301
+ - `RESTORE TABLE` 目标时间点不能早于表创建时间
302
+ - `UNDROP` 需在 `data_retention_days` 保留期内(默认 1 天)
303
+ - 物化视图支持 UNDROP,但不支持 RESTORE
304
+
305
+ ---
306
+
307
+ ## 模块 5:存储优化与维护
308
+
309
+ ### 小文件合并
310
+
311
+ ```sql
312
+ -- 手动触发小文件合并(异步,仅 GP 集群)
313
+ OPTIMIZE my_schema.orders;
314
+
315
+ -- 同步执行(等待完成)
316
+ OPTIMIZE my_schema.orders OPTIONS('cz.sql.optimize.table.async' = 'false');
317
+
318
+ -- 只优化特定分区
319
+ OPTIMIZE my_schema.orders WHERE dt = '2024-01-01';
320
+ OPTIMIZE my_schema.orders WHERE dt = '2024-01-01' AND region = 'cn';
321
+
322
+ -- DML 写入时自动触发合并(GP 集群)
323
+ SET cz.sql.compaction.after.commit = true;
324
+ INSERT INTO my_schema.orders SELECT * FROM staging;
325
+ ```
326
+
327
+ ### 统计信息收集
328
+
329
+ ```sql
330
+ -- 收集表统计信息(优化查询计划)
331
+ ANALYZE TABLE my_schema.orders COMPUTE STATISTICS;
332
+
333
+ -- 仅收集大小,不扫描数据(快速)
334
+ ANALYZE TABLE my_schema.orders COMPUTE STATISTICS NOSCAN;
335
+
336
+ -- 收集指定列的统计信息
337
+ ANALYZE TABLE my_schema.orders COMPUTE STATISTICS FOR COLUMNS order_date, customer_id;
338
+
339
+ -- 收集 Schema 下所有表
340
+ ANALYZE TABLES IN my_schema COMPUTE STATISTICS;
341
+ ```
342
+
343
+ ### 清空数据
344
+
345
+ ```sql
346
+ -- 清空整张表(保留表结构)
347
+ TRUNCATE TABLE my_schema.staging;
348
+
349
+ -- 清空指定分区
350
+ TRUNCATE TABLE my_schema.orders WHERE dt = '2024-01-01';
351
+ ```
352
+
353
+ ### 查看存储用量
354
+
355
+ ```sql
356
+ -- 当前 Schema 下大表排行
357
+ SELECT table_schema, table_name,
358
+ ROUND(bytes / 1024.0 / 1024 / 1024, 2) AS size_gb,
359
+ row_count
360
+ FROM information_schema.tables
361
+ WHERE table_type = 'MANAGED_TABLE'
362
+ ORDER BY bytes DESC
363
+ LIMIT 20;
364
+
365
+ -- Sort Key 推荐(系统自动分析)
366
+ SELECT table_name, col, statement, ratio
367
+ FROM information_schema.sortkey_candidates
368
+ ORDER BY ratio DESC;
369
+ ```
370
+
371
+ ---
372
+
373
+ ## 模块 6:网络策略与安全
374
+
375
+ ### 网络策略管理
376
+
377
+ ```sql
378
+ -- 创建网络策略(白名单)
379
+ CREATE NETWORK POLICY office_policy
380
+ ALLOWED_IP_LIST = ('10.0.0.0/8', '192.168.1.0/24')
381
+ COMMENT '办公网络白名单';
382
+
383
+ -- 创建网络策略(白名单 + 黑名单)
384
+ CREATE NETWORK POLICY strict_policy
385
+ ALLOWED_IP_LIST = ('10.0.0.0/8')
386
+ BLOCKED_IP_LIST = ('10.0.1.100')
387
+ COMMENT '严格访问控制';
388
+
389
+ -- 修改网络策略(覆盖式,必须包含所有 IP)
390
+ ALTER NETWORK POLICY office_policy
391
+ ALLOWED_IP_LIST = ('10.0.0.0/8', '172.16.0.0/12')
392
+ BLOCKED_IP_LIST = ('10.0.1.100');
393
+
394
+ -- 停用/启用策略
395
+ ALTER NETWORK POLICY office_policy INACTIVATE;
396
+ ALTER NETWORK POLICY office_policy ACTIVATE;
397
+
398
+ -- 删除策略
399
+ DROP NETWORK POLICY IF EXISTS office_policy;
400
+
401
+ -- 查看所有策略(注意:单数 POLICY,无 S)
402
+ SHOW NETWORK POLICY;
403
+
404
+ -- 查看策略详情
405
+ DESC NETWORK POLICY office_policy;
406
+ ```
407
+
408
+ **关键规则(Deny 优先):**
409
+ - 无任何策略时:允许所有 IP
410
+ - 有白名单策略时:不在白名单的 IP 被拒绝
411
+ - 黑名单命中时:无论白名单如何,该 IP 被拒绝
412
+ - MySQL 协议:只要有任何生效策略,所有 MySQL 流量均被拦截
413
+ - 策略生效延迟:最多 5 分钟
414
+
415
+ ---
416
+
417
+ ## 模块 7:Schema 与对象管理
418
+
419
+ ### Schema 管理
420
+
421
+ ```sql
422
+ -- 创建 Schema
423
+ CREATE SCHEMA ods;
424
+ CREATE SCHEMA IF NOT EXISTS dwd;
425
+
426
+ -- 修改 Schema 注释
427
+ ALTER SCHEMA ods SET COMMENT 'ODS 原始数据层';
428
+
429
+ -- 重命名 Schema
430
+ ALTER SCHEMA old_name RENAME TO new_name;
431
+
432
+ -- 删除 Schema(级联删除所有对象)
433
+ DROP SCHEMA IF EXISTS temp_schema CASCADE;
434
+
435
+ -- 切换默认 Schema
436
+ USE SCHEMA my_schema;
437
+ ```
438
+
439
+ ### 表管理
440
+
441
+ ```sql
442
+ -- 修改表:加列
443
+ ALTER TABLE my_schema.orders ADD COLUMN (discount DECIMAL(5,2) COMMENT '折扣率');
444
+
445
+ -- 修改表:改列注释
446
+ ALTER TABLE my_schema.orders CHANGE COLUMN order_id SET COMMENT '订单唯一标识';
447
+
448
+ -- 修改表:设置生命周期
449
+ ALTER TABLE my_schema.orders SET PROPERTIES ('data_lifecycle' = '90');
450
+
451
+ -- 修改表:设置 Sort Key
452
+ ALTER TABLE my_schema.orders SET PROPERTIES ('hint.sort.columns' = 'order_date');
453
+
454
+ -- 重命名表
455
+ ALTER TABLE my_schema.orders RENAME TO my_schema.orders_v2;
456
+
457
+ -- 删除表(可 UNDROP 恢复)
458
+ DROP TABLE IF EXISTS my_schema.temp_table;
459
+
460
+ -- 删除动态表
461
+ DROP DYNAMIC TABLE IF EXISTS my_schema.my_dt;
462
+
463
+ -- 删除物化视图
464
+ DROP MATERIALIZED VIEW IF EXISTS my_schema.my_mv;
465
+ ```
466
+
467
+ ### 批量对象查看
468
+
469
+ ```sql
470
+ -- 统计各类型对象数量
471
+ SELECT
472
+ CASE WHEN is_view THEN 'VIEW'
473
+ WHEN is_materialized_view THEN 'MV'
474
+ WHEN is_dynamic THEN 'DT'
475
+ WHEN is_external THEN 'EXTERNAL'
476
+ ELSE 'TABLE' END AS type,
477
+ COUNT(*) AS cnt
478
+ FROM (SHOW TABLES IN my_schema)
479
+ GROUP BY 1;
480
+
481
+ -- 查找大于 30 天未更新的表(潜在废弃表)
482
+ SELECT table_schema, table_name, last_modify_time,
483
+ ROUND(bytes / 1024.0 / 1024 / 1024, 2) AS size_gb
484
+ FROM information_schema.tables
485
+ WHERE table_type = 'MANAGED_TABLE'
486
+ AND last_modify_time < CURRENT_TIMESTAMP - INTERVAL 30 DAY
487
+ ORDER BY bytes DESC;
488
+ ```
489
+
490
+ ---
491
+
492
+ ## 模块 8:成本与资源分析(需 INSTANCE ADMIN)
493
+
494
+ ```sql
495
+ -- 本月各工作空间计算费用
496
+ SELECT workspace_name, sku_name,
497
+ ROUND(SUM(measurements_consumption), 2) AS total_cru,
498
+ ROUND(SUM(amount), 2) AS total_yuan
499
+ FROM SYS.information_schema.instance_usage
500
+ WHERE measurement_start >= DATE_TRUNC('month', CURRENT_DATE)
501
+ GROUP BY workspace_name, sku_name
502
+ ORDER BY total_yuan DESC;
503
+
504
+ -- 本月各工作空间存储费用
505
+ SELECT workspace_name, sku_name,
506
+ ROUND(SUM(measurements_consumption), 4) AS consumption,
507
+ measurements_unit,
508
+ ROUND(SUM(amount), 4) AS total_yuan
509
+ FROM SYS.information_schema.storage_metering
510
+ WHERE measurement_start >= DATE_TRUNC('month', CURRENT_DATE)
511
+ GROUP BY workspace_name, sku_name, measurements_unit
512
+ ORDER BY workspace_name, total_yuan DESC;
513
+
514
+ -- 跨空间存储用量排行
515
+ SELECT workspace_name,
516
+ ROUND(workspace_storage / 1024.0 / 1024 / 1024, 2) AS storage_gb
517
+ FROM SYS.information_schema.workspaces
518
+ WHERE delete_time IS NULL
519
+ ORDER BY workspace_storage DESC;
520
+
521
+ -- 跨空间大表排行(大于 10GB)
522
+ SELECT table_catalog, table_schema, table_name,
523
+ ROUND(bytes / 1024.0 / 1024 / 1024, 2) AS size_gb, row_count
524
+ FROM SYS.information_schema.tables
525
+ WHERE delete_time IS NULL AND bytes > 10 * 1024 * 1024 * 1024
526
+ ORDER BY bytes DESC;
527
+ ```
528
+
529
+ ---
530
+
531
+ ## ClickZetta DBA 特有注意事项
532
+
533
+ | 场景 | 注意事项 |
534
+ |---|---|
535
+ | 权限体系 | 无超级用户;`instance_admin` 不能直接操作工作空间数据 |
536
+ | 自定义角色 | 仅工作空间级,不支持实例级;只能 SQL 创建,不支持 Web 端 |
537
+ | OPTIMIZE | 仅 GP 集群支持;AP 集群不支持小文件合并 |
538
+ | UNDROP | 需在 `data_retention_days` 保留期内(默认 1 天) |
539
+ | RESTORE | 目标时间点不能早于表创建时间 |
540
+ | 网络策略 | Deny 优先;MySQL 协议有任何策略即全部拦截;生效延迟最多 5 分钟 |
541
+ | 动态脱敏 | 邀测功能,需联系技术支持开通 |
542
+ | 集群规格 | AP 集群步长 2^n;GP 集群步长 1;同步型最小 0.25 CRU |
@@ -0,0 +1,3 @@
1
+ {"case_id":"001","type":"should_call","user_input":"DBA 日常怎么启停集群和调整规格?","expected_skill":"clickzetta-dba-guide","expected_output_contains":["SUSPEND","RESUME"]}
2
+ {"case_id":"002","type":"should_call","user_input":"怎么取消正在运行的作业?CANCEL JOB 怎么用?","expected_skill":"clickzetta-dba-guide","expected_output_contains":["CANCEL"]}
3
+ {"case_id":"003","type":"should_call","user_input":"DBA 日常怎么做存储优化?小文件合并和 OPTIMIZE 怎么用?","expected_skill":"clickzetta-dba-guide","expected_output_contains":["OPTIMIZE","小文件"]}
@@ -1,4 +1,4 @@
1
- {"case_id":"001","type":"should_call","user_input":"帮我做数仓分层设计","expected_skill":"clickzetta-dw-modeling","expected_output_contains":["分层"]}
1
+ {"case_id":"001","type":"should_call","user_input":"数仓分层设计的原则是什么?ODS、DWD、DWS 各层的职责?","expected_skill":"clickzetta-dw-modeling","expected_output_contains":["分层"]}
2
2
  {"case_id":"002","type":"should_call","user_input":"ODS/DWD/DWS/ADS 分层怎么设计","expected_skill":"clickzetta-dw-modeling","expected_output_contains":["ODS","DWD","DWS"]}
3
3
  {"case_id":"003","type":"should_call","user_input":"Medallion 架构 Bronze/Silver/Gold 怎么搭建","expected_skill":"clickzetta-dw-modeling","expected_output_contains":["Bronze","Silver","Gold"]}
4
4
  {"case_id":"004","type":"should_call","user_input":"星型模型和雪花模型怎么选","expected_skill":"clickzetta-dw-modeling","expected_output_contains":["星型","雪花"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"帮我创建一个 Dynamic Table,从 public.dim_studio_user_dmin_f 聚合按租户+日期统计用户数,每 60 分钟自动刷新","expected_skill":"clickzetta-dynamic-table","expected_output_contains":["DYNAMIC TABLE","REFRESH"]}
2
+ {"case_id":"002","type":"should_call","user_input":"怎么查看动态表的刷新历史和状态","expected_skill":"clickzetta-dynamic-table","expected_output_contains":["REFRESH HISTORY"]}
3
+ {"case_id":"003","type":"should_call","user_input":"动态表的增量刷新怎么配置?SESSION_CONFIGS 怎么用?","expected_skill":"clickzetta-dynamic-table","expected_output_contains":["SESSION_CONFIGS","增量"]}
4
+ {"case_id":"004","type":"should_call","user_input":"静态分区 DT 和动态分区 DT 有什么区别?该怎么选?","expected_skill":"clickzetta-dynamic-table","expected_output_contains":["静态分区","动态分区"]}
5
+ {"case_id":"005","type":"should_call","user_input":"动态表怎么修改刷新间隔和 vcluster?","expected_skill":"clickzetta-dynamic-table","expected_output_contains":["ALTER","DYNAMIC TABLE"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"从 URL 导入文件到 Lakehouse 的步骤和语法是什么?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["Volume","COPY INTO"]}
2
+ {"case_id":"002","type":"should_call","user_input":"本地文件上传到 Lakehouse 表的流程是什么?需要哪些步骤?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["Volume","COPY INTO"]}
3
+ {"case_id":"003","type":"should_call","user_input":"COPY INTO 导入数据时 append 和 overwrite 写入模式有什么区别?请说明语法","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["append","overwrite"]}
4
+ {"case_id":"004","type":"should_call","user_input":"COPY INTO 导入前怎么推断文件格式?有哪些支持的格式类型?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["CSV","JSON","Parquet"]}
5
+ {"case_id":"005","type":"should_call","user_input":"CSV 有自定义分隔符,COPY INTO 的 OPTIONS 怎么写?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["CSV","OPTIONS"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么从阿里云 OSS 持续自动导入数据到 Lakehouse?","expected_skill":"clickzetta-oss-ingest-pipeline","expected_output_contains":["PIPE","LIST_PURGE"]}
2
+ {"case_id":"002","type":"should_call","user_input":"OSS PIPE 的 LIST_PURGE 和 EVENT_NOTIFICATION 模式有什么区别?","expected_skill":"clickzetta-oss-ingest-pipeline","expected_output_contains":["LIST_PURGE","EVENT_NOTIFICATION"]}
3
+ {"case_id":"003","type":"should_call","user_input":"怎么从 S3 批量导入 Parquet 文件到 Lakehouse?","expected_skill":"clickzetta-oss-ingest-pipeline","expected_output_contains":["Volume","COPY INTO"]}
4
+ {"case_id":"004","type":"should_call","user_input":"OSS 持续导入的前置步骤是什么?需要先创建什么对象?","expected_skill":"clickzetta-oss-ingest-pipeline","expected_output_contains":["CREATE STORAGE CONNECTION","External Volume"]}
5
+ {"case_id":"005","type":"should_call","user_input":"腾讯云 COS 的数据怎么导入 ClickZetta?","expected_skill":"clickzetta-oss-ingest-pipeline","expected_output_contains":["COS","PIPE"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么用 Studio 创建单表实时同步任务?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["实时同步","task_type","28"]}
2
+ {"case_id":"002","type":"should_call","user_input":"Kafka 单个 topic 实时同步到 Lakehouse 表怎么配置?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["Kafka","实时同步"]}
3
+ {"case_id":"003","type":"should_call","user_input":"单表实时同步和多表实时同步有什么区别?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["单表","多表","28","281"]}
4
+ {"case_id":"004","type":"should_call","user_input":"MySQL 单表 CDC 实时同步到 Lakehouse 怎么做?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["MySQL","实时同步","CDC"]}
5
+ {"case_id":"005","type":"should_call","user_input":"实时同步任务需要配置调度策略吗?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["无需配置","持续运行"]}
@@ -0,0 +1,12 @@
1
+ {"case_id":"001","type":"should_call","user_input":"帮我创建一个动态表,每 5 分钟从 raw_events 聚合数据","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["CREATE DYNAMIC TABLE","REFRESH INTERVAL"]}
2
+ {"case_id":"002","type":"should_call","user_input":"怎么创建物化视图?","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["CREATE MATERIALIZED VIEW"]}
3
+ {"case_id":"003","type":"should_call","user_input":"创建一个 Table Stream 捕获 orders 表的变更","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["CREATE TABLE STREAM"]}
4
+ {"case_id":"004","type":"should_call","user_input":"怎么暂停动态表的刷新","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["ALTER","SUSPEND"]}
5
+ {"case_id":"005","type":"should_call","user_input":"怎么查看动态表的刷新历史","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["SHOW DYNAMIC TABLE REFRESH HISTORY"]}
6
+ {"case_id":"006","type":"should_call","user_input":"帮我设计一个 Medallion 架构的数据管道","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["Bronze","Silver","Gold"]}
7
+ {"case_id":"007","type":"should_call","user_input":"从 Kafka 持续导入数据到 Lakehouse 用什么方式","expected_skill":"clickzetta-sql-pipeline-manager","expected_output_contains":["Pipe","read_kafka"]}
8
+ {"case_id":"008","type":"should_not_call","user_input":"帮我写一个 Node.js 后端","forbidden_skill":"clickzetta-sql-pipeline-manager"}
9
+ {"case_id":"009","type":"should_not_call","user_input":"怎么创建用户和授权","forbidden_skill":"clickzetta-sql-pipeline-manager"}
10
+ {"case_id":"010","type":"should_not_call","user_input":"Kubernetes 怎么部署","forbidden_skill":"clickzetta-sql-pipeline-manager"}
11
+ {"case_id":"011","type":"should_not_call","user_input":"怎么连接 Superset","forbidden_skill":"clickzetta-sql-pipeline-manager"}
12
+ {"case_id":"012","type":"should_not_call","user_input":"帮我优化一个慢查询","forbidden_skill":"clickzetta-sql-pipeline-manager"}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么创建 Table Stream 捕获表的增量变更?","expected_skill":"clickzetta-table-stream-pipeline","expected_output_contains":["CREATE TABLE STREAM","change_tracking"]}
2
+ {"case_id":"002","type":"should_call","user_input":"Table Stream 的 STANDARD 和 APPEND_ONLY 模式有什么区别?","expected_skill":"clickzetta-table-stream-pipeline","expected_output_contains":["STANDARD","APPEND_ONLY","INSERT"]}
3
+ {"case_id":"003","type":"should_call","user_input":"Table Stream 消费后 offset 怎么管理?","expected_skill":"clickzetta-table-stream-pipeline","expected_output_contains":["offset"]}
4
+ {"case_id":"004","type":"should_call","user_input":"怎么用 Table Stream + MERGE 实现幂等增量消费?","expected_skill":"clickzetta-table-stream-pipeline","expected_output_contains":["MERGE","幂等"]}
5
+ {"case_id":"005","type":"should_call","user_input":"Table Stream 的 __change_type 元数据字段有哪些值?","expected_skill":"clickzetta-table-stream-pipeline","expected_output_contains":["__change_type","INSERT","UPDATE","DELETE"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"创建分析型集群的语法是什么?需要哪些参数?","expected_skill":"clickzetta-vcluster-manager","expected_output_contains":["CREATE VCLUSTER","ANALYTICS","VCLUSTER_SIZE"]}
2
+ {"case_id":"002","type":"should_call","user_input":"VCluster 的三种集群类型分别适合什么场景?","expected_skill":"clickzetta-vcluster-manager","expected_output_contains":["GENERAL","ANALYTICS","INTEGRATION"]}
3
+ {"case_id":"003","type":"should_call","user_input":"分析型集群的副本扩缩容参数怎么配置?","expected_skill":"clickzetta-vcluster-manager","expected_output_contains":["MIN_REPLICAS","MAX_REPLICAS"]}
4
+ {"case_id":"004","type":"should_call","user_input":"PRELOAD_TABLES 缓存预加载的语法和限制是什么?","expected_skill":"clickzetta-vcluster-manager","expected_output_contains":["PRELOAD_TABLES","ALTER VCLUSTER"]}
5
+ {"case_id":"005","type":"should_call","user_input":"集群的 AUTO_SUSPEND 和 AUTO_RESUME 机制是怎样的?","expected_skill":"clickzetta-vcluster-manager","expected_output_contains":["AUTO_SUSPEND_IN_SECOND","AUTO_RESUME"]}
@@ -0,0 +1,5 @@
1
+ {"case_id":"001","type":"should_call","user_input":"我要挂载一个阿里云 OSS bucket 到 Lakehouse,需要先创建 Storage Connection,endpoint 是 oss-cn-hangzhou-internal.aliyuncs.com,帮我说明完整流程","expected_skill":"clickzetta-volume-manager","expected_output_contains":["connection","volume"]}
2
+ {"case_id":"002","type":"should_call","user_input":"创建一个外部 Volume 叫 eval_oss_volume,挂载 oss://studio-dev-hz/,用 eval_oss_conn 连接,开启目录自动刷新","expected_skill":"clickzetta-volume-manager","expected_output_contains":["eval_oss_volume"]}
3
+ {"case_id":"003","type":"should_call","user_input":"查看 eval_oss_volume 里有哪些文件","expected_skill":"clickzetta-volume-manager","expected_output_contains":["eval_oss_volume"]}
4
+ {"case_id":"004","type":"should_call","user_input":"直接查询 eval_oss_volume 里的 CSV 文件,看前 10 行","expected_skill":"clickzetta-volume-manager","expected_output_contains":["eval_oss_volume"]}
5
+ {"case_id":"005","type":"should_call","user_input":"删除 eval_oss_volume 和 eval_oss_conn,清理测试资源","expected_skill":"clickzetta-volume-manager","expected_output_contains":["eval_oss"]}
@@ -37,8 +37,8 @@ cz-cli task save-content <task> --file <f> Save task script
37
37
  cz-cli task save-config <task> Save task non-cron config, like retry, dependency
38
38
  cz-cli task save-cron <task> Save task schedule config
39
39
  cz-cli task deps <task> Show task dependencies (draft)
40
- cz-cli task online <task> Publish a task
41
- cz-cli task offline <task> Take task offline (irreversible)
40
+ cz-cli task deploy <task> Publish/deploy a task (alias: online)
41
+ cz-cli task undeploy <task> Undeploy a task, irreversible (alias: offline)
42
42
  cz-cli task execute <task> Execute ad-hoc
43
43
  cz-cli task delete <task> Delete draft/offline task
44
44
  cz-cli task flow dag <task> Get flow DAG
@@ -78,6 +78,7 @@ cz-cli datasource objects <name_or_id> <catalog>
78
78
  List objects (tables/topics/collections) in a catalog
79
79
  cz-cli datasource describe <name_or_id> <catalog> <object>
80
80
  Show object metadata (columns, types)
81
+ cz-cli datasource test <name_or_id> Test data source connectivity
81
82
  ```
82
83
 
83
84
  ## Output Formats
@@ -92,9 +93,9 @@ cz-cli datasource describe <name_or_id> <catalog> <object>
92
93
  1. **SQL is async by default**. Use `--sync` for SELECT when you need data immediately.
93
94
  2. **Write operations require `--write` flag** (INSERT/UPDATE/DELETE/CREATE/DROP).
94
95
  3. **Always pass `--type` when creating tasks** (SQL/PYTHON/SHELL/SPARK/FLOW).
95
- 4. **Flow tasks use `task flow *` commands exclusively** — never use `task save-content` or `task online` on flow nodes.
96
+ 4. **Flow tasks use `task flow *` commands exclusively** — never use `task save-content` or `task deploy` on flow nodes.
96
97
  5. **Paginated results**: `list` commands return page 1 only. Check `ai_message` in response for next-page hints.
97
- 6. **State-changing operations** (online/offline/execute/delete/refill): confirm intent with user first.
98
+ 6. **State-changing operations** (deploy/undeploy/execute/delete/refill): confirm intent with user first.
98
99
  7. **Multi-environment**: use `--profile <name>` to target a specific environment.
99
100
  8. **On `NO_PROFILE` error**: guide user to run `cz-cli setup`.
100
101
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@clickzetta/cz-cli-darwin-arm64",
3
- "version": "0.3.20",
3
+ "version": "0.3.21",
4
4
  "description": "cz-cli binary for macOS ARM64 (Apple Silicon)",
5
5
  "os": ["darwin"],
6
6
  "cpu": ["arm64"],
@@ -1,220 +0,0 @@
1
- ---
2
- name: clickzetta-data-ingest-pipeline
3
- description: |
4
- ClickZetta Lakehouse 数据导入总览与路由。根据用户的数据源类型、实时性要求、数据量等条件,
5
- 推荐最合适的数据导入方式,并引导到对应的专项 Skill 或直接执行简单导入操作。
6
- 当用户说"导入数据到 Lakehouse"、"数据入仓"、"数据入湖"、"怎么把数据导进来"、
7
- "数据采集"、"数据加载"、"ingest data"、"load data"、"数据导入方案选择"时触发。
8
- Keywords: data ingestion, import, routing, pipeline selection, data source
9
- ---
10
-
11
- # Lakehouse 数据导入总览与路由
12
-
13
- 根据用户的数据源、实时性需求、数据规模等条件,推荐最合适的数据导入方式,
14
- 并路由到对应的专项 Pipeline Skill 或直接执行简单导入操作。
15
-
16
- ## 适用场景
17
-
18
- - 用户想把数据导入 ClickZetta Lakehouse,但不确定用哪种方式
19
- - 用户描述了数据源(Kafka、MySQL、OSS、文件等),需要推荐导入方案
20
- - 用户需要了解各种导入方式的适用场景和差异
21
- - 关键词:数据导入、数据入仓、数据入湖、数据采集、数据加载、pipeline 选择
22
-
23
- ## 前置依赖
24
-
25
- - ClickZetta Lakehouse 账户,具备创建工作空间、Schema、表、PIPE、任务等权限
26
- - **执行环境(满足其一即可,优先使用 cz-cli)**:
27
- - **cz-cli 路径**:已安装 cz-cli(`pip install cz-cli`),并完成 `cz-cli configure` 配置
28
- - **MCP 路径**:clickzetta-studio-mcp 或 clickzetta-mcp-server 工具可用(`LH_execute_query`、`create_task`、`save_integration_task` 等)
29
-
30
- ## 环境探测(执行前必读)
31
-
32
- 在开始任何操作前,先判断当前执行环境:
33
-
34
- **第一步:检测 cz-cli 是否可用**
35
- ```bash
36
- cz-cli --version
37
- ```
38
- - 若命令存在 → **走 cz-cli 路径**(见本文档末尾"cz-cli 替代路径"章节,以及各专项 Skill 的 cz-cli 替代路径)
39
- - 若命令不存在 → 继续检测 MCP
40
-
41
- **第二步:检测 MCP 是否可用(仅在 cz-cli 不可用时)**
42
-
43
- 尝试调用 `LH_execute_query` 工具执行一条简单 SQL(如 `SELECT 1`)。
44
- - 若工具存在于 tool list → **走 MCP 路径**(本文档默认路径)
45
- - 若工具不存在 → 停止执行,提示用户:
46
- > "当前环境既无 cz-cli 也无 MCP 工具,请安装其中之一后重试。
47
- > cz-cli 安装:`pip install cz-cli`,然后运行 `cz-cli configure`
48
- > MCP 安装:参考 clickzetta-studio-mcp 或 clickzetta-mcp-server 配置文档"
49
-
50
- ## 数据导入方式决策树
51
-
52
- ### 步骤 1:确认数据源类型和需求
53
-
54
- 向用户收集以下信息:
55
-
56
- 1. **数据源类型**:Kafka / 对象存储(OSS/S3/COS) / 关系型数据库(MySQL/PostgreSQL/SQL Server) / 本地文件 / URL/Web 文件 / Java SDK / ZettaPark
57
- 2. **实时性要求**:实时(秒级延迟)/ 准实时(分钟级)/ 离线批量(小时/天级)
58
- 3. **同步范围**:单表 / 多表 / 整库
59
- 4. **是否需要持续同步**:一次性导入 / 持续增量同步
60
- 5. **是否需要 CDC(变更数据捕获)**:是 / 否
61
-
62
- ### 步骤 2:根据决策矩阵推荐方案
63
-
64
- | 数据源 | 实时性 | 同步范围 | 推荐方式 | 对应 Skill |
65
- |--------|--------|---------|---------|-----------|
66
- | Kafka | 实时/准实时 | 单 topic | Kafka PIPE 持续导入(SQL) | `clickzetta-kafka-ingest-pipeline` |
67
- | Kafka | 实时 | 多 topic | Studio 实时同步 | `clickzetta-realtime-sync-pipeline` |
68
- | 对象存储 (OSS/S3/COS) | 准实时/批量 | 文件持续到达 | PIPE 持续导入 | `clickzetta-oss-ingest-pipeline` |
69
- | 对象存储 | 一次性 | 批量文件 | COPY INTO 命令 | `clickzetta-file-import-pipeline`(COPY INTO 部分) |
70
- | MySQL/PostgreSQL/SQL Server | 实时 CDC | 单表 | Studio 实时同步 | `clickzetta-realtime-sync-pipeline` |
71
- | MySQL/PostgreSQL/SQL Server | 实时 CDC | 多表/整库 | Studio 多表实时同步 | `clickzetta-cdc-sync-pipeline` |
72
- | MySQL/PostgreSQL/SQL Server | 离线批量 | 单表 | Studio 离线同步 | `clickzetta-batch-sync-pipeline` |
73
- | MySQL/PostgreSQL/SQL Server | 离线批量 | 多表 | Studio 多表离线同步 | `clickzetta-batch-sync-pipeline` |
74
- | 本地文件 / URL | 一次性 | 单文件/多文件 | URL 下载 + COPY INTO | `clickzetta-file-import-pipeline` |
75
- | 流式增量计算 | 准实时 | 表变更驱动 | Dynamic Table + Stream | `clickzetta-incremental-compute-pipeline` |
76
- | Java 应用 | 实时/批量 | 程序写入 | Java SDK | (见下方 SDK 导入指引) |
77
- | Python/ZettaPark | 批量 | DataFrame | ZettaPark save_as_table | (见下方 SDK 导入指引) |
78
-
79
- ### 步骤 3:路由到专项 Skill 或直接执行
80
-
81
- 根据推荐方案,执行以下路由逻辑:
82
-
83
- **有对应专项 Skill 的场景** → 告知用户推荐方案,引导使用对应 Skill:
84
- - `clickzetta-kafka-ingest-pipeline`:Kafka PIPE 管道搭建
85
- - `clickzetta-oss-ingest-pipeline`:对象存储 PIPE 管道搭建
86
- - `clickzetta-batch-sync-pipeline`:Studio 离线同步任务
87
- - `clickzetta-realtime-sync-pipeline`:Studio 实时同步任务
88
- - `clickzetta-cdc-sync-pipeline`:Studio 多表实时同步(CDC)
89
- - `clickzetta-incremental-compute-pipeline`:Dynamic Table + Stream 增量计算管道
90
- - `clickzetta-file-import-pipeline`:URL/文件下载导入
91
- - `clickzetta-table-stream-pipeline`:Table Stream 变更数据捕获
92
-
93
- **无专项 Skill 的简单场景** → 直接执行:
94
-
95
- #### SQL INSERT 导入(小数据量)
96
- ```sql
97
- -- 使用 LH_execute_query 执行
98
- INSERT INTO schema_name.table_name (col1, col2, col3)
99
- VALUES ('val1', 'val2', 'val3');
100
- ```
101
-
102
- #### COPY INTO 快速导入(从 Volume)
103
- ```sql
104
- -- 1. 确认 Volume 中有文件
105
- SHOW VOLUME DIRECTORY volume_name;
106
-
107
- -- 2. 执行 COPY INTO
108
- COPY INTO schema_name.table_name
109
- FROM VOLUME volume_name
110
- USING CSV
111
- OPTIONS('header' = 'true');
112
- ```
113
-
114
- #### Java SDK 导入指引
115
- 提供 Java SDK 的关键配置信息:
116
- - Maven 依赖坐标
117
- - 连接配置(endpoint、workspace、schema、vcluster)
118
- - 批量写入 API:`BulkloadWriter`
119
- - 实时写入 API:`RealtimeWriter`
120
- - 建议用户参考官方文档:`comprehensive_guide_to_ingesting_javasdk_buckload_realtime`
121
-
122
- #### ZettaPark (Python) 导入指引
123
- - `INSERT` 方式:`session.sql("INSERT INTO ...")`
124
- - `save_as_table` 方式:`df.write.save_as_table("table_name")`
125
- - 建议用户参考官方文档:`comprehensive_guide_to_ingesting_zettapark_save_as_table`
126
-
127
- ## 数据入仓 vs 数据入湖
128
-
129
- | 维度 | 数据入仓 | 数据入湖 |
130
- |------|---------|---------|
131
- | 目标 | Lakehouse 托管表 | 用户 Volume(对象存储) |
132
- | 格式 | 自动转为内部列式格式 | 保持原始文件格式 |
133
- | 查询性能 | 高(列式存储 + 索引) | 较低(需扫描原始文件) |
134
- | 适用场景 | 分析查询、BI 报表、数据仓库 | 数据暂存、原始数据归档、跨系统共享 |
135
- | 常用方式 | Studio 同步、PIPE、COPY INTO、SDK | PUT 文件、Python 脚本上传 |
136
-
137
- ## 示例
138
-
139
- ### 示例 1:用户不确定导入方式
140
-
141
- 用户说:"我有一个 MySQL 数据库,想把里面的订单表实时同步到 Lakehouse"
142
-
143
- 路由逻辑:
144
- 1. 数据源:MySQL(关系型数据库)
145
- 2. 实时性:实时
146
- 3. 同步范围:单表
147
- 4. 需要 CDC:是(实时同步意味着需要捕获变更)
148
- → 推荐:Studio 实时同步
149
- → 路由到 `clickzetta-realtime-sync-pipeline` Skill
150
-
151
- ### 示例 2:多种数据源混合场景
152
-
153
- 用户说:"我们有 Kafka 的用户行为日志,还有 MySQL 的业务数据,都要导入 Lakehouse"
154
-
155
- 路由逻辑:
156
- 1. Kafka 用户行为日志 → `clickzetta-kafka-ingest-pipeline`(PIPE 持续导入)
157
- 2. MySQL 业务数据 → 确认实时性需求:
158
- - 实时 → `clickzetta-realtime-sync-pipeline` 或 `clickzetta-cdc-sync-pipeline`
159
- - 离线 → `clickzetta-batch-sync-pipeline`
160
- → 分别引导到对应 Skill
161
-
162
- ### 示例 3:简单的一次性文件导入
163
-
164
- 用户说:"我有一个 CSV 文件要导入"
165
-
166
- 路由逻辑:
167
- 1. 数据源:本地文件
168
- 2. 一次性导入
169
- → 路由到 `clickzetta-file-import-pipeline` Skill(支持文件上传 + COPY INTO)
170
-
171
- ## 错误处理
172
-
173
- | 场景 | 处理方式 |
174
- |------|---------|
175
- | 用户无法确定数据源类型 | 询问数据当前存储位置(哪个系统/服务),帮助判断 |
176
- | 用户需求跨多种导入方式 | 拆分为多个独立的导入任务,分别路由到对应 Skill |
177
- | 推荐的 Skill 尚未创建 | 提供该导入方式的基本步骤和关键 SQL/API,引导用户参考官方文档 |
178
- | 用户的云环境不支持某种连接 | 使用 `LH_show_object_list`(object_type=CONNECTIONS)检查可用连接类型,推荐替代方案 |
179
- | 数据量极大(TB 级) | 建议分批导入,优先使用 PIPE 或 Studio 同步任务(支持断点续传) |
180
-
181
- ## 注意事项
182
-
183
- - 本 Skill 是路由入口,不直接执行复杂的 pipeline 搭建,而是引导到专项 Skill
184
- - 对于简单场景(SQL INSERT、单次 COPY INTO),可以直接在本 Skill 中完成
185
- - 推荐方案时需考虑用户的云环境(阿里云/腾讯云/AWS),不同环境支持的连接类型可能不同
186
- - 使用 `LH_show_object_list`(object_type=VCLUSTERS)确认可用的虚拟集群,同步任务需要 SYNC 类型的 VCluster
187
- - 数据入仓是最常见的场景,数据入湖主要用于原始数据暂存或跨系统共享
188
-
189
- ---
190
-
191
- ## cz-cli 替代路径
192
-
193
- > 仅在 cz-cli 可用且 MCP 不可用时使用本节。
194
- > 本 Skill 是路由入口,cz-cli 路径的核心逻辑在各专项 Skill 的"cz-cli 替代路径"章节中。
195
-
196
- ### 路由说明
197
-
198
- 当 MCP 不可用时,各专项 Skill 均已提供 cz-cli 替代路径:
199
-
200
- | 数据源 | 推荐方式 | 对应 Skill 的 cz-cli 路径 |
201
- |--------|---------|--------------------------|
202
- | Kafka | PIPE 持续导入 | `clickzetta-kafka-ingest-pipeline` → cz-cli 替代路径 |
203
- | 对象存储 (OSS/S3/COS) | PIPE 持续导入 | `clickzetta-oss-ingest-pipeline` → cz-cli 替代路径 |
204
- | MySQL/PostgreSQL/SQL Server(实时单表) | Studio 实时同步 | `clickzetta-realtime-sync-pipeline` → cz-cli 替代路径 |
205
- | MySQL/PostgreSQL/SQL Server(实时多表/整库) | Studio 多表实时同步 | `clickzetta-cdc-sync-pipeline` → cz-cli 替代路径 |
206
- | MySQL/PostgreSQL/SQL Server(离线批量) | Studio 离线同步 | `clickzetta-batch-sync-pipeline` → cz-cli 替代路径 |
207
-
208
- ### 简单场景直接执行(cz-cli 版)
209
-
210
- 对于无需专项 Skill 的简单场景,可直接用 cz-cli agent 完成:
211
-
212
- ```bash
213
- # SQL INSERT 导入(小数据量)
214
- cz-cli agent run "向表 <schema_name>.<table_name> 插入数据:<col1>=<val1>, <col2>=<val2>" \
215
- --format a2a --dangerously-skip-permissions
216
-
217
- # COPY INTO 快速导入(从 Volume)
218
- cz-cli agent run "从 Volume <volume_name> 以 CSV 格式(有 header)将数据导入表 <schema_name>.<table_name>" \
219
- --format a2a --dangerously-skip-permissions
220
- ```
@@ -1,5 +0,0 @@
1
- {"case_id":"001","type":"should_call","user_input":"我想把数据导入 Lakehouse,但不确定用哪种方式","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["数据源","实时","批量"]}
2
- {"case_id":"002","type":"should_call","user_input":"数据入仓有哪些方案?怎么选择?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["Kafka","对象存储","MySQL"]}
3
- {"case_id":"003","type":"should_call","user_input":"我有 MySQL 和 Kafka 两个数据源要导入 Lakehouse,分别用什么方式?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["CDC"]}
4
- {"case_id":"004","type":"should_call","user_input":"数据导入方案怎么选?实时和离线有什么区别?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["实时","离线","延迟"]}
5
- {"case_id":"005","type":"should_call","user_input":"ingest data into ClickZetta Lakehouse, what options do I have?","expected_skill":"clickzetta-data-ingest-pipeline","expected_output_contains":["Kafka","OSS","SDK"]}