@clickzetta/cz-cli-darwin-arm64 0.3.17 → 0.3.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +160 -0
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +155 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +386 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +548 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +220 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +259 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +112 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +257 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +156 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +751 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +199 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +537 -0
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +276 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +379 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
- package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
- package/bin/skills/cz-cli/SKILL.md +1 -1
- package/bin/skills/cz-cli-inner/SKILL.md +8 -0
- package/package.json +1 -1
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/SKILL.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/dt-declaration-strategy.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/incremental-config-reference.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/refresh-history-guide.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/sql-limitations.md +0 -0
- /package/bin/skills/{dynamic-table-alter → clickzetta-dynamic-table/dynamic-table-alter}/SKILL.md +0 -0
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么用 READ_KAFKA Pipe 把 Kafka topic 数据持续导入 Lakehouse?","expected_skill":"clickzetta-kafka-ingest-pipeline","expected_output_contains":["READ_KAFKA","PIPE"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"Kafka 消息是多层嵌套 JSON,怎么解析导入?","expected_skill":"clickzetta-kafka-ingest-pipeline","expected_output_contains":["JSON","READ_KAFKA"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"Kafka Pipe 延迟很高怎么排查?怎么监控 offsetLag?","expected_skill":"clickzetta-kafka-ingest-pipeline","expected_output_contains":["pipe_latency","offsetLag"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"Kafka 需要 SASL 认证,Pipe 怎么配置?","expected_skill":"clickzetta-kafka-ingest-pipeline","expected_output_contains":["SASL"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"Kafka 外部表和 READ_KAFKA Pipe 有什么区别?该选哪个?","expected_skill":"clickzetta-kafka-ingest-pipeline","expected_output_contains":["外部表","READ_KAFKA","Table Stream"]}
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
# Kafka Pipe SQL 语法参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/pipe-kafka 和 https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1
|
|
4
|
+
|
|
5
|
+
> **⚠️ ClickZetta READ_KAFKA 使用位置参数(positional parameters)**
|
|
6
|
+
> - ❌ 不支持 `=>` 命名参数语法(如 `KAFKA_BROKER => 'host:port'`)
|
|
7
|
+
> - ❌ 不支持 `TABLE(READ_KAFKA(...))` 包装
|
|
8
|
+
> - ✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))`
|
|
9
|
+
|
|
10
|
+
## CREATE PIPE(READ_KAFKA 方式)
|
|
11
|
+
|
|
12
|
+
```sql
|
|
13
|
+
CREATE [ OR REPLACE ] PIPE <pipe_name>
|
|
14
|
+
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
15
|
+
[ BATCH_INTERVAL_IN_SECONDS = '<seconds>' ]
|
|
16
|
+
[ BATCH_SIZE_PER_KAFKA_PARTITION = '<count>' ]
|
|
17
|
+
[ MAX_SKIP_BATCH_COUNT_ON_ERROR = '<count>' ]
|
|
18
|
+
[ INITIAL_DELAY_IN_SECONDS = '<seconds>' ]
|
|
19
|
+
[ RESET_KAFKA_GROUP_OFFSETS = '<offset_value>' ]
|
|
20
|
+
[ COPY_JOB_HINT = '<json>' ]
|
|
21
|
+
AS
|
|
22
|
+
COPY INTO <target_table> FROM (
|
|
23
|
+
SELECT <expr> [, ...]
|
|
24
|
+
FROM read_kafka(
|
|
25
|
+
'<bootstrap_servers>', -- 位置 1:Kafka 集群地址(必填)
|
|
26
|
+
'<topic_name>', -- 位置 2:Topic 名称(必填)
|
|
27
|
+
'', -- 位置 3:Topic pattern(保留,填空字符串)
|
|
28
|
+
'<group_id>', -- 位置 4:消费者组 ID(必填)
|
|
29
|
+
'', -- 位置 5:starting_offsets(Pipe 中留空)
|
|
30
|
+
'', -- 位置 6:ending_offsets(Pipe 中留空)
|
|
31
|
+
'', -- 位置 7:starting_timestamp(Pipe 中留空)
|
|
32
|
+
'', -- 位置 8:ending_timestamp(Pipe 中留空)
|
|
33
|
+
'raw', -- 位置 9:key 格式(目前只支持 raw)
|
|
34
|
+
'raw', -- 位置 10:value 格式(目前只支持 raw)
|
|
35
|
+
0, -- 位置 11:max_errors
|
|
36
|
+
MAP(<kafka_config>) -- 位置 12:Kafka 配置参数
|
|
37
|
+
)
|
|
38
|
+
);
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Pipe 参数说明
|
|
42
|
+
|
|
43
|
+
| 参数 | 必填 | 默认值 | 说明 |
|
|
44
|
+
|------|------|--------|------|
|
|
45
|
+
| `VIRTUAL_CLUSTER` | 是 | — | 执行 Pipe 任务的计算集群 |
|
|
46
|
+
| `BATCH_INTERVAL_IN_SECONDS` | 否 | 60 | 批处理间隔(秒),即数据新鲜度 |
|
|
47
|
+
| `BATCH_SIZE_PER_KAFKA_PARTITION` | 否 | 500000 | 每个 Kafka 分区每批最大消息数 |
|
|
48
|
+
| `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 否 | 30 | 出错时跳过批次的最大重试次数 |
|
|
49
|
+
| `INITIAL_DELAY_IN_SECONDS` | 否 | 0 | 首个作业调度延迟 |
|
|
50
|
+
| `RESET_KAFKA_GROUP_OFFSETS` | 否 | — | 启动时消费位点(仅创建时生效) |
|
|
51
|
+
| `COPY_JOB_HINT` | 否 | — | JSON 格式的作业参数 |
|
|
52
|
+
|
|
53
|
+
### RESET_KAFKA_GROUP_OFFSETS 可选值
|
|
54
|
+
|
|
55
|
+
| 值 | 说明 |
|
|
56
|
+
|----|------|
|
|
57
|
+
| `'none'` | 无操作,使用 Kafka `auto.offset.reset`(默认 latest) |
|
|
58
|
+
| `'valid'` | 检查当前位点是否过期,将过期分区重置到 earliest |
|
|
59
|
+
| `'earliest'` | 重置到最早位点 |
|
|
60
|
+
| `'latest'` | 重置到最新位点 |
|
|
61
|
+
| `'<毫秒时间戳>'` | 重置到指定时间戳对应位点(如 `'1737789688000'`) |
|
|
62
|
+
|
|
63
|
+
### READ_KAFKA 参数(在 Pipe 中 vs 独立使用)
|
|
64
|
+
|
|
65
|
+
| 特性 | 独立使用 read_kafka | 在 Pipe 中使用 |
|
|
66
|
+
|------|-------------------|---------------|
|
|
67
|
+
| 消费者组 | 临时,执行完即销毁 | 持久,保持消费位置 |
|
|
68
|
+
| 位点管理 | 在 MAP 中设置 `kafka.auto.offset.reset` | Pipe 自动管理,read_kafka 的第 5–8 个参数(starting/ending offsets 与 timestamp)**必须留空** |
|
|
69
|
+
| 执行方式 | 一次性查询 | 持续调度执行 |
|
|
70
|
+
| 默认起始位置 | latest(可在 MAP 中改为 earliest) | latest(由 RESET_KAFKA_GROUP_OFFSETS 控制) |
|
|
71
|
+
|
|
72
|
+
### MAP 配置参数
|
|
73
|
+
|
|
74
|
+
| 参数 | 说明 |
|
|
75
|
+
|------|------|
|
|
76
|
+
| `kafka.security.protocol` | 安全协议:`PLAINTEXT` 或 `SASL_PLAINTEXT` |
|
|
77
|
+
| `kafka.sasl.mechanism` | SASL 机制:`PLAIN` |
|
|
78
|
+
| `kafka.sasl.username` | SASL 用户名 |
|
|
79
|
+
| `kafka.sasl.password` | SASL 密码 |
|
|
80
|
+
| `kafka.auto.offset.reset` | 独立探查时的起始位点(`earliest` / `latest`) |
|
|
81
|
+
| `cz.kafka.fetch.retry.enable` | 启用 fetch 重试(`true`/`false`) |
|
|
82
|
+
| `cz.kafka.fetch.retry.times` | 重试次数 |
|
|
83
|
+
| `cz.kafka.fetch.retry.intervalMs` | 重试间隔(毫秒) |
|
|
84
|
+
|
|
85
|
+
### JSON 字段提取语法
|
|
86
|
+
|
|
87
|
+
```sql
|
|
88
|
+
-- key 和 value 都是 binary 类型,需要先转换
|
|
89
|
+
value::string -- 转为字符串
|
|
90
|
+
parse_json(value::string) -- 解析为 JSON 对象
|
|
91
|
+
parse_json(value::string)['field']::TYPE -- 提取顶层字段
|
|
92
|
+
parse_json(value::string)['nested']['key']::TYPE -- 提取嵌套字段
|
|
93
|
+
|
|
94
|
+
-- 推荐模式:在子查询中先 parse_json,外层直接用 j['field']
|
|
95
|
+
SELECT j['order_id']::STRING, j['amount']::DECIMAL(10,2)
|
|
96
|
+
FROM (
|
|
97
|
+
SELECT parse_json(value::string) AS j
|
|
98
|
+
FROM read_kafka(...)
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 完整示例
|
|
103
|
+
|
|
104
|
+
```sql
|
|
105
|
+
-- 无认证 Kafka Pipe
|
|
106
|
+
CREATE PIPE kafka_orders_pipe
|
|
107
|
+
VIRTUAL_CLUSTER = 'default'
|
|
108
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
109
|
+
AS
|
|
110
|
+
COPY INTO ods.orders FROM (
|
|
111
|
+
SELECT
|
|
112
|
+
j['order_id']::STRING AS order_id,
|
|
113
|
+
j['user_id']::STRING AS user_id,
|
|
114
|
+
j['amount']::DECIMAL(10,2) AS amount,
|
|
115
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
116
|
+
FROM (
|
|
117
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
118
|
+
FROM read_kafka(
|
|
119
|
+
'kafka.example.com:9092',
|
|
120
|
+
'orders',
|
|
121
|
+
'',
|
|
122
|
+
'lakehouse_orders',
|
|
123
|
+
'', '', '', '',
|
|
124
|
+
'raw', 'raw', 0,
|
|
125
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
126
|
+
)
|
|
127
|
+
)
|
|
128
|
+
);
|
|
129
|
+
|
|
130
|
+
-- SASL 认证 + 指定时间点消费
|
|
131
|
+
CREATE PIPE kafka_secure_pipe
|
|
132
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
133
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
134
|
+
RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
|
|
135
|
+
AS
|
|
136
|
+
COPY INTO ods.secure_events FROM (
|
|
137
|
+
SELECT
|
|
138
|
+
j['id']::STRING AS event_id,
|
|
139
|
+
j['payload']::STRING AS payload,
|
|
140
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
141
|
+
FROM (
|
|
142
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
143
|
+
FROM read_kafka(
|
|
144
|
+
'kafka.example.com:9092',
|
|
145
|
+
'secure_events',
|
|
146
|
+
'',
|
|
147
|
+
'cz_secure',
|
|
148
|
+
'', '', '', '',
|
|
149
|
+
'raw', 'raw', 0,
|
|
150
|
+
MAP(
|
|
151
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
152
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
153
|
+
'kafka.sasl.username', 'my_user',
|
|
154
|
+
'kafka.sasl.password', 'my_password'
|
|
155
|
+
)
|
|
156
|
+
)
|
|
157
|
+
)
|
|
158
|
+
);
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## 独立探查(验证连接和数据格式)
|
|
164
|
+
|
|
165
|
+
```sql
|
|
166
|
+
-- 无认证
|
|
167
|
+
SELECT value::string
|
|
168
|
+
FROM read_kafka(
|
|
169
|
+
'kafka.example.com:9092',
|
|
170
|
+
'orders',
|
|
171
|
+
'',
|
|
172
|
+
'test_explore',
|
|
173
|
+
'', '', '', '',
|
|
174
|
+
'raw', 'raw', 0,
|
|
175
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
176
|
+
)
|
|
177
|
+
LIMIT 10;
|
|
178
|
+
|
|
179
|
+
-- SASL 认证
|
|
180
|
+
SELECT value::string
|
|
181
|
+
FROM read_kafka(
|
|
182
|
+
'kafka.example.com:9092',
|
|
183
|
+
'orders',
|
|
184
|
+
'',
|
|
185
|
+
'test_explore',
|
|
186
|
+
'', '', '', '',
|
|
187
|
+
'raw', 'raw', 0,
|
|
188
|
+
MAP(
|
|
189
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
190
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
191
|
+
'kafka.sasl.username', 'my_user',
|
|
192
|
+
'kafka.sasl.password', 'my_password',
|
|
193
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
194
|
+
)
|
|
195
|
+
)
|
|
196
|
+
LIMIT 10;
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## CREATE PIPE(Kafka 外部表 + Table Stream 方式)
|
|
202
|
+
|
|
203
|
+
### 步骤 1:创建 Kafka Storage Connection
|
|
204
|
+
|
|
205
|
+
```sql
|
|
206
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS <conn_name>
|
|
207
|
+
TYPE KAFKA
|
|
208
|
+
BOOTSTRAP_SERVERS = ['<host1>:<port1>', '<host2>:<port2>']
|
|
209
|
+
SECURITY_PROTOCOL = 'PLAINTEXT';
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### 步骤 2:创建 Kafka 外部表
|
|
213
|
+
|
|
214
|
+
```sql
|
|
215
|
+
-- ⚠️ 必须显式指定列定义(不能省略)
|
|
216
|
+
-- ⚠️ offset 和 timestamp 是保留字,必须用反引号转义
|
|
217
|
+
CREATE EXTERNAL TABLE <ext_table_name> (
|
|
218
|
+
topic STRING,
|
|
219
|
+
partition INT,
|
|
220
|
+
`offset` BIGINT,
|
|
221
|
+
`timestamp` TIMESTAMP,
|
|
222
|
+
timestamp_type STRING,
|
|
223
|
+
headers STRING,
|
|
224
|
+
key BINARY,
|
|
225
|
+
value BINARY
|
|
226
|
+
)
|
|
227
|
+
USING KAFKA
|
|
228
|
+
OPTIONS (
|
|
229
|
+
'group_id' = '<consumer_group>',
|
|
230
|
+
'topics' = '<topic_name>',
|
|
231
|
+
'starting_offset' = '<earliest | latest>'
|
|
232
|
+
)
|
|
233
|
+
CONNECTION <conn_name>;
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
> **注意**:
|
|
237
|
+
> - 列定义是**必须的**,省略会报错 `failed to detect columns`
|
|
238
|
+
> - `offset` 和 `timestamp` 是保留字,需要反引号转义
|
|
239
|
+
> - 删除外部表用 `DROP TABLE`(不是 `DROP EXTERNAL TABLE`)
|
|
240
|
+
|
|
241
|
+
### 步骤 3:创建 Table Stream
|
|
242
|
+
|
|
243
|
+
```sql
|
|
244
|
+
CREATE TABLE STREAM <stream_name>
|
|
245
|
+
ON TABLE <ext_table_name>
|
|
246
|
+
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY');
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### 步骤 4:创建 Pipe
|
|
250
|
+
|
|
251
|
+
```sql
|
|
252
|
+
CREATE PIPE <pipe_name>
|
|
253
|
+
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
254
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
255
|
+
AS
|
|
256
|
+
COPY INTO <target_table>
|
|
257
|
+
SELECT <expr> [, ...]
|
|
258
|
+
FROM <stream_name>;
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## ALTER PIPE
|
|
264
|
+
|
|
265
|
+
```sql
|
|
266
|
+
-- 暂停
|
|
267
|
+
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = true;
|
|
268
|
+
|
|
269
|
+
-- 恢复
|
|
270
|
+
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = false;
|
|
271
|
+
|
|
272
|
+
-- 修改 VCluster
|
|
273
|
+
ALTER PIPE <pipe_name> SET VIRTUAL_CLUSTER = 'new_vc';
|
|
274
|
+
|
|
275
|
+
-- 修改 COPY_JOB_HINT
|
|
276
|
+
ALTER PIPE <pipe_name> SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
> ⚠️ **ALTER PIPE 支持的属性**:
|
|
280
|
+
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
281
|
+
> - ✅ `VIRTUAL_CLUSTER`
|
|
282
|
+
> - ✅ `COPY_JOB_HINT`
|
|
283
|
+
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持,需删除重建)
|
|
284
|
+
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持,需删除重建)
|
|
285
|
+
>
|
|
286
|
+
> 不支持修改 COPY/INSERT 语句逻辑,需删除 Pipe 后重建。
|
|
287
|
+
> 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## 监控
|
|
292
|
+
|
|
293
|
+
```sql
|
|
294
|
+
-- 查看 Pipe 详情(含延迟信息 pipe_latency)
|
|
295
|
+
DESC PIPE EXTENDED <pipe_name>;
|
|
296
|
+
|
|
297
|
+
-- 查看所有 Pipe
|
|
298
|
+
SHOW PIPES;
|
|
299
|
+
|
|
300
|
+
-- 查看加载历史
|
|
301
|
+
SELECT * FROM load_history('<schema>.<table>')
|
|
302
|
+
ORDER BY last_load_time DESC LIMIT 20;
|
|
303
|
+
|
|
304
|
+
-- 通过 query_tag 查看 Pipe 作业
|
|
305
|
+
-- 格式:pipe.<workspace_name>.<schema_name>.<pipe_name>
|
|
306
|
+
SHOW JOBS WHERE query_tag = 'pipe.my_workspace.ods.kafka_orders_pipe';
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## DROP PIPE
|
|
312
|
+
|
|
313
|
+
```sql
|
|
314
|
+
DROP PIPE [ IF EXISTS ] <pipe_name>;
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## 参考文档
|
|
318
|
+
|
|
319
|
+
- [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
|
|
320
|
+
- [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
|
|
321
|
+
- [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
|
|
322
|
+
- [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
|
|
323
|
+
- [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
|
|
324
|
+
- [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-monitoring
|
|
3
|
+
description: |
|
|
4
|
+
监控和分析 ClickZetta Lakehouse 作业运行状态、性能和资源使用情况,
|
|
5
|
+
以及通过 INFORMATION_SCHEMA 查询元数据(表、列、Schema、工作空间等)。
|
|
6
|
+
覆盖 SHOW JOBS 实时查看作业、information_schema.job_history 历史分析、
|
|
7
|
+
慢查询识别、集群负载分析、缓存命中率统计、失败作业排查、
|
|
8
|
+
information_schema.tables/columns/schemas 元数据查询等完整监控与治理工作流。
|
|
9
|
+
当用户说"查看作业"、"作业历史"、"SHOW JOBS"、"慢查询"、"查询性能"、
|
|
10
|
+
"集群负载"、"作业失败"、"查询失败"、"监控"、"job history"、
|
|
11
|
+
"information_schema"、"缓存命中率"、"查询耗时"、"作业状态"、
|
|
12
|
+
"元数据查询"、"查看所有表"、"表大小"、"列信息"、"资产盘点"时触发。
|
|
13
|
+
Keywords: monitoring, job status, performance, resource usage, SHOW JOBS, slow query
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
# ClickZetta 作业监控与分析
|
|
17
|
+
|
|
18
|
+
阅读 [references/show-jobs.md](references/show-jobs.md) 了解 SHOW JOBS 语法。
|
|
19
|
+
阅读 [references/job-history-analysis.md](references/job-history-analysis.md) 了解历史分析查询。
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 实时查看作业(SHOW JOBS)
|
|
24
|
+
|
|
25
|
+
```sql
|
|
26
|
+
-- 查看所有作业(最近7天)
|
|
27
|
+
SHOW JOBS;
|
|
28
|
+
|
|
29
|
+
-- 查看指定集群的作业
|
|
30
|
+
SHOW JOBS IN VCLUSTER default_ap;
|
|
31
|
+
|
|
32
|
+
-- 查看执行时间超过2分钟的慢查询
|
|
33
|
+
SHOW JOBS WHERE execution_time > INTERVAL 2 MINUTE;
|
|
34
|
+
|
|
35
|
+
-- 查看失败的作业
|
|
36
|
+
SHOW JOBS WHERE status = 'FAILED';
|
|
37
|
+
|
|
38
|
+
-- 限制返回数量
|
|
39
|
+
SHOW JOBS IN VCLUSTER default_ap LIMIT 50;
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 历史作业分析(information_schema)
|
|
45
|
+
|
|
46
|
+
### 集群负载分析
|
|
47
|
+
|
|
48
|
+
```sql
|
|
49
|
+
SELECT
|
|
50
|
+
virtual_cluster,
|
|
51
|
+
COUNT(*) AS job_count,
|
|
52
|
+
AVG(execution_time) AS avg_seconds,
|
|
53
|
+
ROUND(SUM(CASE WHEN status = 'SUCCEED' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS success_rate
|
|
54
|
+
FROM sys.information_schema.job_history
|
|
55
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
56
|
+
GROUP BY virtual_cluster
|
|
57
|
+
ORDER BY job_count DESC;
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### 慢查询 TOP 20
|
|
61
|
+
|
|
62
|
+
```sql
|
|
63
|
+
SELECT job_id, virtual_cluster, execution_time, status, start_time
|
|
64
|
+
FROM sys.information_schema.job_history
|
|
65
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
66
|
+
ORDER BY execution_time DESC
|
|
67
|
+
LIMIT 20;
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 失败作业统计
|
|
71
|
+
|
|
72
|
+
```sql
|
|
73
|
+
SELECT
|
|
74
|
+
virtual_cluster,
|
|
75
|
+
COUNT(*) AS failed_count,
|
|
76
|
+
DATE(start_time) AS date
|
|
77
|
+
FROM sys.information_schema.job_history
|
|
78
|
+
WHERE status = 'FAILED'
|
|
79
|
+
AND start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
80
|
+
GROUP BY virtual_cluster, DATE(start_time)
|
|
81
|
+
ORDER BY date DESC;
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### 高峰期识别
|
|
85
|
+
|
|
86
|
+
```sql
|
|
87
|
+
SELECT
|
|
88
|
+
HOUR(start_time) AS hour_of_day,
|
|
89
|
+
COUNT(*) AS job_count,
|
|
90
|
+
AVG(execution_time) AS avg_execution_time
|
|
91
|
+
FROM sys.information_schema.job_history
|
|
92
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
93
|
+
GROUP BY HOUR(start_time)
|
|
94
|
+
ORDER BY hour_of_day;
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## query_tag 标记与过滤
|
|
100
|
+
|
|
101
|
+
给作业打标,便于按来源过滤:
|
|
102
|
+
|
|
103
|
+
```sql
|
|
104
|
+
-- 在 SQL 中设置 query_tag
|
|
105
|
+
SET query_tag = 'etl_daily';
|
|
106
|
+
SELECT * FROM orders;
|
|
107
|
+
|
|
108
|
+
-- 按 query_tag 过滤作业历史
|
|
109
|
+
SELECT job_id, execution_time, status
|
|
110
|
+
FROM sys.information_schema.job_history
|
|
111
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
112
|
+
AND query_tag = 'etl_daily';
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
JDBC URL 中设置:
|
|
116
|
+
```
|
|
117
|
+
jdbc:clickzetta://instance.region.api.clickzetta.com/workspace?query_tag=my_app
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## 常见问题排查
|
|
123
|
+
|
|
124
|
+
| 现象 | 排查方向 |
|
|
125
|
+
|---|---|
|
|
126
|
+
| 作业长时间"等待执行" | 集群资源不足,考虑扩容 VCluster |
|
|
127
|
+
| 作业长时间"集群启动中" | VCluster 冷启动慢,联系技术支持 |
|
|
128
|
+
| 大量失败作业 | 查看 job_id 详情,检查 SQL 语法或权限 |
|
|
129
|
+
| 平均执行时间突然变长 | 检查数据量变化、索引状态、缓存命中率 |
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## INFORMATION_SCHEMA 元数据查询
|
|
134
|
+
|
|
135
|
+
除了 `job_history`,INFORMATION_SCHEMA 还提供丰富的元数据视图,用于资产盘点和治理。
|
|
136
|
+
|
|
137
|
+
### 空间级视图(当前工作空间)
|
|
138
|
+
|
|
139
|
+
```sql
|
|
140
|
+
-- 查看当前空间下所有 Schema
|
|
141
|
+
SELECT * FROM information_schema.schemas;
|
|
142
|
+
|
|
143
|
+
-- 查看所有表及其大小、行数
|
|
144
|
+
SELECT table_schema, table_name, table_type, row_count, bytes
|
|
145
|
+
FROM information_schema.tables
|
|
146
|
+
ORDER BY bytes DESC;
|
|
147
|
+
|
|
148
|
+
-- 查看所有列的详细信息(字段名、类型、是否可空、注释)
|
|
149
|
+
SELECT table_schema, table_name, column_name, data_type, is_nullable, comment
|
|
150
|
+
FROM information_schema.columns
|
|
151
|
+
WHERE table_schema = 'public';
|
|
152
|
+
|
|
153
|
+
-- 查看排序列推荐
|
|
154
|
+
SELECT * FROM information_schema.sortkey_candidates;
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### 实例级视图(需要 instance_admin 权限,使用 sys 库)
|
|
158
|
+
|
|
159
|
+
```sql
|
|
160
|
+
-- 查看实例下所有工作空间
|
|
161
|
+
SELECT * FROM sys.information_schema.workspaces;
|
|
162
|
+
|
|
163
|
+
-- 查看实例下所有 Schema(跨工作空间)
|
|
164
|
+
SELECT * FROM sys.information_schema.schemas;
|
|
165
|
+
|
|
166
|
+
-- 查看实例用量(费用分析)
|
|
167
|
+
SELECT * FROM sys.information_schema.instance_usage
|
|
168
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY;
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 常用元数据分析场景
|
|
172
|
+
|
|
173
|
+
```sql
|
|
174
|
+
-- 找出最大的 10 张表
|
|
175
|
+
SELECT table_schema, table_name, row_count, bytes
|
|
176
|
+
FROM information_schema.tables
|
|
177
|
+
WHERE table_type = 'TABLE'
|
|
178
|
+
ORDER BY bytes DESC
|
|
179
|
+
LIMIT 10;
|
|
180
|
+
|
|
181
|
+
-- 找出没有注释的表
|
|
182
|
+
SELECT table_schema, table_name
|
|
183
|
+
FROM information_schema.tables
|
|
184
|
+
WHERE comment IS NULL OR comment = '';
|
|
185
|
+
|
|
186
|
+
-- 找出没有注释的字段
|
|
187
|
+
SELECT table_schema, table_name, column_name
|
|
188
|
+
FROM information_schema.columns
|
|
189
|
+
WHERE (comment IS NULL OR comment = '')
|
|
190
|
+
AND table_schema NOT IN ('information_schema');
|
|
191
|
+
|
|
192
|
+
-- 统计各 Schema 下的表数量和总存储
|
|
193
|
+
SELECT table_schema,
|
|
194
|
+
COUNT(*) AS table_count,
|
|
195
|
+
SUM(bytes) AS total_storage
|
|
196
|
+
FROM information_schema.tables
|
|
197
|
+
GROUP BY table_schema
|
|
198
|
+
ORDER BY total_storage DESC;
|
|
199
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"SHOW JOBS 的语法是什么?怎么过滤特定状态的作业?","expected_skill":"clickzetta-monitoring","expected_output_contains":["SHOW JOBS"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"怎么查询失败的作业?用什么 SQL?","expected_skill":"clickzetta-monitoring","expected_output_contains":["FAILED"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"过去 7 天各集群的作业执行情况怎么查?","expected_skill":"clickzetta-monitoring","expected_output_contains":["job_history","virtual_cluster"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"怎么查看慢查询?执行超过 2 分钟的作业有哪些?","expected_skill":"clickzetta-monitoring","expected_output_contains":["execution_time"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"怎么通过 information_schema 查看所有表的大小和行数?","expected_skill":"clickzetta-monitoring","expected_output_contains":["information_schema","tables"]}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# information_schema 作业历史分析参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/job_history_analysis_with_information_schema
|
|
4
|
+
|
|
5
|
+
## 数据源
|
|
6
|
+
|
|
7
|
+
表名:`sys.information_schema.job_history`
|
|
8
|
+
|
|
9
|
+
### 关键字段
|
|
10
|
+
|
|
11
|
+
| 字段 | 类型 | 说明 |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| workspace_name | String | 工作空间名称 |
|
|
14
|
+
| virtual_cluster | String | 计算集群名称 |
|
|
15
|
+
| job_id | String | 作业唯一标识 |
|
|
16
|
+
| execution_time | Float | 执行时长(秒) |
|
|
17
|
+
| start_time | Timestamp | 开始时间 |
|
|
18
|
+
| status | String | 状态(SUCCEED/FAILED/CANCELLED/...) |
|
|
19
|
+
| input_tables | String | 输入表(JSON 格式) |
|
|
20
|
+
| input_bytes | String | 读取字节数 |
|
|
21
|
+
| cache_hit | String | 缓存命中字节数 |
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 常用分析查询
|
|
26
|
+
|
|
27
|
+
### 1. 集群负载分析(近 30 天)
|
|
28
|
+
|
|
29
|
+
```sql
|
|
30
|
+
SELECT
|
|
31
|
+
virtual_cluster,
|
|
32
|
+
COUNT(*) AS job_count,
|
|
33
|
+
SUM(execution_time) AS total_execution_time,
|
|
34
|
+
AVG(execution_time) AS avg_execution_time,
|
|
35
|
+
ROUND(SUM(CASE WHEN status = 'SUCCEED' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS success_rate
|
|
36
|
+
FROM sys.information_schema.job_history
|
|
37
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 30 DAY
|
|
38
|
+
GROUP BY virtual_cluster
|
|
39
|
+
ORDER BY total_execution_time DESC;
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### 2. 慢查询分析(执行时间 TOP 20)
|
|
43
|
+
|
|
44
|
+
```sql
|
|
45
|
+
SELECT
|
|
46
|
+
job_id,
|
|
47
|
+
virtual_cluster,
|
|
48
|
+
execution_time,
|
|
49
|
+
status,
|
|
50
|
+
start_time
|
|
51
|
+
FROM sys.information_schema.job_history
|
|
52
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
53
|
+
ORDER BY execution_time DESC
|
|
54
|
+
LIMIT 20;
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 3. 失败作业分析
|
|
58
|
+
|
|
59
|
+
```sql
|
|
60
|
+
SELECT
|
|
61
|
+
virtual_cluster,
|
|
62
|
+
COUNT(*) AS failed_count,
|
|
63
|
+
DATE(start_time) AS date
|
|
64
|
+
FROM sys.information_schema.job_history
|
|
65
|
+
WHERE status = 'FAILED'
|
|
66
|
+
AND start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
67
|
+
GROUP BY virtual_cluster, DATE(start_time)
|
|
68
|
+
ORDER BY date DESC, failed_count DESC;
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### 4. 缓存命中率分析
|
|
72
|
+
|
|
73
|
+
```sql
|
|
74
|
+
SELECT
|
|
75
|
+
virtual_cluster,
|
|
76
|
+
SUM(CAST(input_bytes AS BIGINT)) AS total_input_bytes,
|
|
77
|
+
SUM(CAST(cache_hit AS BIGINT)) AS total_cache_hit,
|
|
78
|
+
ROUND(SUM(CAST(cache_hit AS BIGINT)) * 100.0 /
|
|
79
|
+
NULLIF(SUM(CAST(input_bytes AS BIGINT)), 0), 2) AS cache_hit_rate
|
|
80
|
+
FROM sys.information_schema.job_history
|
|
81
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
82
|
+
AND input_bytes IS NOT NULL
|
|
83
|
+
GROUP BY virtual_cluster;
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 5. 按小时统计作业量(识别高峰期)
|
|
87
|
+
|
|
88
|
+
```sql
|
|
89
|
+
SELECT
|
|
90
|
+
HOUR(start_time) AS hour_of_day,
|
|
91
|
+
COUNT(*) AS job_count,
|
|
92
|
+
AVG(execution_time) AS avg_execution_time
|
|
93
|
+
FROM sys.information_schema.job_history
|
|
94
|
+
WHERE start_time >= CURRENT_DATE() - INTERVAL 7 DAY
|
|
95
|
+
GROUP BY HOUR(start_time)
|
|
96
|
+
ORDER BY hour_of_day;
|
|
97
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# SHOW JOBS 参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/show-jobs
|
|
4
|
+
|
|
5
|
+
## 语法
|
|
6
|
+
|
|
7
|
+
```sql
|
|
8
|
+
SHOW JOBS [IN VCLUSTER vc_name] [LIKE 'pattern'] [WHERE <expr>] [LIMIT num];
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 参数说明
|
|
12
|
+
|
|
13
|
+
- `IN VCLUSTER vc_name`:(可选)指定计算集群,筛选该集群下的作业
|
|
14
|
+
- `WHERE <expr>`:(可选)按字段过滤,支持 SHOW JOBS 显示的所有字段
|
|
15
|
+
- `LIMIT num`:(可选)限制返回数量,范围 1-10000
|
|
16
|
+
- `LIKE 'pattern'`:(可选)按 job_id 模式匹配(支持 `%` 和 `_`)
|
|
17
|
+
|
|
18
|
+
默认显示最近 7 天内的作业,最多 10000 条。
|
|
19
|
+
|
|
20
|
+
## 示例
|
|
21
|
+
|
|
22
|
+
```sql
|
|
23
|
+
-- 查看所有作业(最近7天)
|
|
24
|
+
SHOW JOBS;
|
|
25
|
+
|
|
26
|
+
-- 查看指定集群的作业
|
|
27
|
+
SHOW JOBS IN VCLUSTER default_ap;
|
|
28
|
+
|
|
29
|
+
-- 查看执行时间超过2分钟的作业
|
|
30
|
+
SHOW JOBS IN VCLUSTER default_ap WHERE execution_time > INTERVAL 2 MINUTE;
|
|
31
|
+
|
|
32
|
+
-- 限制返回100条
|
|
33
|
+
SHOW JOBS LIMIT 100;
|
|
34
|
+
|
|
35
|
+
-- 按 job_id 模糊匹配
|
|
36
|
+
SHOW JOBS LIKE '2024%';
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## 作业状态说明
|
|
40
|
+
|
|
41
|
+
| 状态 | 含义 |
|
|
42
|
+
|---|---|
|
|
43
|
+
| 初始化 | SQL 编译优化阶段 |
|
|
44
|
+
| 集群启动中 | 等待 VCluster 启动 |
|
|
45
|
+
| 等待执行 | 排队等待资源 |
|
|
46
|
+
| 正在执行 | 正在处理数据 |
|
|
47
|
+
| 执行成功 | 作业运行成功结束 |
|
|
48
|
+
| 执行失败 | 运行失败 |
|