@clickzetta/cz-cli-darwin-arm64 0.3.17 → 0.3.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +160 -0
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +155 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +386 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +548 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +220 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +259 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +112 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +257 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +156 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +751 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +199 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +537 -0
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +276 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +379 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
- package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
- package/bin/skills/cz-cli/SKILL.md +1 -1
- package/bin/skills/cz-cli-inner/SKILL.md +8 -0
- package/package.json +1 -1
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/SKILL.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/dt-declaration-strategy.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/incremental-config-reference.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/refresh-history-guide.md +0 -0
- /package/bin/skills/{dt-creator → clickzetta-dynamic-table/dt-creator}/references/sql-limitations.md +0 -0
- /package/bin/skills/{dynamic-table-alter → clickzetta-dynamic-table/dynamic-table-alter}/SKILL.md +0 -0
|
@@ -0,0 +1,751 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-kafka-ingest-pipeline
|
|
3
|
+
description: |
|
|
4
|
+
搭建 ClickZetta Lakehouse Kafka 数据接入管道,覆盖从连接验证、数据探查、目标表创建
|
|
5
|
+
到 Pipe 持续导入的端到端工作流。支持两种接入路径:READ_KAFKA Pipe(推荐)和
|
|
6
|
+
Kafka 外部表 + Table Stream Pipe。
|
|
7
|
+
当用户说"Kafka 接入"、"Kafka 导入"、"Kafka Pipe"、"read_kafka"、"Kafka 数据管道"、
|
|
8
|
+
"Kafka 外部表"、"Kafka 消费"、"消息队列导入"、"Kafka 到 Lakehouse"、
|
|
9
|
+
"Kafka 实时导入"、"Kafka 持续导入"、"Kafka topic 导入"、"Kafka JSON 解析"、
|
|
10
|
+
"Kafka 延迟监控"、"Kafka 积压"时触发。
|
|
11
|
+
包含 READ_KAFKA 函数探查、JSON 多层嵌套解析、Kafka Pipe DDL、Kafka 外部表 + Table Stream、
|
|
12
|
+
SASL 认证配置、生产调优(BATCH_SIZE / COPY_JOB_HINT / VCluster 规格)、
|
|
13
|
+
延迟监控(pipe_latency / query_tag)等 ClickZetta 特有逻辑。
|
|
14
|
+
Keywords: Kafka, READ_KAFKA, Pipe, streaming ingestion, topic, consumer
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# Kafka 数据接入管道工作流
|
|
18
|
+
|
|
19
|
+
## 适用场景
|
|
20
|
+
|
|
21
|
+
- 将 Kafka Topic 数据持续导入 ClickZetta Lakehouse 表
|
|
22
|
+
- 需要近实时(分钟级)数据新鲜度
|
|
23
|
+
- Kafka 消息格式为 JSON / CSV / Avro
|
|
24
|
+
- 需要在导入前对 JSON 消息进行多层嵌套解析和转换
|
|
25
|
+
- 关键词:Kafka Pipe、read_kafka、Kafka 外部表、消息队列导入、Kafka 持续导入
|
|
26
|
+
|
|
27
|
+
## 两种接入路径
|
|
28
|
+
|
|
29
|
+
| 路径 | 适用场景 | 核心对象 |
|
|
30
|
+
|------|---------|---------|
|
|
31
|
+
| **READ_KAFKA Pipe**(推荐) | 通用场景,支持复杂 SQL 转换 | `CREATE PIPE ... AS COPY INTO ... FROM (SELECT ... FROM read_kafka(...))` |
|
|
32
|
+
| **Kafka 外部表 + Table Stream Pipe** | 需要先落原始数据再增量消费 | Kafka 外部表 → Table Stream → Pipe `INSERT INTO ... SELECT` |
|
|
33
|
+
|
|
34
|
+
**选择建议**:大多数场景用 READ_KAFKA Pipe 即可,更简洁高效。Kafka 外部表路径适合需要保留原始消息、多个下游消费同一 Topic 的场景。
|
|
35
|
+
|
|
36
|
+
## 前置依赖
|
|
37
|
+
|
|
38
|
+
- ClickZetta Lakehouse 账户,具备创建 Pipe、表、VCluster 等权限
|
|
39
|
+
- Kafka 集群网络可达(确认 bootstrap 地址和端口)
|
|
40
|
+
- 已知 Kafka Topic 名称和消息格式
|
|
41
|
+
- 认证信息(如需要):SASL 用户名/密码
|
|
42
|
+
- **执行环境(满足其一即可,优先使用 cz-cli)**:
|
|
43
|
+
- **cz-cli 路径**:已安装 cz-cli(`pip install cz-cli`),并完成 `cz-cli configure` 配置
|
|
44
|
+
- **MCP 路径**:clickzetta-mcp-server 工具可用(`LH_execute_query`、`LH_show_object_list` 等)
|
|
45
|
+
|
|
46
|
+
## 环境探测(执行前必读)
|
|
47
|
+
|
|
48
|
+
在开始任何操作前,先判断当前执行环境:
|
|
49
|
+
|
|
50
|
+
**第一步:检测 cz-cli 是否可用**
|
|
51
|
+
```bash
|
|
52
|
+
cz-cli --version
|
|
53
|
+
```
|
|
54
|
+
- 若命令存在 → **走 cz-cli 路径**(见本文档末尾"cz-cli 替代路径"章节)
|
|
55
|
+
- 若命令不存在 → 继续检测 MCP
|
|
56
|
+
|
|
57
|
+
**第二步:检测 MCP 是否可用(仅在 cz-cli 不可用时)**
|
|
58
|
+
|
|
59
|
+
尝试调用 `LH_execute_query` 工具执行一条简单 SQL(如 `SELECT 1`)。
|
|
60
|
+
- 若工具存在于 tool list → **走 MCP 路径**(本文档默认路径)
|
|
61
|
+
- 若工具不存在 → 停止执行,提示用户:
|
|
62
|
+
> "当前环境既无 cz-cli 也无 MCP 工具,请安装其中之一后重试。
|
|
63
|
+
> cz-cli 安装:`pip install cz-cli`,然后运行 `cz-cli configure`
|
|
64
|
+
> MCP 安装:参考 clickzetta-mcp-server 配置文档"
|
|
65
|
+
|
|
66
|
+
## ⚠️ 关键注意事项
|
|
67
|
+
|
|
68
|
+
- Kafka Pipe 仅支持 **PLAINTEXT** 和 **SASL_PLAINTEXT** 两种安全协议,不支持 SSL 证书方式
|
|
69
|
+
- Pipe 创建后**自动启动**,无需手动 RESUME
|
|
70
|
+
- Pipe 不支持修改 COPY 语句逻辑,需删除后重建
|
|
71
|
+
- 建议为 Kafka Pipe 分配**专用 GP 集群**,避免与其他查询争抢资源
|
|
72
|
+
- `RESET_KAFKA_GROUP_OFFSETS` 仅在创建时生效,会强制改写消费位点,谨慎使用
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 路径一:READ_KAFKA Pipe(推荐)
|
|
77
|
+
|
|
78
|
+
### 步骤 1:验证 Kafka 连接和探查数据
|
|
79
|
+
|
|
80
|
+
先用 `READ_KAFKA` 函数验证网络连通性和消息格式:
|
|
81
|
+
|
|
82
|
+
> ⚠️ **READ_KAFKA 使用位置参数(positional parameters)**,不支持 `=>` 命名参数语法。参数顺序固定,不可省略。
|
|
83
|
+
|
|
84
|
+
```sql
|
|
85
|
+
-- 无认证 Kafka(位置参数语法)
|
|
86
|
+
SELECT *
|
|
87
|
+
FROM read_kafka(
|
|
88
|
+
'kafka.example.com:9092', -- bootstrap_servers(必填)
|
|
89
|
+
'orders', -- topic(必填)
|
|
90
|
+
'', -- topic_pattern(保留,填空字符串)
|
|
91
|
+
'test_explore', -- group_id(必填)
|
|
92
|
+
'', -- starting_offsets(探查时可填 'earliest',或留空用默认 latest)
|
|
93
|
+
'', -- ending_offsets(留空)
|
|
94
|
+
'', -- starting_timestamp(留空)
|
|
95
|
+
'', -- ending_timestamp(留空)
|
|
96
|
+
'raw', -- key_format(目前只支持 raw)
|
|
97
|
+
'raw', -- value_format(目前只支持 raw)
|
|
98
|
+
0, -- max_errors
|
|
99
|
+
MAP(
|
|
100
|
+
'kafka.security.protocol', 'PLAINTEXT',
|
|
101
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
LIMIT 10;
|
|
105
|
+
|
|
106
|
+
-- SASL_PLAINTEXT 认证
|
|
107
|
+
SELECT *
|
|
108
|
+
FROM read_kafka(
|
|
109
|
+
'kafka.example.com:9092',
|
|
110
|
+
'orders',
|
|
111
|
+
'',
|
|
112
|
+
'test_explore',
|
|
113
|
+
'', '', '', '',
|
|
114
|
+
'raw',
|
|
115
|
+
'raw',
|
|
116
|
+
0,
|
|
117
|
+
MAP(
|
|
118
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
119
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
120
|
+
'kafka.sasl.username', 'my_user',
|
|
121
|
+
'kafka.sasl.password', 'my_password',
|
|
122
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
LIMIT 10;
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
> **参数说明**:
|
|
129
|
+
> - 探查用的 `group_id` 建议用临时名称(如 `test_explore`),避免影响正式消费组
|
|
130
|
+
> - `kafka.auto.offset.reset` 在 MAP 中设置为 `'earliest'` 可读取历史数据
|
|
131
|
+
> - key 和 value 都是 binary 类型,需要 CAST 转换后使用
|
|
132
|
+
> - **多 Broker 地址格式**:用逗号分隔多个 broker,Pipe 会自动故障转移
|
|
133
|
+
> - ✅ 推荐:`'broker1:9092,broker2:9092,broker3:9092'`(高可用)
|
|
134
|
+
> - ⚠️ 单 broker:`'broker1:9092'`(无故障转移,不推荐生产使用)
|
|
135
|
+
|
|
136
|
+
### 步骤 2:探查 JSON 结构并确定目标表 Schema
|
|
137
|
+
|
|
138
|
+
Kafka 的 key 和 value 都是 binary 类型。用 `value::string` 转换后查看内容,用 `parse_json()` 解析 JSON:
|
|
139
|
+
|
|
140
|
+
```sql
|
|
141
|
+
-- 将 value 转为字符串查看原始内容
|
|
142
|
+
SELECT key::string, value::string
|
|
143
|
+
FROM read_kafka(
|
|
144
|
+
'kafka.example.com:9092',
|
|
145
|
+
'orders',
|
|
146
|
+
'',
|
|
147
|
+
'test_schema',
|
|
148
|
+
'', '', '', '',
|
|
149
|
+
'raw', 'raw', 0,
|
|
150
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
151
|
+
)
|
|
152
|
+
LIMIT 5;
|
|
153
|
+
|
|
154
|
+
-- 解析 JSON 字段(使用 parse_json)
|
|
155
|
+
SELECT
|
|
156
|
+
j['order_id']::STRING AS order_id,
|
|
157
|
+
j['user_id']::STRING AS user_id,
|
|
158
|
+
j['amount']::DECIMAL(10,2) AS amount,
|
|
159
|
+
j['status']::STRING AS status,
|
|
160
|
+
timestamp_millis(j['created_at']::BIGINT) AS created_at
|
|
161
|
+
FROM (
|
|
162
|
+
SELECT parse_json(value::string) AS j
|
|
163
|
+
FROM read_kafka(
|
|
164
|
+
'kafka.example.com:9092',
|
|
165
|
+
'orders',
|
|
166
|
+
'',
|
|
167
|
+
'test_schema',
|
|
168
|
+
'', '', '', '',
|
|
169
|
+
'raw', 'raw', 0,
|
|
170
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
171
|
+
)
|
|
172
|
+
LIMIT 5
|
|
173
|
+
);
|
|
174
|
+
|
|
175
|
+
-- 多层嵌套 JSON 解析(逐层 parse_json 展开)
|
|
176
|
+
SELECT
|
|
177
|
+
j['id']::STRING AS id,
|
|
178
|
+
j['type']::STRING AS event_type,
|
|
179
|
+
parse_json(j['event']::STRING)['action']::STRING AS action,
|
|
180
|
+
parse_json(parse_json(j['event']::STRING)['payload']::STRING)['ref']::STRING AS ref
|
|
181
|
+
FROM (
|
|
182
|
+
SELECT parse_json(value::string) AS j
|
|
183
|
+
FROM read_kafka(
|
|
184
|
+
'kafka.example.com:9092',
|
|
185
|
+
'events',
|
|
186
|
+
'',
|
|
187
|
+
'test_nested',
|
|
188
|
+
'', '', '', '',
|
|
189
|
+
'raw', 'raw', 0,
|
|
190
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
191
|
+
)
|
|
192
|
+
LIMIT 5
|
|
193
|
+
);
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
> **最佳实践**:在 SELECT 中将所有嵌套 JSON 字符串都 `parse_json` 展开后再落表,避免下游查询重复计算。
|
|
197
|
+
|
|
198
|
+
### 步骤 3:创建目标表
|
|
199
|
+
|
|
200
|
+
根据探查结果创建目标表:
|
|
201
|
+
|
|
202
|
+
```sql
|
|
203
|
+
CREATE TABLE IF NOT EXISTS ods.kafka_orders (
|
|
204
|
+
order_id STRING,
|
|
205
|
+
user_id STRING,
|
|
206
|
+
amount DECIMAL(10,2),
|
|
207
|
+
status STRING,
|
|
208
|
+
created_at TIMESTAMP,
|
|
209
|
+
__kafka_timestamp__ TIMESTAMP COMMENT 'Kafka 消息时间戳,用于端到端延迟监控'
|
|
210
|
+
);
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
> 建议额外添加 `__kafka_timestamp__` 字段记录 Kafka 消息时间戳,用于后续端到端延迟监控。
|
|
214
|
+
|
|
215
|
+
### 步骤 4:创建专用 VCluster(推荐)
|
|
216
|
+
|
|
217
|
+
```sql
|
|
218
|
+
CREATE VCLUSTER IF NOT EXISTS pipe_kafka_vc
|
|
219
|
+
VCLUSTER_TYPE = GENERAL
|
|
220
|
+
VCLUSTER_SIZE = 4
|
|
221
|
+
AUTO_SUSPEND_IN_SECOND = 0
|
|
222
|
+
COMMENT 'Kafka Pipe 专用集群,常驻运行';
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
> 数据新鲜度要求 1 分钟时,建议 VCluster 常驻(`AUTO_SUSPEND_IN_SECOND = 0`),避免冷启动延迟。
|
|
226
|
+
|
|
227
|
+
### 步骤 5:创建 Kafka Pipe
|
|
228
|
+
|
|
229
|
+
```sql
|
|
230
|
+
-- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,需用 CREATE PIPE 或先 DROP 再 CREATE
|
|
231
|
+
CREATE PIPE kafka_orders_pipe
|
|
232
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
233
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
234
|
+
BATCH_SIZE_PER_KAFKA_PARTITION = '500000'
|
|
235
|
+
AS
|
|
236
|
+
COPY INTO ods.kafka_orders FROM (
|
|
237
|
+
SELECT
|
|
238
|
+
j['order_id']::STRING,
|
|
239
|
+
j['user_id']::STRING,
|
|
240
|
+
j['amount']::DECIMAL(10,2),
|
|
241
|
+
j['status']::STRING,
|
|
242
|
+
j['created_at']::TIMESTAMP,
|
|
243
|
+
CAST(`timestamp` AS TIMESTAMP) AS __kafka_timestamp__
|
|
244
|
+
FROM (
|
|
245
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
246
|
+
FROM read_kafka(
|
|
247
|
+
'kafka.example.com:9092', -- bootstrap_servers
|
|
248
|
+
'orders', -- topic
|
|
249
|
+
'', -- reserved
|
|
250
|
+
'lakehouse_orders', -- group_id(正式消费组名)
|
|
251
|
+
'', '', '', '', -- 位置参数留空,由 Pipe 自动管理
|
|
252
|
+
'raw', -- key_format
|
|
253
|
+
'raw', -- value_format
|
|
254
|
+
0, -- max_errors
|
|
255
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
256
|
+
)
|
|
257
|
+
)
|
|
258
|
+
);
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
> ⚠️ **Pipe 中 READ_KAFKA 的关键区别**:
|
|
262
|
+
> - 位置参数(starting_offsets 等)**必须留空**,由 Pipe 自动管理消费位点
|
|
263
|
+
> - 不要设置 `kafka.auto.offset.reset`(由 Pipe 的 `RESET_KAFKA_GROUP_OFFSETS` 参数控制)
|
|
264
|
+
> - group_id 使用正式名称(如 `lakehouse_orders`),Pipe 会持久化消费位点
|
|
265
|
+
|
|
266
|
+
**关键参数说明:**
|
|
267
|
+
|
|
268
|
+
| 参数 | 默认值 | 说明 |
|
|
269
|
+
|------|--------|------|
|
|
270
|
+
| `VIRTUAL_CLUSTER` | — | 必填,指定执行 Pipe 的计算集群 |
|
|
271
|
+
| `BATCH_INTERVAL_IN_SECONDS` | 60 | 批处理间隔(秒),即数据新鲜度 |
|
|
272
|
+
| `BATCH_SIZE_PER_KAFKA_PARTITION` | 500000 | 每个 Kafka 分区每批最大消息数 |
|
|
273
|
+
| `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 30 | 出错时跳过批次的最大重试次数 |
|
|
274
|
+
| `INITIAL_DELAY_IN_SECONDS` | 0 | 首个作业调度延迟 |
|
|
275
|
+
| `RESET_KAFKA_GROUP_OFFSETS` | — | 可选,指定起始消费位点(仅创建时生效) |
|
|
276
|
+
|
|
277
|
+
**RESET_KAFKA_GROUP_OFFSETS 可选值:**
|
|
278
|
+
|
|
279
|
+
| 值 | 说明 |
|
|
280
|
+
|----|------|
|
|
281
|
+
| `'none'` | 无操作,使用 Kafka 的 `auto.offset.reset` 配置(默认 latest) |
|
|
282
|
+
| `'valid'` | 检查当前位点是否过期,将过期分区重置到 earliest |
|
|
283
|
+
| `'earliest'` | 重置到最早位点(消费全部历史数据) |
|
|
284
|
+
| `'latest'` | 重置到最新位点(仅消费新数据) |
|
|
285
|
+
| `'1737789688000'` | 重置到指定毫秒时间戳对应的位点 |
|
|
286
|
+
|
|
287
|
+
> **注意**:Pipe 中的 read_kafka 位置参数(starting_offsets 等)必须留空,由 Pipe 自动管理消费位点。与独立使用 read_kafka 探查时不同。
|
|
288
|
+
|
|
289
|
+
### 步骤 6:验证 Pipe 运行状态
|
|
290
|
+
|
|
291
|
+
```sql
|
|
292
|
+
-- 查看 Pipe 详情
|
|
293
|
+
DESC PIPE EXTENDED kafka_orders_pipe;
|
|
294
|
+
-- 关键字段:pipe_execution_paused(是否暂停)、pipe_latency(延迟信息)
|
|
295
|
+
|
|
296
|
+
-- 查看目标表数据
|
|
297
|
+
SELECT COUNT(*) FROM ods.kafka_orders;
|
|
298
|
+
SELECT * FROM ods.kafka_orders LIMIT 10;
|
|
299
|
+
|
|
300
|
+
-- 查看加载历史(保留 7 天)
|
|
301
|
+
SELECT * FROM load_history('ods.kafka_orders')
|
|
302
|
+
ORDER BY last_load_time DESC
|
|
303
|
+
LIMIT 20;
|
|
304
|
+
|
|
305
|
+
-- 通过 query_tag 查看 Pipe 作业
|
|
306
|
+
SHOW JOBS WHERE query_tag = 'pipe.my_workspace.ods.kafka_orders_pipe';
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## 路径二:Kafka 外部表 + Table Stream Pipe
|
|
312
|
+
|
|
313
|
+
适合需要保留原始消息、或多个下游消费同一 Topic 的场景。
|
|
314
|
+
|
|
315
|
+
### 步骤 1:创建 Kafka Storage Connection
|
|
316
|
+
|
|
317
|
+
```sql
|
|
318
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS kafka_conn
|
|
319
|
+
TYPE KAFKA
|
|
320
|
+
BOOTSTRAP_SERVERS = ['kafka.example.com:9092']
|
|
321
|
+
SECURITY_PROTOCOL = 'PLAINTEXT';
|
|
322
|
+
|
|
323
|
+
-- 删除 Connection(⚠️ 注意:用 DROP CONNECTION,不是 DROP STORAGE CONNECTION)
|
|
324
|
+
DROP CONNECTION IF EXISTS kafka_conn;
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### 步骤 2:创建 Kafka 外部表
|
|
328
|
+
|
|
329
|
+
```sql
|
|
330
|
+
-- ⚠️ 必须显式指定列定义,不能省略
|
|
331
|
+
-- ⚠️ offset 是保留字,必须用反引号转义
|
|
332
|
+
CREATE EXTERNAL TABLE kafka_orders_ext (
|
|
333
|
+
topic STRING,
|
|
334
|
+
partition INT,
|
|
335
|
+
`offset` BIGINT,
|
|
336
|
+
`timestamp` TIMESTAMP,
|
|
337
|
+
timestamp_type STRING,
|
|
338
|
+
headers STRING,
|
|
339
|
+
key BINARY,
|
|
340
|
+
value BINARY
|
|
341
|
+
)
|
|
342
|
+
USING KAFKA
|
|
343
|
+
OPTIONS (
|
|
344
|
+
'group_id' = 'lakehouse_ext_orders',
|
|
345
|
+
'topics' = 'orders',
|
|
346
|
+
'starting_offset' = 'earliest'
|
|
347
|
+
)
|
|
348
|
+
CONNECTION kafka_conn;
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
> **注意**:
|
|
352
|
+
> - 列定义是**必须的**,不指定会报错 `failed to detect columns`
|
|
353
|
+
> - `offset` 和 `timestamp` 是保留字,定义和查询时都需要反引号转义
|
|
354
|
+
> - 删除外部表用 `DROP TABLE`(❌ `DROP EXTERNAL TABLE` 会报语法错误)
|
|
355
|
+
> - 删除 Connection 用 `DROP CONNECTION`(❌ `DROP STORAGE CONNECTION` 会报语法错误)
|
|
356
|
+
|
|
357
|
+
### 步骤 3:创建 Table Stream
|
|
358
|
+
|
|
359
|
+
```sql
|
|
360
|
+
CREATE TABLE STREAM kafka_orders_stream
|
|
361
|
+
ON TABLE kafka_orders_ext
|
|
362
|
+
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY');
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
### 步骤 4:创建目标表和 Pipe
|
|
366
|
+
|
|
367
|
+
```sql
|
|
368
|
+
-- 目标表
|
|
369
|
+
CREATE TABLE IF NOT EXISTS ods.kafka_orders_from_ext (
|
|
370
|
+
order_id STRING,
|
|
371
|
+
user_id STRING,
|
|
372
|
+
amount DECIMAL(10,2),
|
|
373
|
+
kafka_ts TIMESTAMP
|
|
374
|
+
);
|
|
375
|
+
|
|
376
|
+
-- Pipe(从 Table Stream 消费)
|
|
377
|
+
-- ⚠️ 注意:Table Stream Pipe 使用 INSERT INTO ... SELECT 语法,不是 COPY INTO
|
|
378
|
+
CREATE PIPE kafka_ext_orders_pipe
|
|
379
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
380
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
381
|
+
AS
|
|
382
|
+
INSERT INTO ods.kafka_orders_from_ext
|
|
383
|
+
SELECT
|
|
384
|
+
GET_JSON_OBJECT(CAST(value AS STRING), '$.order_id') AS order_id,
|
|
385
|
+
GET_JSON_OBJECT(CAST(value AS STRING), '$.user_id') AS user_id,
|
|
386
|
+
CAST(GET_JSON_OBJECT(CAST(value AS STRING), '$.amount') AS DECIMAL(10,2)) AS amount,
|
|
387
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
388
|
+
FROM kafka_orders_stream;
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
> **清理外部表**:使用 `DROP TABLE kafka_orders_ext`(不是 `DROP EXTERNAL TABLE`)
|
|
392
|
+
|
|
393
|
+
---
|
|
394
|
+
|
|
395
|
+
## 监控与运维
|
|
396
|
+
|
|
397
|
+
### 查看 Kafka 消费延迟
|
|
398
|
+
|
|
399
|
+
```sql
|
|
400
|
+
DESC PIPE EXTENDED kafka_orders_pipe;
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
关键字段 `pipe_latency`(JSON 格式):
|
|
404
|
+
- `lastConsumeTimestamp`:上一次消费的位点时间
|
|
405
|
+
- `offsetLag`:Kafka 数据堆积量
|
|
406
|
+
- `timeLag`:消费延迟(毫秒),当前时间减去上一次消费位点。异常时为 -1
|
|
407
|
+
|
|
408
|
+
> 当数据新鲜度为 60 秒且算力冗余一倍时,`timeLag` 对应的延迟应在 0~90 秒(即原始毫秒值 0~90000)之间波动。持续上涨说明 Pipe 积压。
|
|
409
|
+
|
|
410
|
+
### 端到端延迟监控(需要 `__kafka_timestamp__` 字段)
|
|
411
|
+
|
|
412
|
+
```sql
|
|
413
|
+
-- 查看最近 1 小时的端到端延迟
|
|
414
|
+
SELECT
|
|
415
|
+
MAX(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS max_delay_seconds,
|
|
416
|
+
AVG(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS avg_delay_seconds
|
|
417
|
+
FROM ods.kafka_orders
|
|
418
|
+
WHERE __kafka_timestamp__ >= CURRENT_TIMESTAMP() - INTERVAL 1 HOUR;
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
### 暂停 / 恢复 Pipe
|
|
422
|
+
|
|
423
|
+
```sql
|
|
424
|
+
-- 暂停
|
|
425
|
+
ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = true;
|
|
426
|
+
|
|
427
|
+
-- 恢复
|
|
428
|
+
ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = false;
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
### 修改 Pipe 属性
|
|
432
|
+
|
|
433
|
+
```sql
|
|
434
|
+
-- 修改 VCluster
|
|
435
|
+
ALTER PIPE kafka_orders_pipe SET VIRTUAL_CLUSTER = 'new_vc';
|
|
436
|
+
|
|
437
|
+
-- 修改 COPY_JOB_HINT
|
|
438
|
+
ALTER PIPE kafka_orders_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
> ⚠️ **ALTER PIPE 支持的属性**(经验证):
|
|
442
|
+
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
443
|
+
> - ✅ `VIRTUAL_CLUSTER`
|
|
444
|
+
> - ✅ `COPY_JOB_HINT`
|
|
445
|
+
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持修改,需删除重建 Pipe)
|
|
446
|
+
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持修改,需删除重建 Pipe)
|
|
447
|
+
>
|
|
448
|
+
> 每次 ALTER 只能修改一个属性。不支持修改 COPY/INSERT 语句逻辑,需删除重建。
|
|
449
|
+
|
|
450
|
+
### 修改 Pipe SQL 逻辑(需删除重建)
|
|
451
|
+
|
|
452
|
+
```sql
|
|
453
|
+
-- 1. 删除当前 Pipe
|
|
454
|
+
DROP PIPE kafka_orders_pipe;
|
|
455
|
+
|
|
456
|
+
-- 2. 重建 Pipe(不要设置 RESET_KAFKA_GROUP_OFFSETS,保持从上次位点继续)
|
|
457
|
+
-- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,使用 CREATE PIPE
|
|
458
|
+
CREATE PIPE kafka_orders_pipe
|
|
459
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
460
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
461
|
+
AS
|
|
462
|
+
COPY INTO ods.kafka_orders FROM (
|
|
463
|
+
SELECT
|
|
464
|
+
j['order_id']::STRING,
|
|
465
|
+
j['user_id']::STRING,
|
|
466
|
+
j['amount']::DECIMAL(10,2),
|
|
467
|
+
UPPER(j['status']::STRING), -- 修改了转换逻辑
|
|
468
|
+
j['created_at']::TIMESTAMP,
|
|
469
|
+
CAST(`timestamp` AS TIMESTAMP) AS __kafka_timestamp__
|
|
470
|
+
FROM (
|
|
471
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
472
|
+
FROM read_kafka(
|
|
473
|
+
'kafka.example.com:9092',
|
|
474
|
+
'orders',
|
|
475
|
+
'',
|
|
476
|
+
'lakehouse_orders', -- 保持相同 group_id
|
|
477
|
+
'', '', '', '',
|
|
478
|
+
'raw', 'raw', 0,
|
|
479
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
480
|
+
)
|
|
481
|
+
)
|
|
482
|
+
);
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
> **关键**:重建时保持相同的 `group_id`,且不设置 `RESET_KAFKA_GROUP_OFFSETS`,Pipe 会从上次消费位点继续。
|
|
486
|
+
|
|
487
|
+
---
|
|
488
|
+
|
|
489
|
+
## 生产调优
|
|
490
|
+
|
|
491
|
+
### 判断是否积压
|
|
492
|
+
|
|
493
|
+
多次执行 `DESC PIPE EXTENDED` 查看 `pipe_latency` 中的 `timeLag`:
|
|
494
|
+
- 在 0~90 秒波动 → 正常(60 秒新鲜度 + 一倍冗余)
|
|
495
|
+
- 持续上涨 → 积压,需调优
|
|
496
|
+
|
|
497
|
+
### 调优参数
|
|
498
|
+
|
|
499
|
+
| 问题 | 调优方向 | 操作 |
|
|
500
|
+
|------|---------|------|
|
|
501
|
+
| 每批读取不完一个周期的数据 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION` | 删除重建 Pipe 时设置更大的值(如 `BATCH_SIZE_PER_KAFKA_PARTITION = '1000000'`) |
|
|
502
|
+
| 作业需要多轮才能完成 | 增大 VCluster 规格(使 core 数 ≥ partition 数) | `ALTER VCLUSTER ... SET VCLUSTER_SIZE = 16` |
|
|
503
|
+
| partition 少但数据量大 | 按条数切分 task | `ALTER PIPE ... SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}'` |
|
|
504
|
+
|
|
505
|
+
### COPY_JOB_HINT 参数
|
|
506
|
+
|
|
507
|
+
| Key | 默认值 | 说明 |
|
|
508
|
+
|-----|--------|------|
|
|
509
|
+
| `cz.sql.split.kafka.strategy` | `simple` | `simple`=每 partition 一个 task;`size`=按条数切分 |
|
|
510
|
+
| `cz.mapper.kafka.message.size` | `1000000` | 当 strategy=size 时,每个 task 处理的消息条数 |
|
|
511
|
+
|
|
512
|
+
> ⚠️ **格式要求**:`COPY_JOB_HINT` 必须是合法 JSON,键值都要用双引号包围:
|
|
513
|
+
> ```sql
|
|
514
|
+
> -- ✅ 正确
|
|
515
|
+
> ALTER PIPE my_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
516
|
+
> -- ❌ 错误(非 JSON 格式)
|
|
517
|
+
> ALTER PIPE my_pipe SET COPY_JOB_HINT = 'cz.sql.split.kafka.strategy=size';
|
|
518
|
+
> ```
|
|
519
|
+
> 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
|
|
520
|
+
|
|
521
|
+
---
|
|
522
|
+
|
|
523
|
+
## 典型场景
|
|
524
|
+
|
|
525
|
+
### 场景 A:简单 JSON Topic 接入
|
|
526
|
+
|
|
527
|
+
```sql
|
|
528
|
+
-- 1. 探查
|
|
529
|
+
SELECT parse_json(value::string)['id']::STRING, parse_json(value::string)['name']::STRING
|
|
530
|
+
FROM read_kafka(
|
|
531
|
+
'kafka:9092', 'metrics', '', 'test',
|
|
532
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
533
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
534
|
+
) LIMIT 5;
|
|
535
|
+
|
|
536
|
+
-- 2. 建表
|
|
537
|
+
CREATE TABLE ods.metrics (id STRING, name STRING, value DOUBLE, kafka_ts TIMESTAMP);
|
|
538
|
+
|
|
539
|
+
-- 3. 建 Pipe
|
|
540
|
+
CREATE PIPE metrics_pipe
|
|
541
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
542
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
543
|
+
AS
|
|
544
|
+
COPY INTO ods.metrics FROM (
|
|
545
|
+
SELECT
|
|
546
|
+
j['id']::STRING, j['name']::STRING, j['value']::DOUBLE,
|
|
547
|
+
CAST(`timestamp` AS TIMESTAMP)
|
|
548
|
+
FROM (
|
|
549
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
550
|
+
FROM read_kafka(
|
|
551
|
+
'kafka:9092', 'metrics', '', 'cz_metrics',
|
|
552
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
553
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
554
|
+
)
|
|
555
|
+
)
|
|
556
|
+
);
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
### 场景 B:Kafka → ODS → DWD 实时 ETL
|
|
560
|
+
|
|
561
|
+
```sql
|
|
562
|
+
-- 1. Pipe 接入 ODS 层
|
|
563
|
+
CREATE PIPE kafka_events_pipe
|
|
564
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
565
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
566
|
+
AS
|
|
567
|
+
COPY INTO ods.events FROM (
|
|
568
|
+
SELECT
|
|
569
|
+
j['event_id']::STRING, j['user_id']::STRING, j['action']::STRING, j['ts']::TIMESTAMP
|
|
570
|
+
FROM (
|
|
571
|
+
SELECT parse_json(value::string) AS j
|
|
572
|
+
FROM read_kafka(
|
|
573
|
+
'kafka:9092', 'user_events', '', 'cz_events',
|
|
574
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
575
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
576
|
+
)
|
|
577
|
+
)
|
|
578
|
+
);
|
|
579
|
+
|
|
580
|
+
-- 2. Dynamic Table 清洗到 DWD 层
|
|
581
|
+
-- ⚠️ 注意:Dynamic Table 支持 CREATE OR REPLACE,与 Pipe 不同
|
|
582
|
+
CREATE OR REPLACE DYNAMIC TABLE dwd.events_clean
|
|
583
|
+
REFRESH INTERVAL 1 MINUTE vcluster default
|
|
584
|
+
AS
|
|
585
|
+
SELECT event_id, user_id, UPPER(action) AS action, ts, DATE(ts) AS dt
|
|
586
|
+
FROM ods.events
|
|
587
|
+
WHERE event_id IS NOT NULL AND action IS NOT NULL;
|
|
588
|
+
|
|
589
|
+
-- 3. Dynamic Table 聚合到 DWS 层
|
|
590
|
+
CREATE OR REPLACE DYNAMIC TABLE dws.events_hourly
|
|
591
|
+
REFRESH INTERVAL 5 MINUTE vcluster default
|
|
592
|
+
AS
|
|
593
|
+
SELECT DATE_TRUNC('hour', ts) AS hour, action, COUNT(*) AS cnt, COUNT(DISTINCT user_id) AS uv
|
|
594
|
+
FROM dwd.events_clean
|
|
595
|
+
GROUP BY 1, 2;
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
### 场景 C:SASL 认证 + 指定时间点消费
|
|
599
|
+
|
|
600
|
+
```sql
|
|
601
|
+
CREATE PIPE kafka_auth_pipe
|
|
602
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
603
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
604
|
+
RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
|
|
605
|
+
AS
|
|
606
|
+
COPY INTO ods.secure_events FROM (
|
|
607
|
+
SELECT
|
|
608
|
+
j['id']::STRING AS event_id,
|
|
609
|
+
j['payload']::STRING AS payload,
|
|
610
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
611
|
+
FROM (
|
|
612
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
613
|
+
FROM read_kafka(
|
|
614
|
+
'kafka.example.com:9092',
|
|
615
|
+
'secure_events',
|
|
616
|
+
'',
|
|
617
|
+
'cz_secure',
|
|
618
|
+
'', '', '', '',
|
|
619
|
+
'raw', 'raw', 0,
|
|
620
|
+
MAP(
|
|
621
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
622
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
623
|
+
'kafka.sasl.username', 'my_user',
|
|
624
|
+
'kafka.sasl.password', 'my_password'
|
|
625
|
+
)
|
|
626
|
+
)
|
|
627
|
+
)
|
|
628
|
+
);
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
---
|
|
632
|
+
|
|
633
|
+
## 故障排除
|
|
634
|
+
|
|
635
|
+
| 问题 | 排查方向 |
|
|
636
|
+
|------|---------|
|
|
637
|
+
| READ_KAFKA 语法报错 `Syntax error at or near '('` | ❌ 不要用 `TABLE(READ_KAFKA(...))` 或 `=>` 命名参数。✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))` |
|
|
638
|
+
| READ_KAFKA 报错 `cannot resolve column` | 使用了 `=` 赋值语法(如 `KAFKA_BROKER = 'xxx'`)。READ_KAFKA 只支持位置参数 |
|
|
639
|
+
| READ_KAFKA 探查无数据 | 检查 broker 地址/端口、topic 名称、网络连通性;在 MAP 中设置 `'kafka.auto.offset.reset', 'earliest'` |
|
|
640
|
+
| Pipe 创建后无数据加载 | `DESC PIPE EXTENDED` 检查是否暂停;确认 group_id 的消费位点(默认 latest,新数据才会消费) |
|
|
641
|
+
| Table Stream Pipe 语法报错 `Syntax error at or near 'SELECT'` | ❌ 不要用 `COPY INTO ... SELECT`。✅ 正确:`INSERT INTO ... SELECT FROM stream` |
|
|
642
|
+
| `CREATE OR REPLACE PIPE` 报错 AlreadyExist | ❌ ClickZetta 不支持 `CREATE OR REPLACE PIPE`。Pipe 不存在时 `CREATE OR REPLACE` 会创建成功,但 Pipe 已存在时报 AlreadyExist 错误。✅ 正确:用 `DROP PIPE` + `CREATE PIPE` 重建(与 Dynamic Table 不同,DT 支持 `CREATE OR REPLACE`) |
|
|
643
|
+
| JSON 解析报错 | 使用 `parse_json(value::string)['field']::TYPE` 语法;嵌套 JSON 需逐层 `parse_json()` 展开 |
|
|
644
|
+
| SASL 认证失败 | 确认安全协议为 SASL_PLAINTEXT(不支持 SSL);在 MAP 中设置 `kafka.sasl.mechanism`、`kafka.sasl.username`、`kafka.sasl.password` |
|
|
645
|
+
| 消费延迟持续增大 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION`;增大 VCluster 规格;使用 `COPY_JOB_HINT` 切分 task |
|
|
646
|
+
| 重建 Pipe 后数据重复 | 保持相同 group_id 且不设置 `RESET_KAFKA_GROUP_OFFSETS` |
|
|
647
|
+
| 重建 Pipe 后数据丢失 | 检查 group_id 的位点是否过期;如需回溯用 `RESET_KAFKA_GROUP_OFFSETS` 指定时间戳 |
|
|
648
|
+
| `COPY_JOB_HINT` 修改后参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数 |
|
|
649
|
+
| Pipe 作业 Failover | 查看作业详情;通常为 Kafka 连接中断或 Lakehouse 服务升级,会自动恢复 |
|
|
650
|
+
|
|
651
|
+
---
|
|
652
|
+
|
|
653
|
+
## 参考文档
|
|
654
|
+
|
|
655
|
+
- [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
|
|
656
|
+
- [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
|
|
657
|
+
- [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
|
|
658
|
+
- [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
|
|
659
|
+
- [read_kafka 函数](https://www.yunqi.tech/documents/read_kafka)
|
|
660
|
+
- [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
|
|
661
|
+
- [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
|
|
662
|
+
- [PIPE 导入语法](https://www.yunqi.tech/documents/pipe-syntax)
|
|
663
|
+
|
|
664
|
+
---
|
|
665
|
+
|
|
666
|
+
## cz-cli 替代路径
|
|
667
|
+
|
|
668
|
+
> 仅在 cz-cli 可用且 MCP 不可用时使用本节。步骤编号与上方 MCP 路径对应。
|
|
669
|
+
> 所有操作通过 `cz-cli agent run` 委托给内置 agent 完成,agent 内置完整的 MCP 工具访问能力。
|
|
670
|
+
|
|
671
|
+
### 路径一:READ_KAFKA Pipe(cz-cli 版)
|
|
672
|
+
|
|
673
|
+
#### 步骤 1-2:验证 Kafka 连接和探查数据结构
|
|
674
|
+
|
|
675
|
+
```bash
|
|
676
|
+
cz-cli agent run "验证 Kafka 连接并探查数据结构:broker 地址 <kafka-host:9092>,topic <topic-name>,消费组 test_explore,从 earliest 开始读取 10 条消息,展示原始 JSON 内容和字段结构" \
|
|
677
|
+
--format a2a --dangerously-skip-permissions
|
|
678
|
+
```
|
|
679
|
+
|
|
680
|
+
#### 步骤 3:创建目标表
|
|
681
|
+
|
|
682
|
+
```bash
|
|
683
|
+
cz-cli agent run "在 schema <my_schema> 下创建目标表 <table_name>,字段包括:<field1> <type1>, <field2> <type2>,以及 __kafka_timestamp__ TIMESTAMP 字段用于延迟监控" \
|
|
684
|
+
--format a2a --dangerously-skip-permissions
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
#### 步骤 4:创建专用 VCluster(推荐)
|
|
688
|
+
|
|
689
|
+
```bash
|
|
690
|
+
cz-cli agent run "创建名为 pipe_kafka_vc 的 GENERAL 类型 VCluster,大小 4,AUTO_SUSPEND_IN_SECOND 设为 0(常驻运行),用于 Kafka Pipe 专用" \
|
|
691
|
+
--format a2a --dangerously-skip-permissions
|
|
692
|
+
```
|
|
693
|
+
|
|
694
|
+
#### 步骤 5:创建 Kafka Pipe
|
|
695
|
+
|
|
696
|
+
```bash
|
|
697
|
+
cz-cli agent run "创建 Kafka Pipe,名称 <pipe_name>,使用 VCluster pipe_kafka_vc,BATCH_INTERVAL_IN_SECONDS=60,从 Kafka broker <host:port> 的 topic <topic> 消费数据(消费组 <group_id>,JSON 格式),将字段 <field1>, <field2> 写入目标表 <schema>.<table>" \
|
|
698
|
+
--format a2a --dangerously-skip-permissions
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
#### 步骤 6:验证 Pipe 运行状态
|
|
702
|
+
|
|
703
|
+
```bash
|
|
704
|
+
cz-cli agent run "查看 Pipe <pipe_name> 的详细状态,包括是否暂停、延迟信息,以及目标表 <schema>.<table> 的数据量和最近加载历史" \
|
|
705
|
+
--format a2a --dangerously-skip-permissions
|
|
706
|
+
```
|
|
707
|
+
|
|
708
|
+
---
|
|
709
|
+
|
|
710
|
+
### 路径二:Kafka 外部表 + Table Stream Pipe(cz-cli 版)
|
|
711
|
+
|
|
712
|
+
#### 步骤 1-4:完整创建流程
|
|
713
|
+
|
|
714
|
+
```bash
|
|
715
|
+
# 步骤 1:创建 Kafka Storage Connection
|
|
716
|
+
cz-cli agent run "创建 Kafka Storage Connection,名称 kafka_conn,bootstrap servers 为 <kafka-host:9092>,安全协议 PLAINTEXT" \
|
|
717
|
+
--format a2a --dangerously-skip-permissions
|
|
718
|
+
|
|
719
|
+
# 步骤 2:创建 Kafka 外部表
|
|
720
|
+
cz-cli agent run "创建 Kafka 外部表 kafka_<topic>_ext,使用 Connection kafka_conn,消费组 lakehouse_ext_<topic>,topic 为 <topic>,从 earliest 开始" \
|
|
721
|
+
--format a2a --dangerously-skip-permissions
|
|
722
|
+
|
|
723
|
+
# 步骤 3:创建 Table Stream
|
|
724
|
+
cz-cli agent run "在 Kafka 外部表 kafka_<topic>_ext 上创建 APPEND_ONLY 模式的 Table Stream,名称 kafka_<topic>_stream" \
|
|
725
|
+
--format a2a --dangerously-skip-permissions
|
|
726
|
+
|
|
727
|
+
# 步骤 4:创建目标表和 Pipe
|
|
728
|
+
cz-cli agent run "创建目标表 <schema>.<target_table>,然后创建 Pipe kafka_ext_<topic>_pipe,使用 VCluster pipe_kafka_vc,BATCH_INTERVAL_IN_SECONDS=60,从 Table Stream kafka_<topic>_stream 消费数据,解析 JSON value 字段写入目标表" \
|
|
729
|
+
--format a2a --dangerously-skip-permissions
|
|
730
|
+
```
|
|
731
|
+
|
|
732
|
+
---
|
|
733
|
+
|
|
734
|
+
### 监控与运维(cz-cli 版)
|
|
735
|
+
|
|
736
|
+
```bash
|
|
737
|
+
# 查看 Pipe 延迟状态
|
|
738
|
+
cz-cli agent run "查看 Pipe <pipe_name> 的延迟信息,包括 timeLag 和 offsetLag,判断是否有积压" \
|
|
739
|
+
--format a2a --dangerously-skip-permissions
|
|
740
|
+
|
|
741
|
+
# 暂停/恢复 Pipe
|
|
742
|
+
cz-cli agent run "暂停 Pipe <pipe_name>" \
|
|
743
|
+
--format a2a --dangerously-skip-permissions
|
|
744
|
+
|
|
745
|
+
cz-cli agent run "恢复 Pipe <pipe_name>" \
|
|
746
|
+
--format a2a --dangerously-skip-permissions
|
|
747
|
+
|
|
748
|
+
# 修改 Pipe 属性
|
|
749
|
+
cz-cli agent run "将 Pipe <pipe_name> 的 BATCH_INTERVAL_IN_SECONDS 改为 120:由于该属性不支持 ALTER,请先 DROP 再以相同 group_id 重建 Pipe(不要设置 RESET_KAFKA_GROUP_OFFSETS,以便从上次位点继续)" \
|
|
750
|
+
--format a2a --dangerously-skip-permissions
|
|
751
|
+
```
|