@clickzetta/cz-cli-linux-x64 0.3.1 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +9 -2
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +1 -1
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +3 -3
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +27 -1
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +82 -43
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +262 -154
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +192 -54
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +1 -1
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +70 -45
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +79 -53
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +40 -28
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +133 -71
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +11 -9
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +79 -28
- package/bin/skills/clickzetta-volume-manager/SKILL.md +48 -5
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +7 -2
- package/package.json +1 -1
|
@@ -28,8 +28,8 @@ description: |
|
|
|
28
28
|
|
|
29
29
|
| 路径 | 适用场景 | 核心对象 |
|
|
30
30
|
|------|---------|---------|
|
|
31
|
-
| **READ_KAFKA Pipe**(推荐) | 通用场景,支持复杂 SQL 转换 | `CREATE PIPE ... AS
|
|
32
|
-
| **Kafka 外部表 + Table Stream Pipe** | 需要先落原始数据再增量消费 | Kafka 外部表 → Table Stream → Pipe
|
|
31
|
+
| **READ_KAFKA Pipe**(推荐) | 通用场景,支持复杂 SQL 转换 | `CREATE PIPE ... AS COPY INTO ... FROM (SELECT ... FROM read_kafka(...))` |
|
|
32
|
+
| **Kafka 外部表 + Table Stream Pipe** | 需要先落原始数据再增量消费 | Kafka 外部表 → Table Stream → Pipe `INSERT INTO ... SELECT` |
|
|
33
33
|
|
|
34
34
|
**选择建议**:大多数场景用 READ_KAFKA Pipe 即可,更简洁高效。Kafka 外部表路径适合需要保留原始消息、多个下游消费同一 Topic 的场景。
|
|
35
35
|
|
|
@@ -56,93 +56,121 @@ description: |
|
|
|
56
56
|
|
|
57
57
|
先用 `READ_KAFKA` 函数验证网络连通性和消息格式:
|
|
58
58
|
|
|
59
|
+
> ⚠️ **READ_KAFKA 使用位置参数(positional parameters)**,不支持 `=>` 命名参数语法。参数顺序固定,不可省略。
|
|
60
|
+
|
|
59
61
|
```sql
|
|
60
|
-
-- 无认证 Kafka
|
|
62
|
+
-- 无认证 Kafka(位置参数语法)
|
|
61
63
|
SELECT *
|
|
62
|
-
FROM
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
64
|
+
FROM read_kafka(
|
|
65
|
+
'kafka.example.com:9092', -- bootstrap_servers(必填)
|
|
66
|
+
'orders', -- topic(必填)
|
|
67
|
+
'', -- topic_pattern(保留,填空字符串)
|
|
68
|
+
'test_explore', -- group_id(必填)
|
|
69
|
+
'', -- starting_offsets(探查时可填 'earliest',或留空用默认 latest)
|
|
70
|
+
'', -- ending_offsets(留空)
|
|
71
|
+
'', -- starting_timestamp(留空)
|
|
72
|
+
'', -- ending_timestamp(留空)
|
|
73
|
+
'raw', -- key_format(目前只支持 raw)
|
|
74
|
+
'raw', -- value_format(目前只支持 raw)
|
|
75
|
+
0, -- max_errors
|
|
76
|
+
MAP(
|
|
77
|
+
'kafka.security.protocol', 'PLAINTEXT',
|
|
78
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
69
79
|
)
|
|
70
80
|
)
|
|
71
81
|
LIMIT 10;
|
|
72
82
|
|
|
73
83
|
-- SASL_PLAINTEXT 认证
|
|
74
84
|
SELECT *
|
|
75
|
-
FROM
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
85
|
+
FROM read_kafka(
|
|
86
|
+
'kafka.example.com:9092',
|
|
87
|
+
'orders',
|
|
88
|
+
'',
|
|
89
|
+
'test_explore',
|
|
90
|
+
'', '', '', '',
|
|
91
|
+
'raw',
|
|
92
|
+
'raw',
|
|
93
|
+
0,
|
|
94
|
+
MAP(
|
|
95
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
96
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
97
|
+
'kafka.sasl.username', 'my_user',
|
|
98
|
+
'kafka.sasl.password', 'my_password',
|
|
99
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
84
100
|
)
|
|
85
101
|
)
|
|
86
102
|
LIMIT 10;
|
|
87
103
|
```
|
|
88
104
|
|
|
89
|
-
>
|
|
105
|
+
> **参数说明**:
|
|
106
|
+
> - 探查用的 `group_id` 建议用临时名称(如 `test_explore`),避免影响正式消费组
|
|
107
|
+
> - `kafka.auto.offset.reset` 在 MAP 中设置为 `'earliest'` 可读取历史数据
|
|
108
|
+
> - key 和 value 都是 binary 类型,需要 CAST 转换后使用
|
|
109
|
+
> - **多 Broker 地址格式**:用逗号分隔多个 broker,Pipe 会自动故障转移
|
|
110
|
+
> - ✅ 推荐:`'broker1:9092,broker2:9092,broker3:9092'`(高可用)
|
|
111
|
+
> - ⚠️ 单 broker:`'broker1:9092'`(无故障转移,不推荐生产使用)
|
|
90
112
|
|
|
91
113
|
### 步骤 2:探查 JSON 结构并确定目标表 Schema
|
|
92
114
|
|
|
93
|
-
Kafka 的 key 和 value 都是 binary 类型。用
|
|
115
|
+
Kafka 的 key 和 value 都是 binary 类型。用 `value::string` 转换后查看内容,用 `parse_json()` 解析 JSON:
|
|
94
116
|
|
|
95
117
|
```sql
|
|
96
118
|
-- 将 value 转为字符串查看原始内容
|
|
97
|
-
SELECT
|
|
98
|
-
FROM
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
)
|
|
119
|
+
SELECT key::string, value::string
|
|
120
|
+
FROM read_kafka(
|
|
121
|
+
'kafka.example.com:9092',
|
|
122
|
+
'orders',
|
|
123
|
+
'',
|
|
124
|
+
'test_schema',
|
|
125
|
+
'', '', '', '',
|
|
126
|
+
'raw', 'raw', 0,
|
|
127
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
106
128
|
)
|
|
107
129
|
LIMIT 5;
|
|
108
130
|
|
|
109
|
-
--
|
|
131
|
+
-- 解析 JSON 字段(使用 parse_json)
|
|
110
132
|
SELECT
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
FROM
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
133
|
+
j['order_id']::STRING AS order_id,
|
|
134
|
+
j['user_id']::STRING AS user_id,
|
|
135
|
+
j['amount']::DECIMAL(10,2) AS amount,
|
|
136
|
+
j['status']::STRING AS status,
|
|
137
|
+
timestamp_millis(j['created_at']::BIGINT) AS created_at
|
|
138
|
+
FROM (
|
|
139
|
+
SELECT parse_json(value::string) AS j
|
|
140
|
+
FROM read_kafka(
|
|
141
|
+
'kafka.example.com:9092',
|
|
142
|
+
'orders',
|
|
143
|
+
'',
|
|
144
|
+
'test_schema',
|
|
145
|
+
'', '', '', '',
|
|
146
|
+
'raw', 'raw', 0,
|
|
147
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
123
148
|
)
|
|
124
|
-
|
|
125
|
-
|
|
149
|
+
LIMIT 5
|
|
150
|
+
);
|
|
126
151
|
|
|
127
|
-
-- 多层嵌套 JSON
|
|
152
|
+
-- 多层嵌套 JSON 解析(逐层 parse_json 展开)
|
|
128
153
|
SELECT
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
FROM
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
154
|
+
j['id']::STRING AS id,
|
|
155
|
+
j['type']::STRING AS event_type,
|
|
156
|
+
parse_json(j['event']::STRING)['action']::STRING AS action,
|
|
157
|
+
parse_json(parse_json(j['event']::STRING)['payload']::STRING)['ref']::STRING AS ref
|
|
158
|
+
FROM (
|
|
159
|
+
SELECT parse_json(value::string) AS j
|
|
160
|
+
FROM read_kafka(
|
|
161
|
+
'kafka.example.com:9092',
|
|
162
|
+
'events',
|
|
163
|
+
'',
|
|
164
|
+
'test_nested',
|
|
165
|
+
'', '', '', '',
|
|
166
|
+
'raw', 'raw', 0,
|
|
167
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
140
168
|
)
|
|
141
|
-
|
|
142
|
-
|
|
169
|
+
LIMIT 5
|
|
170
|
+
);
|
|
143
171
|
```
|
|
144
172
|
|
|
145
|
-
> **最佳实践**:在 SELECT 中将所有嵌套 JSON 字符串都 `
|
|
173
|
+
> **最佳实践**:在 SELECT 中将所有嵌套 JSON 字符串都 `parse_json` 展开后再落表,避免下游查询重复计算。
|
|
146
174
|
|
|
147
175
|
### 步骤 3:创建目标表
|
|
148
176
|
|
|
@@ -176,29 +204,42 @@ CREATE VCLUSTER IF NOT EXISTS pipe_kafka_vc
|
|
|
176
204
|
### 步骤 5:创建 Kafka Pipe
|
|
177
205
|
|
|
178
206
|
```sql
|
|
179
|
-
CREATE OR REPLACE PIPE
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
207
|
+
-- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,需用 CREATE PIPE 或先 DROP 再 CREATE
|
|
208
|
+
CREATE PIPE kafka_orders_pipe
|
|
209
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
210
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
211
|
+
BATCH_SIZE_PER_KAFKA_PARTITION = '500000'
|
|
183
212
|
AS
|
|
184
|
-
|
|
185
|
-
SELECT
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
FROM
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
213
|
+
COPY INTO ods.kafka_orders FROM (
|
|
214
|
+
SELECT
|
|
215
|
+
j['order_id']::STRING,
|
|
216
|
+
j['user_id']::STRING,
|
|
217
|
+
j['amount']::DECIMAL(10,2),
|
|
218
|
+
j['status']::STRING,
|
|
219
|
+
j['created_at']::TIMESTAMP,
|
|
220
|
+
CAST(`timestamp` AS TIMESTAMP) AS __kafka_timestamp__
|
|
221
|
+
FROM (
|
|
222
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
223
|
+
FROM read_kafka(
|
|
224
|
+
'kafka.example.com:9092', -- bootstrap_servers
|
|
225
|
+
'orders', -- topic
|
|
226
|
+
'', -- reserved
|
|
227
|
+
'lakehouse_orders', -- group_id(正式消费组名)
|
|
228
|
+
'', '', '', '', -- starting_offsets / ending_offsets / starting_timestamp / ending_timestamp 均留空,由 Pipe 自动管理消费位点
|
|
229
|
+
'raw', -- key_format
|
|
230
|
+
'raw', -- value_format
|
|
231
|
+
0, -- max_errors
|
|
232
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
233
|
+
)
|
|
198
234
|
)
|
|
199
235
|
);
|
|
200
236
|
```
|
|
201
237
|
|
|
238
|
+
> ⚠️ **Pipe 中 READ_KAFKA 的关键区别**:
|
|
239
|
+
> - 位点相关的四个参数(starting_offsets、ending_offsets、starting_timestamp、ending_timestamp)**必须留空**,由 Pipe 自动管理消费位点
|
|
240
|
+
> - 不要设置 `kafka.auto.offset.reset`(由 Pipe 的 `RESET_KAFKA_GROUP_OFFSETS` 参数控制)
|
|
241
|
+
> - group_id 使用正式名称(如 `lakehouse_orders`),Pipe 会持久化消费位点
|
|
242
|
+
|
|
202
243
|
**关键参数说明:**
|
|
203
244
|
|
|
204
245
|
| 参数 | 默认值 | 说明 |
|
|
@@ -220,7 +261,7 @@ FROM TABLE(
|
|
|
220
261
|
| `'latest'` | 重置到最新位点(仅消费新数据) |
|
|
221
262
|
| `'1737789688000'` | 重置到指定毫秒时间戳对应的位点 |
|
|
222
263
|
|
|
223
|
-
> **注意**:Pipe 中的
|
|
264
|
+
> **注意**:Pipe 中 read_kafka 的位点相关参数(starting_offsets、ending_offsets、starting_timestamp、ending_timestamp)必须留空,由 Pipe 自动管理消费位点;这与独立使用 read_kafka 探查时可以显式填写起始位点不同。
|
|
224
265
|
|
|
225
266
|
### 步骤 6:验证 Pipe 运行状态
|
|
226
267
|
|
|
@@ -234,7 +275,7 @@ SELECT COUNT(*) FROM ods.kafka_orders;
|
|
|
234
275
|
SELECT * FROM ods.kafka_orders LIMIT 10;
|
|
235
276
|
|
|
236
277
|
-- 查看加载历史(保留 7 天)
|
|
237
|
-
SELECT * FROM
|
|
278
|
+
SELECT * FROM load_history('ods.kafka_orders')
|
|
238
279
|
ORDER BY last_load_time DESC
|
|
239
280
|
LIMIT 20;
|
|
240
281
|
|
|
@@ -255,22 +296,40 @@ CREATE STORAGE CONNECTION IF NOT EXISTS kafka_conn
|
|
|
255
296
|
TYPE KAFKA
|
|
256
297
|
BOOTSTRAP_SERVERS = ['kafka.example.com:9092']
|
|
257
298
|
SECURITY_PROTOCOL = 'PLAINTEXT';
|
|
299
|
+
|
|
300
|
+
-- 删除 Connection(⚠️ 注意:用 DROP CONNECTION,不是 DROP STORAGE CONNECTION)
|
|
301
|
+
DROP CONNECTION IF EXISTS kafka_conn;
|
|
258
302
|
```
|
|
259
303
|
|
|
260
304
|
### 步骤 2:创建 Kafka 外部表
|
|
261
305
|
|
|
262
306
|
```sql
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
307
|
+
-- ⚠️ 必须显式指定列定义,不能省略
|
|
308
|
+
-- ⚠️ offset 是保留字,必须用反引号转义
|
|
309
|
+
CREATE EXTERNAL TABLE kafka_orders_ext (
|
|
310
|
+
topic STRING,
|
|
311
|
+
partition INT,
|
|
312
|
+
`offset` BIGINT,
|
|
313
|
+
`timestamp` TIMESTAMP,
|
|
314
|
+
timestamp_type STRING,
|
|
315
|
+
headers STRING,
|
|
316
|
+
key BINARY,
|
|
317
|
+
value BINARY
|
|
318
|
+
)
|
|
319
|
+
USING KAFKA
|
|
320
|
+
OPTIONS (
|
|
321
|
+
'group_id' = 'lakehouse_ext_orders',
|
|
322
|
+
'topics' = 'orders',
|
|
323
|
+
'starting_offset' = 'earliest'
|
|
324
|
+
)
|
|
325
|
+
CONNECTION kafka_conn;
|
|
271
326
|
```
|
|
272
327
|
|
|
273
|
-
|
|
328
|
+
> **注意**:
|
|
329
|
+
> - 列定义是**必须的**,不指定会报错 `failed to detect columns`
|
|
330
|
+
> - `offset` 和 `timestamp` 是保留字,定义和查询时都需要反引号转义
|
|
331
|
+
> - 删除外部表用 `DROP TABLE`(❌ `DROP EXTERNAL TABLE` 会报语法错误)
|
|
332
|
+
> - 删除 Connection 用 `DROP CONNECTION`(❌ `DROP STORAGE CONNECTION` 会报语法错误)
|
|
274
333
|
|
|
275
334
|
### 步骤 3:创建 Table Stream
|
|
276
335
|
|
|
@@ -292,19 +351,22 @@ CREATE TABLE IF NOT EXISTS ods.kafka_orders_from_ext (
|
|
|
292
351
|
);
|
|
293
352
|
|
|
294
353
|
-- Pipe(从 Table Stream 消费)
|
|
354
|
+
-- ⚠️ 注意:Table Stream Pipe 使用 INSERT INTO ... SELECT 语法,不是 COPY INTO
|
|
295
355
|
CREATE PIPE kafka_ext_orders_pipe
|
|
296
|
-
VIRTUAL_CLUSTER = pipe_kafka_vc
|
|
297
|
-
BATCH_INTERVAL_IN_SECONDS = 60
|
|
356
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
357
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
298
358
|
AS
|
|
299
|
-
|
|
359
|
+
INSERT INTO ods.kafka_orders_from_ext
|
|
300
360
|
SELECT
|
|
301
361
|
GET_JSON_OBJECT(CAST(value AS STRING), '$.order_id') AS order_id,
|
|
302
362
|
GET_JSON_OBJECT(CAST(value AS STRING), '$.user_id') AS user_id,
|
|
303
363
|
CAST(GET_JSON_OBJECT(CAST(value AS STRING), '$.amount') AS DECIMAL(10,2)) AS amount,
|
|
304
|
-
CAST(timestamp AS TIMESTAMP) AS kafka_ts
|
|
364
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
305
365
|
FROM kafka_orders_stream;
|
|
306
366
|
```
|
|
307
367
|
|
|
368
|
+
> **清理外部表**:使用 `DROP TABLE kafka_orders_ext`(不是 `DROP EXTERNAL TABLE`)
|
|
369
|
+
|
|
308
370
|
---
|
|
309
371
|
|
|
310
372
|
## 监控与运维
|
|
@@ -346,16 +408,20 @@ ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = false;
|
|
|
346
408
|
### 修改 Pipe 属性
|
|
347
409
|
|
|
348
410
|
```sql
|
|
349
|
-
-- 修改批处理间隔
|
|
350
|
-
ALTER PIPE kafka_orders_pipe SET BATCH_INTERVAL_IN_SECONDS = 120;
|
|
351
|
-
|
|
352
|
-
-- 修改每分区批大小
|
|
353
|
-
ALTER PIPE kafka_orders_pipe SET BATCH_SIZE_PER_KAFKA_PARTITION = 1000000;
|
|
354
|
-
|
|
355
411
|
-- 修改 VCluster
|
|
356
412
|
ALTER PIPE kafka_orders_pipe SET VIRTUAL_CLUSTER = 'new_vc';
|
|
413
|
+
|
|
414
|
+
-- 修改 COPY_JOB_HINT
|
|
415
|
+
ALTER PIPE kafka_orders_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
357
416
|
```
|
|
358
417
|
|
|
418
|
+
> ⚠️ **ALTER PIPE 支持的属性**(经验证):
|
|
419
|
+
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
420
|
+
> - ✅ `VIRTUAL_CLUSTER`
|
|
421
|
+
> - ✅ `COPY_JOB_HINT`
|
|
422
|
+
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持修改,需删除重建 Pipe)
|
|
423
|
+
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持修改,需删除重建 Pipe)
|
|
424
|
+
>
|
|
359
425
|
> 每次 ALTER 只能修改一个属性。不支持修改 COPY/INSERT 语句逻辑,需删除重建。
|
|
360
426
|
|
|
361
427
|
### 修改 Pipe SQL 逻辑(需删除重建)
|
|
@@ -365,29 +431,35 @@ ALTER PIPE kafka_orders_pipe SET VIRTUAL_CLUSTER = 'new_vc';
|
|
|
365
431
|
DROP PIPE kafka_orders_pipe;
|
|
366
432
|
|
|
367
433
|
-- 2. 重建 Pipe(不要设置 RESET_KAFKA_GROUP_OFFSETS,保持从上次位点继续)
|
|
434
|
+
-- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,使用 CREATE PIPE
|
|
368
435
|
CREATE PIPE kafka_orders_pipe
|
|
369
|
-
VIRTUAL_CLUSTER = pipe_kafka_vc
|
|
370
|
-
BATCH_INTERVAL_IN_SECONDS = 60
|
|
436
|
+
VIRTUAL_CLUSTER = 'pipe_kafka_vc'
|
|
437
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
371
438
|
AS
|
|
372
|
-
|
|
373
|
-
SELECT
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
FROM
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
439
|
+
COPY INTO ods.kafka_orders FROM (
|
|
440
|
+
SELECT
|
|
441
|
+
j['order_id']::STRING,
|
|
442
|
+
j['user_id']::STRING,
|
|
443
|
+
j['amount']::DECIMAL(10,2),
|
|
444
|
+
UPPER(j['status']::STRING), -- 修改了转换逻辑
|
|
445
|
+
j['created_at']::TIMESTAMP,
|
|
446
|
+
CAST(`timestamp` AS TIMESTAMP) AS __kafka_timestamp__
|
|
447
|
+
FROM (
|
|
448
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
449
|
+
FROM read_kafka(
|
|
450
|
+
'kafka.example.com:9092',
|
|
451
|
+
'orders',
|
|
452
|
+
'',
|
|
453
|
+
'lakehouse_orders', -- 保持相同 group_id
|
|
454
|
+
'', '', '', '',
|
|
455
|
+
'raw', 'raw', 0,
|
|
456
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
457
|
+
)
|
|
386
458
|
)
|
|
387
459
|
);
|
|
388
460
|
```
|
|
389
461
|
|
|
390
|
-
> **关键**:重建时保持相同的 `
|
|
462
|
+
> **关键**:重建时保持相同的 `group_id`,且不设置 `RESET_KAFKA_GROUP_OFFSETS`,Pipe 会从上次消费位点继续。
|
|
391
463
|
|
|
392
464
|
---
|
|
393
465
|
|
|
@@ -403,7 +475,7 @@ FROM TABLE(
|
|
|
403
475
|
|
|
404
476
|
| 问题 | 调优方向 | 操作 |
|
|
405
477
|
|------|---------|------|
|
|
406
|
-
| 每批读取不完一个周期的数据 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION` |
|
|
478
|
+
| 每批读取不完一个周期的数据 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION` | 删除重建 Pipe 时设置更大的值(如 `BATCH_SIZE_PER_KAFKA_PARTITION = '1000000'`) |
|
|
407
479
|
| 作业需要多轮才能完成 | 增大 VCluster 规格(使 core 数 ≥ partition 数) | `ALTER VCLUSTER ... SET VCLUSTER_SIZE = 16` |
|
|
408
480
|
| partition 少但数据量大 | 按条数切分 task | `ALTER PIPE ... SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}'` |
|
|
409
481
|
|
|
@@ -431,40 +503,61 @@ FROM TABLE(
|
|
|
431
503
|
|
|
432
504
|
```sql
|
|
433
505
|
-- 1. 探查
|
|
434
|
-
SELECT
|
|
435
|
-
FROM
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
506
|
+
SELECT parse_json(value::string)['id']::STRING, parse_json(value::string)['name']::STRING
|
|
507
|
+
FROM read_kafka(
|
|
508
|
+
'kafka:9092', 'metrics', '', 'test',
|
|
509
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
510
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
511
|
+
) LIMIT 5;
|
|
439
512
|
|
|
440
513
|
-- 2. 建表
|
|
441
514
|
CREATE TABLE ods.metrics (id STRING, name STRING, value DOUBLE, kafka_ts TIMESTAMP);
|
|
442
515
|
|
|
443
516
|
-- 3. 建 Pipe
|
|
444
|
-
CREATE PIPE metrics_pipe
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
517
|
+
CREATE PIPE metrics_pipe
|
|
518
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
519
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
520
|
+
AS
|
|
521
|
+
COPY INTO ods.metrics FROM (
|
|
522
|
+
SELECT
|
|
523
|
+
j['id']::STRING, j['name']::STRING, j['value']::DOUBLE,
|
|
524
|
+
CAST(`timestamp` AS TIMESTAMP)
|
|
525
|
+
FROM (
|
|
526
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
527
|
+
FROM read_kafka(
|
|
528
|
+
'kafka:9092', 'metrics', '', 'cz_metrics',
|
|
529
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
530
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
531
|
+
)
|
|
532
|
+
)
|
|
533
|
+
);
|
|
451
534
|
```
|
|
452
535
|
|
|
453
536
|
### 场景 B:Kafka → ODS → DWD 实时 ETL
|
|
454
537
|
|
|
455
538
|
```sql
|
|
456
539
|
-- 1. Pipe 接入 ODS 层
|
|
457
|
-
CREATE PIPE kafka_events_pipe
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
540
|
+
CREATE PIPE kafka_events_pipe
|
|
541
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
542
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
543
|
+
AS
|
|
544
|
+
COPY INTO ods.events FROM (
|
|
545
|
+
SELECT
|
|
546
|
+
j['event_id']::STRING, j['user_id']::STRING, j['action']::STRING, j['ts']::TIMESTAMP
|
|
547
|
+
FROM (
|
|
548
|
+
SELECT parse_json(value::string) AS j
|
|
549
|
+
FROM read_kafka(
|
|
550
|
+
'kafka:9092', 'user_events', '', 'cz_events',
|
|
551
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
552
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
553
|
+
)
|
|
554
|
+
)
|
|
555
|
+
);
|
|
464
556
|
|
|
465
557
|
-- 2. Dynamic Table 清洗到 DWD 层
|
|
558
|
+
-- ⚠️ 注意:Dynamic Table 支持 CREATE OR REPLACE,与 Pipe 不同
|
|
466
559
|
CREATE OR REPLACE DYNAMIC TABLE dwd.events_clean
|
|
467
|
-
REFRESH
|
|
560
|
+
REFRESH INTERVAL 1 MINUTE vcluster default
|
|
468
561
|
AS
|
|
469
562
|
SELECT event_id, user_id, UPPER(action) AS action, ts, DATE(ts) AS dt
|
|
470
563
|
FROM ods.events
|
|
@@ -472,7 +565,7 @@ WHERE event_id IS NOT NULL AND action IS NOT NULL;
|
|
|
472
565
|
|
|
473
566
|
-- 3. Dynamic Table 聚合到 DWS 层
|
|
474
567
|
CREATE OR REPLACE DYNAMIC TABLE dws.events_hourly
|
|
475
|
-
REFRESH
|
|
568
|
+
REFRESH INTERVAL 5 MINUTE vcluster default
|
|
476
569
|
AS
|
|
477
570
|
SELECT DATE_TRUNC('hour', ts) AS hour, action, COUNT(*) AS cnt, COUNT(DISTINCT user_id) AS uv
|
|
478
571
|
FROM dwd.events_clean
|
|
@@ -483,20 +576,31 @@ GROUP BY 1, 2;
|
|
|
483
576
|
|
|
484
577
|
```sql
|
|
485
578
|
CREATE PIPE kafka_auth_pipe
|
|
486
|
-
VIRTUAL_CLUSTER = pipe_vc
|
|
487
|
-
BATCH_INTERVAL_IN_SECONDS = 60
|
|
579
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
580
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
488
581
|
RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
|
|
489
582
|
AS
|
|
490
|
-
|
|
491
|
-
SELECT
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
583
|
+
COPY INTO ods.secure_events FROM (
|
|
584
|
+
SELECT
|
|
585
|
+
j['id']::STRING AS event_id,
|
|
586
|
+
j['payload']::STRING AS payload,
|
|
587
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
588
|
+
FROM (
|
|
589
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
590
|
+
FROM read_kafka(
|
|
591
|
+
'kafka.example.com:9092',
|
|
592
|
+
'secure_events',
|
|
593
|
+
'',
|
|
594
|
+
'cz_secure',
|
|
595
|
+
'', '', '', '',
|
|
596
|
+
'raw', 'raw', 0,
|
|
597
|
+
MAP(
|
|
598
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
599
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
600
|
+
'kafka.sasl.username', 'my_user',
|
|
601
|
+
'kafka.sasl.password', 'my_password'
|
|
602
|
+
)
|
|
603
|
+
)
|
|
500
604
|
)
|
|
501
605
|
);
|
|
502
606
|
```
|
|
@@ -507,12 +611,16 @@ FROM TABLE(
|
|
|
507
611
|
|
|
508
612
|
| 问题 | 排查方向 |
|
|
509
613
|
|------|---------|
|
|
510
|
-
| READ_KAFKA
|
|
614
|
+
| READ_KAFKA 语法报错 `Syntax error at or near '('` | ❌ 不要用 `TABLE(READ_KAFKA(...))` 或 `=>` 命名参数。✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))` |
|
|
615
|
+
| READ_KAFKA 报错 `cannot resolve column` | 使用了 `=` 赋值语法(如 `KAFKA_BROKER = 'xxx'`)。READ_KAFKA 只支持位置参数 |
|
|
616
|
+
| READ_KAFKA 探查无数据 | 检查 broker 地址/端口、topic 名称、网络连通性;在 MAP 中设置 `'kafka.auto.offset.reset', 'earliest'` |
|
|
511
617
|
| Pipe 创建后无数据加载 | `DESC PIPE EXTENDED` 检查是否暂停;确认 group_id 的消费位点(默认 latest,新数据才会消费) |
|
|
512
|
-
|
|
|
513
|
-
|
|
|
618
|
+
| Table Stream Pipe 语法报错 `Syntax error at or near 'SELECT'` | ❌ 不要用 `COPY INTO ... SELECT`。✅ 正确:`INSERT INTO ... SELECT ... FROM stream` |
|
|
619
|
+
| `CREATE OR REPLACE PIPE` 报错 AlreadyExist | ❌ ClickZetta 不支持 `CREATE OR REPLACE PIPE`。Pipe 不存在时 `CREATE OR REPLACE` 会创建成功,但 Pipe 已存在时报 AlreadyExist 错误。✅ 正确:用 `DROP PIPE` + `CREATE PIPE` 重建(与 Dynamic Table 不同,DT 支持 `CREATE OR REPLACE`) |
|
|
620
|
+
| JSON 解析报错 | 使用 `parse_json(value::string)['field']::TYPE` 语法;嵌套 JSON 需逐层 `parse_json()` 展开 |
|
|
621
|
+
| SASL 认证失败 | 确认安全协议为 SASL_PLAINTEXT(不支持 SSL);在 MAP 中设置 `kafka.sasl.mechanism`、`kafka.sasl.username`、`kafka.sasl.password` |
|
|
514
622
|
| 消费延迟持续增大 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持 `ALTER PIPE` 修改,需删除重建 Pipe 时设置);增大 VCluster 规格;使用 `COPY_JOB_HINT` 切分 task |
|
|
515
|
-
| 重建 Pipe 后数据重复 | 保持相同
|
|
623
|
+
| 重建 Pipe 后数据重复 | 保持相同 group_id 且不设置 `RESET_KAFKA_GROUP_OFFSETS` |
|
|
516
624
|
| 重建 Pipe 后数据丢失 | 检查 group_id 的位点是否过期;如需回溯用 `RESET_KAFKA_GROUP_OFFSETS` 指定时间戳 |
|
|
517
625
|
| `COPY_JOB_HINT` 修改后参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数 |
|
|
518
626
|
| Pipe 作业 Failover | 查看作业详情;通常为 Kafka 连接中断或 Lakehouse 服务升级,会自动恢复 |
|