@clickzetta/cz-cli-linux-x64 0.3.1 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +9 -2
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +1 -1
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +3 -3
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +27 -1
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +82 -43
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +262 -154
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +192 -54
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +1 -1
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +70 -45
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +79 -53
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +40 -28
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +133 -71
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +11 -9
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +79 -28
- package/bin/skills/clickzetta-volume-manager/SKILL.md +48 -5
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +7 -2
- package/package.json +1 -1
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# Pipe SQL 参考
|
|
2
2
|
|
|
3
3
|
> **⚠️ ClickZetta 特有语法**
|
|
4
|
-
> - Kafka 读取函数是 `
|
|
5
|
-
> -
|
|
6
|
-
> - JSON 字段提取用 `$1:field_name::TYPE` 语法(`$1` 表示整行 JSON)
|
|
4
|
+
> - Kafka 读取函数是 `read_kafka(...)`,使用**位置参数**(不是命名参数 `=>`)
|
|
5
|
+
> - JSON 字段提取用 `parse_json(value::string)['field']::TYPE` 语法
|
|
7
6
|
> - Pipe 创建后默认自动启动,无需手动 RESUME
|
|
7
|
+
> - OSS Pipe 的 `PURGE=true` 放在 `USING <format>` 子句的最后;若带 `OPTIONS (...)`,则 `OPTIONS` 在前、`PURGE=true` 在后(如 `USING CSV PURGE=true` 或 `USING CSV OPTIONS (...) PURGE=true`)
|
|
8
8
|
|
|
9
9
|
Pipe 是 ClickZetta Lakehouse 的持续数据导入对象,通过 SQL 定义从 Kafka 或对象存储(OSS/S3/COS)自动、持续地将数据导入目标表,无需外部调度。
|
|
10
10
|
|
|
@@ -12,21 +12,24 @@ Pipe 是 ClickZetta Lakehouse 的持续数据导入对象,通过 SQL 定义从
|
|
|
12
12
|
|
|
13
13
|
```sql
|
|
14
14
|
CREATE [ OR REPLACE ] PIPE <pipe_name>
|
|
15
|
-
|
|
16
|
-
[
|
|
15
|
+
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
16
|
+
[ BATCH_INTERVAL_IN_SECONDS = '<seconds>' ]
|
|
17
|
+
[ BATCH_SIZE_PER_KAFKA_PARTITION = '<count>' ]
|
|
18
|
+
[ RESET_KAFKA_GROUP_OFFSETS = '<none|valid|earliest|latest|timestamp_ms>' ]
|
|
19
|
+
[ COPY_JOB_HINT = '<json>' ]
|
|
17
20
|
AS
|
|
18
|
-
|
|
19
|
-
SELECT <expr> [, ...]
|
|
20
|
-
FROM
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
21
|
+
COPY INTO <target_table> FROM (
|
|
22
|
+
SELECT <expr> [, ...]
|
|
23
|
+
FROM read_kafka(
|
|
24
|
+
'<bootstrap_servers>', -- 必填:Kafka 集群地址
|
|
25
|
+
'<topic>', -- 必填:Topic 名称
|
|
26
|
+
'', -- 保留(填空字符串)
|
|
27
|
+
'<group_id>', -- 必填:持久消费者组 ID
|
|
28
|
+
'', '', '', '', -- 位置参数留空,由 Pipe 自动管理
|
|
29
|
+
'raw', -- key 格式(目前只支持 raw)
|
|
30
|
+
'raw', -- value 格式(目前只支持 raw)
|
|
31
|
+
0, -- max_errors
|
|
32
|
+
MAP(<kafka_config>) -- Kafka 配置参数
|
|
30
33
|
)
|
|
31
34
|
);
|
|
32
35
|
```
|
|
@@ -35,61 +38,121 @@ FROM TABLE(
|
|
|
35
38
|
```sql
|
|
36
39
|
-- 从 Kafka 持续导入 JSON 数据
|
|
37
40
|
CREATE OR REPLACE PIPE kafka_orders_pipe
|
|
38
|
-
|
|
41
|
+
VIRTUAL_CLUSTER = 'default'
|
|
42
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
39
43
|
AS
|
|
40
|
-
|
|
41
|
-
SELECT
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
44
|
+
COPY INTO ods.orders FROM (
|
|
45
|
+
SELECT
|
|
46
|
+
j['order_id']::STRING AS order_id,
|
|
47
|
+
j['user_id']::STRING AS user_id,
|
|
48
|
+
j['amount']::DECIMAL(10,2) AS amount,
|
|
49
|
+
j['created_at']::TIMESTAMP AS created_at,
|
|
50
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
51
|
+
FROM (
|
|
52
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
53
|
+
FROM read_kafka(
|
|
54
|
+
'kafka.example.com:9092',
|
|
55
|
+
'orders',
|
|
56
|
+
'',
|
|
57
|
+
'lakehouse_consumer',
|
|
58
|
+
'', '', '', '',
|
|
59
|
+
'raw', 'raw', 0,
|
|
60
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
61
|
+
)
|
|
62
|
+
)
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
-- SASL 认证
|
|
66
|
+
CREATE PIPE kafka_secure_pipe
|
|
67
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
68
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
69
|
+
AS
|
|
70
|
+
COPY INTO ods.secure_events FROM (
|
|
71
|
+
SELECT parse_json(value::string)['id']::STRING AS id,
|
|
72
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
73
|
+
FROM read_kafka(
|
|
74
|
+
'kafka.example.com:9092', 'secure_events', '', 'cz_secure',
|
|
75
|
+
'', '', '', '', 'raw', 'raw', 0,
|
|
76
|
+
MAP(
|
|
77
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
78
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
79
|
+
'kafka.sasl.username', 'my_user',
|
|
80
|
+
'kafka.sasl.password', 'my_password'
|
|
81
|
+
)
|
|
53
82
|
)
|
|
54
83
|
);
|
|
55
84
|
```
|
|
56
85
|
|
|
86
|
+
## 验证 Kafka 连接(创建 Pipe 前)
|
|
87
|
+
|
|
88
|
+
独立使用 `read_kafka` 探查数据时,可以在 MAP 中设置 `kafka.auto.offset.reset`:
|
|
89
|
+
|
|
90
|
+
```sql
|
|
91
|
+
-- 验证连接和数据格式
|
|
92
|
+
SELECT value::string
|
|
93
|
+
FROM read_kafka(
|
|
94
|
+
'kafka.example.com:9092',
|
|
95
|
+
'orders',
|
|
96
|
+
'',
|
|
97
|
+
'test_explore',
|
|
98
|
+
'', '', '', '',
|
|
99
|
+
'raw', 'raw', 0,
|
|
100
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
101
|
+
)
|
|
102
|
+
LIMIT 10;
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
> ⚠️ **独立探查 vs Pipe 中的区别**:
|
|
106
|
+
> - 独立探查:可在 MAP 中设置 `kafka.auto.offset.reset` 为 `earliest` 读取历史数据
|
|
107
|
+
> - Pipe 中:`read_kafka` 的第 5–8 个位置参数必须留空(填 `''`),消费位点由 Pipe 的 `RESET_KAFKA_GROUP_OFFSETS` 参数控制
|
|
108
|
+
|
|
57
109
|
## CREATE PIPE — 从对象存储导入
|
|
58
110
|
|
|
59
111
|
```sql
|
|
60
112
|
CREATE [ OR REPLACE ] PIPE [ IF NOT EXISTS ] <pipe_name>
|
|
61
|
-
VIRTUAL_CLUSTER = <virtual_cluster_name>
|
|
62
|
-
INGEST_MODE =
|
|
113
|
+
VIRTUAL_CLUSTER = '<virtual_cluster_name>'
|
|
114
|
+
INGEST_MODE = 'LIST_PURGE' | 'EVENT_NOTIFICATION'
|
|
115
|
+
[ COMMENT '<comment>' ]
|
|
63
116
|
[ COPY_JOB_HINT = '<hint>' ]
|
|
64
117
|
AS
|
|
65
118
|
COPY INTO <target_table>
|
|
66
119
|
FROM VOLUME <volume_name>
|
|
67
|
-
USING <csv | parquet | orc | json>
|
|
68
|
-
[ OPTIONS ('<key>' = '<value>', ...) ];
|
|
120
|
+
USING <csv | parquet | orc | json> [ OPTIONS ('<key>' = '<value>', ...) ] [ PURGE=true ];
|
|
69
121
|
```
|
|
70
122
|
|
|
71
123
|
**关键参数:**
|
|
72
124
|
- `VIRTUAL_CLUSTER`:指定虚拟集群名称(OSS Pipe 必填)
|
|
73
|
-
- `INGEST_MODE = LIST_PURGE
|
|
74
|
-
- `INGEST_MODE = EVENT_NOTIFICATION`:事件通知模式,低延迟(仅阿里云 OSS + AWS S3
|
|
75
|
-
- `
|
|
125
|
+
- `INGEST_MODE = 'LIST_PURGE'`:通用模式,定期扫描文件列表,必须设置 `PURGE=true`
|
|
126
|
+
- `INGEST_MODE = 'EVENT_NOTIFICATION'`:事件通知模式,低延迟(仅阿里云 OSS + AWS S3),不需要 `PURGE=true`
|
|
127
|
+
- `COMMENT 'text'`:不带等号(`COMMENT = 'text'` 会报错)
|
|
128
|
+
- `PURGE=true`:放在最后,OPTIONS 在其之前:`USING CSV OPTIONS (...) PURGE=true`
|
|
76
129
|
- PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数
|
|
77
130
|
|
|
78
131
|
**示例:**
|
|
79
132
|
```sql
|
|
80
|
-
--
|
|
133
|
+
-- LIST_PURGE 模式(不带 OPTIONS)
|
|
81
134
|
CREATE OR REPLACE PIPE oss_events_pipe
|
|
82
|
-
VIRTUAL_CLUSTER = default
|
|
83
|
-
INGEST_MODE = LIST_PURGE
|
|
135
|
+
VIRTUAL_CLUSTER = 'default'
|
|
136
|
+
INGEST_MODE = 'LIST_PURGE'
|
|
137
|
+
COMMENT 'OSS events pipeline'
|
|
84
138
|
AS
|
|
85
139
|
COPY INTO ods.events
|
|
86
140
|
FROM VOLUME my_oss_volume
|
|
87
|
-
USING PARQUET;
|
|
141
|
+
USING PARQUET PURGE=true;
|
|
88
142
|
|
|
89
|
-
--
|
|
90
|
-
CREATE
|
|
91
|
-
VIRTUAL_CLUSTER = default
|
|
92
|
-
INGEST_MODE =
|
|
143
|
+
-- CSV 格式带 OPTIONS(OPTIONS 在 PURGE 之前)
|
|
144
|
+
CREATE PIPE oss_csv_pipe
|
|
145
|
+
VIRTUAL_CLUSTER = 'default'
|
|
146
|
+
INGEST_MODE = 'LIST_PURGE'
|
|
147
|
+
AS
|
|
148
|
+
COPY INTO ods.csv_data
|
|
149
|
+
FROM VOLUME my_csv_volume
|
|
150
|
+
USING CSV OPTIONS ('header' = 'true', 'sep' = ',') PURGE=true;
|
|
151
|
+
|
|
152
|
+
-- EVENT_NOTIFICATION 模式(不需要 PURGE)
|
|
153
|
+
CREATE PIPE oss_event_pipe
|
|
154
|
+
VIRTUAL_CLUSTER = 'default'
|
|
155
|
+
INGEST_MODE = 'EVENT_NOTIFICATION'
|
|
93
156
|
ALICLOUD_MNS_QUEUE = 'my-mns-queue-name'
|
|
94
157
|
AS
|
|
95
158
|
COPY INTO ods.events
|
|
@@ -107,6 +170,24 @@ ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = true;
|
|
|
107
170
|
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = false;
|
|
108
171
|
```
|
|
109
172
|
|
|
173
|
+
## 修改 Pipe 属性
|
|
174
|
+
|
|
175
|
+
```sql
|
|
176
|
+
-- 每次只能修改一个属性
|
|
177
|
+
ALTER PIPE <pipe_name> SET VIRTUAL_CLUSTER = 'new_vc';
|
|
178
|
+
ALTER PIPE <pipe_name> SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
> ⚠️ **ALTER PIPE 支持的属性**:
|
|
182
|
+
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
183
|
+
> - ✅ `VIRTUAL_CLUSTER`
|
|
184
|
+
> - ✅ `COPY_JOB_HINT`
|
|
185
|
+
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持修改,需删除重建)
|
|
186
|
+
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持修改,需删除重建)
|
|
187
|
+
>
|
|
188
|
+
> 不支持修改 COPY/INSERT 语句逻辑,需删除 Pipe 后重建。
|
|
189
|
+
> `COPY_JOB_HINT` 修改会覆盖所有已有 hints,需一次性设置全部参数。
|
|
190
|
+
|
|
110
191
|
## DROP PIPE
|
|
111
192
|
|
|
112
193
|
```sql
|
|
@@ -119,40 +200,21 @@ DROP PIPE [ IF EXISTS ] <pipe_name>;
|
|
|
119
200
|
-- 列出当前 schema 下所有 Pipe
|
|
120
201
|
SHOW PIPES;
|
|
121
202
|
|
|
122
|
-
--
|
|
123
|
-
SHOW PIPES LIKE 'kafka%';
|
|
124
|
-
|
|
125
|
-
-- 查看 Pipe 详情
|
|
203
|
+
-- 查看 Pipe 详情(状态、延迟、定义)
|
|
126
204
|
DESC PIPE <pipe_name>;
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
## 验证 Kafka 连接(创建 Pipe 前)
|
|
130
|
-
|
|
131
|
-
```sql
|
|
132
|
-
-- 先用 READ_KAFKA 函数验证连接和数据格式
|
|
133
|
-
SELECT *
|
|
134
|
-
FROM TABLE(
|
|
135
|
-
READ_KAFKA(
|
|
136
|
-
KAFKA_BROKER => 'kafka.example.com:9092',
|
|
137
|
-
KAFKA_TOPIC => 'orders',
|
|
138
|
-
KAFKA_GROUP_ID => 'test_group',
|
|
139
|
-
KAFKA_OFFSET => 'earliest',
|
|
140
|
-
KAFKA_DATA_FORMAT => 'json'
|
|
141
|
-
)
|
|
142
|
-
)
|
|
143
|
-
LIMIT 10;
|
|
205
|
+
DESC PIPE EXTENDED <pipe_name>;
|
|
144
206
|
```
|
|
145
207
|
|
|
146
208
|
## 注意事项
|
|
147
209
|
|
|
148
210
|
- Pipe 创建后默认自动启动,无需手动 RESUME
|
|
149
|
-
- Kafka Pipe 使用 consumer group 管理 offset,重建 Pipe
|
|
150
|
-
- 对象存储 Pipe
|
|
151
|
-
- Pipe 不支持修改 AS
|
|
211
|
+
- Kafka Pipe 使用 consumer group 管理 offset,重建 Pipe 时保持相同 group_id 可从上次位点继续
|
|
212
|
+
- 对象存储 Pipe 通过文件列表或事件通知检测新文件,`load_history` 去重记录保留 7 天
|
|
213
|
+
- Pipe 不支持修改 AS 子句,需要删除后重建(不是 `CREATE OR REPLACE`)
|
|
214
|
+
- Kafka Pipe 仅支持 PLAINTEXT 和 SASL_PLAINTEXT 安全协议,不支持 SSL
|
|
152
215
|
|
|
153
216
|
## 参考文档
|
|
154
217
|
|
|
155
|
-
- [PIPE 导入语法](https://www.yunqi.tech/documents/pipe-syntax)
|
|
156
218
|
- [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
|
|
157
219
|
- [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
|
|
158
220
|
- [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
|
|
@@ -51,10 +51,10 @@ CREATE TABLE STREAM orders_stream_from_ts
|
|
|
51
51
|
|
|
52
52
|
## 消费 Table Stream
|
|
53
53
|
|
|
54
|
-
Table Stream
|
|
54
|
+
Table Stream 的 offset 通过 DML 操作移动。**仅 SELECT 不会移动 offset**,可以反复查询预览。执行 DML(INSERT INTO / MERGE INTO / UPDATE / DELETE)消费数据后,offset 前进。
|
|
55
55
|
|
|
56
56
|
```sql
|
|
57
|
-
--
|
|
57
|
+
-- 查看当前未消费的变更数据(不移动 offset)
|
|
58
58
|
SELECT * FROM orders_stream;
|
|
59
59
|
|
|
60
60
|
-- 变更数据包含的系统字段
|
|
@@ -62,21 +62,20 @@ SELECT * FROM orders_stream;
|
|
|
62
62
|
-- __commit_version: 变更版本号
|
|
63
63
|
-- __commit_timestamp: 变更发生时间
|
|
64
64
|
|
|
65
|
-
-- 典型用法:将变更数据 MERGE
|
|
65
|
+
-- 典型用法:将变更数据 MERGE 到目标表(过滤掉 UPDATE_BEFORE)
|
|
66
66
|
MERGE INTO dw.orders_dim AS target
|
|
67
67
|
USING (
|
|
68
68
|
SELECT * FROM orders_stream
|
|
69
|
-
WHERE __change_type
|
|
69
|
+
WHERE __change_type != 'UPDATE_BEFORE'
|
|
70
70
|
) AS src
|
|
71
71
|
ON target.order_id = src.order_id
|
|
72
72
|
WHEN MATCHED AND src.__change_type = 'UPDATE_AFTER' THEN UPDATE SET target.status = src.status, target.amount = src.amount
|
|
73
73
|
WHEN MATCHED AND src.__change_type = 'DELETE' THEN DELETE
|
|
74
|
-
WHEN NOT MATCHED AND src.__change_type
|
|
74
|
+
WHEN NOT MATCHED AND src.__change_type IN ('INSERT', 'UPDATE_AFTER') THEN INSERT (order_id, status, amount) VALUES (src.order_id, src.status, src.amount);
|
|
75
75
|
|
|
76
76
|
-- 配合 Dynamic Table 自动消费(推荐)
|
|
77
77
|
CREATE OR REPLACE DYNAMIC TABLE dw.orders_processed
|
|
78
|
-
REFRESH
|
|
79
|
-
VCLUSTER default
|
|
78
|
+
REFRESH INTERVAL 1 MINUTE vcluster default
|
|
80
79
|
AS
|
|
81
80
|
SELECT order_id, status, amount, __change_type, __commit_timestamp
|
|
82
81
|
FROM orders_stream
|
|
@@ -107,10 +106,13 @@ DESC TABLE STREAM <stream_name>;
|
|
|
107
106
|
|
|
108
107
|
## 注意事项
|
|
109
108
|
|
|
110
|
-
-
|
|
109
|
+
- 仅 SELECT 不会移动 offset,可反复查询预览
|
|
110
|
+
- DML 操作(INSERT INTO / MERGE INTO / UPDATE / DELETE)会移动 offset
|
|
111
|
+
- ⚠️ 即使 DML 带 WHERE 条件过滤了部分行,**所有行的 offset 都会移动**
|
|
111
112
|
- 若长时间不消费,超出源表的 `data_retention_days` 后数据会丢失
|
|
112
113
|
- `STANDARD` 模式下 UPDATE 会产生两条记录:`UPDATE_BEFORE`(更新前)和 `UPDATE_AFTER`(更新后)
|
|
113
|
-
- 消费时通常过滤 `__change_type
|
|
114
|
+
- 消费时通常过滤 `__change_type != 'UPDATE_BEFORE'`,忽略旧值
|
|
115
|
+
- 源表需先开启 `change_tracking`:`ALTER TABLE name SET PROPERTIES ('change_tracking' = 'true')`
|
|
114
116
|
|
|
115
117
|
## 参考文档
|
|
116
118
|
|
|
@@ -13,18 +13,22 @@ description: |
|
|
|
13
13
|
## 指令
|
|
14
14
|
|
|
15
15
|
### 步骤 1:开启源表变更跟踪(必需前置)
|
|
16
|
-
|
|
16
|
+
执行 SQL 开启源表的 change_tracking:
|
|
17
17
|
```sql
|
|
18
18
|
ALTER TABLE <source_table> SET PROPERTIES ('change_tracking' = 'true');
|
|
19
19
|
```
|
|
20
20
|
- 这是强制性前置步骤,不执行则 Stream 无法正确捕获变更
|
|
21
|
-
-
|
|
21
|
+
- 验证属性是否生效(两种方法):
|
|
22
22
|
```sql
|
|
23
|
-
|
|
23
|
+
-- 方法 1:DESC EXTENDED 查看 properties
|
|
24
|
+
DESC EXTENDED <source_table>;
|
|
25
|
+
|
|
26
|
+
-- 方法 2:查询 information_schema
|
|
27
|
+
SELECT table_name, properties FROM information_schema.tables WHERE table_name = '<source_table>';
|
|
24
28
|
```
|
|
25
29
|
|
|
26
30
|
### 步骤 2:创建 Table Stream
|
|
27
|
-
|
|
31
|
+
执行 SQL 创建 Stream:
|
|
28
32
|
```sql
|
|
29
33
|
CREATE [ OR REPLACE ] TABLE STREAM <stream_name>
|
|
30
34
|
ON TABLE <source_table>
|
|
@@ -36,37 +40,42 @@ CREATE [ OR REPLACE ] TABLE STREAM <stream_name>
|
|
|
36
40
|
);
|
|
37
41
|
```
|
|
38
42
|
关键参数选择:
|
|
39
|
-
- **STANDARD 模式**:捕获 INSERT/UPDATE/DELETE
|
|
43
|
+
- **STANDARD 模式**:捕获 INSERT/UPDATE/DELETE,反映表当前状态(delta 变化) → 适用于数据同步、增量 ETL
|
|
44
|
+
- delta 变化指两个事务时间点之间的净变化。例如:先 INSERT 再 DELETE 同一行 → delta 为空;先 INSERT 再 UPDATE → delta 为一条新行(最终状态)
|
|
40
45
|
- **APPEND_ONLY 模式**:仅捕获 INSERT,保留所有历史插入记录 → 适用于审计、历史记录保留
|
|
46
|
+
- 即使后续 DELETE 了某行,APPEND_ONLY 模式仍保留该行的 INSERT 记录
|
|
41
47
|
- **SHOW_INITIAL_ROWS = TRUE**:首次消费返回建 Stream 时表中已有行
|
|
42
48
|
- **SHOW_INITIAL_ROWS = FALSE**(默认):首次消费仅返回建 Stream 后的新变更
|
|
43
49
|
- 可选:指定起始时间点
|
|
44
50
|
```sql
|
|
45
|
-
--
|
|
46
|
-
--
|
|
51
|
+
-- TIMESTAMP AS OF 用于指定 Stream 的起始读取位点
|
|
52
|
+
-- 注意:此功能在某些场景下可能不稳定,建议优先使用默认行为(从创建时刻开始)
|
|
47
53
|
CREATE TABLE STREAM <stream_name>
|
|
48
54
|
ON TABLE <source_table>
|
|
49
|
-
TIMESTAMP AS OF
|
|
55
|
+
TIMESTAMP AS OF '<timestamp>'
|
|
50
56
|
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'STANDARD');
|
|
51
57
|
```
|
|
52
58
|
|
|
53
59
|
### 步骤 3:准备目标表
|
|
54
|
-
|
|
60
|
+
创建与源表结构兼容的目标表:
|
|
55
61
|
- 目标表列定义需包含源表的业务列
|
|
56
62
|
- 建议额外添加元数据列(如 sync_version、sync_timestamp)用于追踪
|
|
57
63
|
|
|
58
64
|
### 步骤 4:查询 Stream 数据(预览,不移动 offset)
|
|
59
|
-
|
|
65
|
+
执行 SELECT 预览 Stream 中的变更数据:
|
|
60
66
|
```sql
|
|
61
67
|
SELECT *, __change_type, __commit_version, __commit_timestamp
|
|
62
68
|
FROM <stream_name>;
|
|
63
69
|
```
|
|
64
70
|
- 仅 SELECT 不会移动 offset
|
|
65
71
|
- 元数据字段:`__change_type`(值:`INSERT` / `UPDATE_BEFORE` / `UPDATE_AFTER` / `DELETE`)、`__commit_version`、`__commit_timestamp`
|
|
66
|
-
- UPDATE
|
|
72
|
+
- **UPDATE 处理要点**:UPDATE 操作产生两条记录:
|
|
73
|
+
- `UPDATE_BEFORE`:更新前的旧值(通常在消费时忽略)
|
|
74
|
+
- `UPDATE_AFTER`:更新后的新值(用于写入目标表)
|
|
75
|
+
- 消费时务必过滤 `__change_type`,避免将 `UPDATE_BEFORE` 旧值误写入目标表
|
|
67
76
|
|
|
68
77
|
### 步骤 5:消费 Stream 数据(移动 offset)
|
|
69
|
-
|
|
78
|
+
执行 DML 操作消费数据:
|
|
70
79
|
|
|
71
80
|
#### 方式 A:全量消费(INSERT INTO)
|
|
72
81
|
```sql
|
|
@@ -77,24 +86,39 @@ SELECT <columns> FROM <stream_name>;
|
|
|
77
86
|
#### 方式 B:幂等消费(MERGE,推荐)
|
|
78
87
|
```sql
|
|
79
88
|
MERGE INTO <target_table> t
|
|
80
|
-
USING <stream_name> s
|
|
89
|
+
USING (SELECT * FROM <stream_name> WHERE __change_type != 'UPDATE_BEFORE') s
|
|
81
90
|
ON t.<pk_column> = s.<pk_column>
|
|
82
|
-
WHEN MATCHED AND s.__change_type
|
|
91
|
+
WHEN MATCHED AND s.__change_type IN ('INSERT', 'UPDATE_AFTER') THEN UPDATE SET t.col1 = s.col1, t.col2 = s.col2
|
|
83
92
|
WHEN MATCHED AND s.__change_type = 'DELETE' THEN DELETE
|
|
84
93
|
WHEN NOT MATCHED AND s.__change_type = 'INSERT' THEN INSERT (<columns>) VALUES (s.<columns>);
|
|
85
94
|
```
|
|
86
95
|
- DML 操作(INSERT/UPDATE/DELETE/MERGE)会移动 offset
|
|
87
|
-
- 即使使用 WHERE
|
|
96
|
+
- ⚠️ 即使使用 WHERE 条件过滤,**所有数据的 offset 仍会移动**(不仅是匹配的行)
|
|
88
97
|
- 推荐使用 MERGE 实现幂等性,避免重复消费导致数据重复
|
|
98
|
+
- 在 USING 子查询中过滤掉 `UPDATE_BEFORE`,避免旧值干扰 MERGE 逻辑
|
|
99
|
+
- ⚠️ **MERGE 语法顺序要求**:多个 `WHEN MATCHED` 子句时,**UPDATE 必须在 DELETE 之前**,否则报错(错误信息:`update statement must be before delete statement`)
|
|
89
100
|
|
|
90
101
|
### 步骤 6:验证消费状态
|
|
91
|
-
|
|
102
|
+
执行查询确认消费完成:
|
|
92
103
|
```sql
|
|
93
104
|
SELECT COUNT(*) FROM <stream_name>;
|
|
94
105
|
```
|
|
95
106
|
- 消费成功后 COUNT 应为 0 或仅包含新变更
|
|
96
107
|
- 记录最后消费的 `__commit_version` 用于故障恢复
|
|
97
108
|
|
|
109
|
+
## Offset 移动规则
|
|
110
|
+
|
|
111
|
+
| 操作 | 是否移动 offset | 说明 |
|
|
112
|
+
|------|----------------|------|
|
|
113
|
+
| `SELECT * FROM stream` | ❌ 不移动 | 仅预览,可反复查询 |
|
|
114
|
+
| `INSERT INTO target SELECT ... FROM stream` | ✅ 移动 | 消费数据 |
|
|
115
|
+
| `MERGE INTO target USING stream ...` | ✅ 移动 | 消费数据(推荐) |
|
|
116
|
+
| `UPDATE target SET ... FROM stream` | ✅ 移动 | 消费数据 |
|
|
117
|
+
| `DELETE FROM target USING stream` | ✅ 移动 | 消费数据 |
|
|
118
|
+
| 带 WHERE 的 DML | ✅ 全部移动 | 即使 WHERE 过滤了部分行,所有行的 offset 都会移动 |
|
|
119
|
+
|
|
120
|
+
> ⚠️ **关键注意**:offset 移动是全量的。一旦执行 DML 消费 Stream,所有变更记录的 offset 都会前进,无法部分消费。如果 DML 执行失败(如目标表不存在),offset 不会移动。
|
|
121
|
+
|
|
98
122
|
## 模式选择速查
|
|
99
123
|
|
|
100
124
|
| 需求 | 推荐模式 |
|
|
@@ -115,21 +139,48 @@ SELECT COUNT(*) FROM <stream_name>;
|
|
|
115
139
|
## 示例
|
|
116
140
|
|
|
117
141
|
### 示例 1:订单表实时同步
|
|
118
|
-
```
|
|
119
|
-
1.
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
142
|
+
```sql
|
|
143
|
+
-- 1. 开启源表变更跟踪
|
|
144
|
+
ALTER TABLE orders SET PROPERTIES ('change_tracking' = 'true');
|
|
145
|
+
|
|
146
|
+
-- 2. 创建 Table Stream
|
|
147
|
+
CREATE TABLE STREAM orders_stream ON TABLE orders
|
|
148
|
+
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'STANDARD', 'SHOW_INITIAL_ROWS' = 'FALSE');
|
|
149
|
+
|
|
150
|
+
-- 3. 创建目标表(与源表结构兼容)
|
|
151
|
+
CREATE TABLE orders_sync (order_id INT, status STRING, amount DOUBLE);
|
|
152
|
+
|
|
153
|
+
-- 4. 预览 Stream 数据(不移动 offset)
|
|
154
|
+
SELECT *, __commit_version, __commit_timestamp FROM orders_stream;
|
|
155
|
+
|
|
156
|
+
-- 5. 消费 Stream 数据(移动 offset)
|
|
157
|
+
MERGE INTO orders_sync t
|
|
158
|
+
USING (SELECT * FROM orders_stream WHERE __change_type != 'UPDATE_BEFORE') s
|
|
159
|
+
ON t.order_id = s.order_id
|
|
160
|
+
WHEN MATCHED AND s.__change_type IN ('INSERT', 'UPDATE_AFTER') THEN UPDATE SET t.status = s.status, t.amount = s.amount
|
|
161
|
+
WHEN MATCHED AND s.__change_type = 'DELETE' THEN DELETE
|
|
162
|
+
WHEN NOT MATCHED AND s.__change_type = 'INSERT' THEN INSERT (order_id, status, amount) VALUES (s.order_id, s.status, s.amount);
|
|
163
|
+
|
|
164
|
+
-- 6. 验证消费完成
|
|
165
|
+
SELECT COUNT(*) FROM orders_stream;
|
|
125
166
|
```
|
|
126
167
|
|
|
127
168
|
### 示例 2:用户行为审计(保留全部插入历史)
|
|
128
|
-
```
|
|
129
|
-
1.
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
169
|
+
```sql
|
|
170
|
+
-- 1. 开启源表变更跟踪
|
|
171
|
+
ALTER TABLE user_actions SET PROPERTIES ('change_tracking' = 'true');
|
|
172
|
+
|
|
173
|
+
-- 2. 创建 Table Stream(APPEND_ONLY 模式)
|
|
174
|
+
CREATE TABLE STREAM user_actions_audit_stream ON TABLE user_actions
|
|
175
|
+
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY', 'SHOW_INITIAL_ROWS' = 'TRUE');
|
|
176
|
+
|
|
177
|
+
-- 3. 预览 Stream 数据
|
|
178
|
+
SELECT *, __commit_version, __commit_timestamp FROM user_actions_audit_stream;
|
|
179
|
+
|
|
180
|
+
-- 4. 消费 Stream 数据(INSERT INTO 移动 offset)
|
|
181
|
+
INSERT INTO user_actions_audit
|
|
182
|
+
SELECT *, __commit_version AS audit_version, __commit_timestamp AS audit_time
|
|
183
|
+
FROM user_actions_audit_stream;
|
|
133
184
|
```
|
|
134
185
|
|
|
135
186
|
## 故障排除
|
|
@@ -33,14 +33,17 @@ description: |
|
|
|
33
33
|
|
|
34
34
|
> ⚠️ **跨云限制**:Storage Connection 必须与 Lakehouse 实例在同一云厂商。阿里云实例不能创建 COS/S3 Connection,腾讯云实例不能创建 OSS Connection。
|
|
35
35
|
|
|
36
|
-
> ⚠️ **阿里云 OSS
|
|
36
|
+
> ⚠️ **阿里云 OSS 参数名**:
|
|
37
|
+
> - 小写形式:`access_id` / `access_key`(推荐)
|
|
38
|
+
> - 大写形式:`ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`(也可以)
|
|
39
|
+
> - ⚠️ `ACCESS_KEY` / `SECRET_KEY` 会报错(不是受支持的参数名;大写形式须写作 `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`)
|
|
37
40
|
|
|
38
41
|
```sql
|
|
39
42
|
-- 阿里云 OSS
|
|
40
43
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_conn
|
|
41
44
|
TYPE OSS
|
|
42
|
-
|
|
43
|
-
|
|
45
|
+
access_id = 'LTAIxxxxxxxxxxxx'
|
|
46
|
+
access_key = 'T8Gexxxxxxmtxxxxxx'
|
|
44
47
|
ENDPOINT = 'oss-cn-hangzhou-internal.aliyuncs.com';
|
|
45
48
|
|
|
46
49
|
-- 腾讯云 COS
|
|
@@ -101,15 +104,22 @@ DESC VOLUME my_oss_volume;
|
|
|
101
104
|
-- 查看目录下的文件
|
|
102
105
|
SHOW VOLUME DIRECTORY my_oss_volume;
|
|
103
106
|
|
|
104
|
-
--
|
|
107
|
+
-- 刷新目录元数据后查询(上传新文件后可能需要手动刷新)
|
|
105
108
|
ALTER VOLUME my_oss_volume REFRESH;
|
|
106
109
|
SELECT * FROM DIRECTORY(VOLUME my_oss_volume);
|
|
107
110
|
```
|
|
108
111
|
|
|
112
|
+
> ⚠️ **目录刷新注意**:上传文件到对象存储后,`SHOW VOLUME DIRECTORY` 可能不会立即显示新文件。
|
|
113
|
+
> 如果启用了 `AUTO_REFRESH = TRUE`,系统会定期自动刷新;否则需要手动执行 `ALTER VOLUME name REFRESH`。
|
|
114
|
+
|
|
109
115
|
---
|
|
110
116
|
|
|
111
117
|
## 直接查询 Volume 中的文件
|
|
112
118
|
|
|
119
|
+
> ⚠️ **语法限制**:ClickZetta 不支持 `@volume_name` 简写(Snowflake Stage 语法),必须使用 `FROM VOLUME name USING format` 完整语法。
|
|
120
|
+
> ⚠️ **多格式文件处理**:如果 Volume 中包含多种格式的文件(如 .csv 和 .json 混合),不指定 `FILES()` 或 `SUBDIRECTORY` 时会尝试读取所有文件,可能因格式不匹配而报错。建议使用 `FILES('xxx.csv')` 指定文件或 `SUBDIRECTORY 'csv_data/'` 指定子目录。
|
|
121
|
+
> ⚠️ **JSON 嵌套字段访问**:使用 `data['key']` 语法(不是 Snowflake 的 `data:key` 语法)。
|
|
122
|
+
|
|
113
123
|
```sql
|
|
114
124
|
-- 查询 CSV 文件(自动推断 schema)
|
|
115
125
|
SELECT * FROM VOLUME my_oss_volume
|
|
@@ -123,6 +133,19 @@ SELECT * FROM VOLUME my_oss_volume
|
|
|
123
133
|
USING PARQUET
|
|
124
134
|
REGEXP '.*2024-0[1-6].parquet';
|
|
125
135
|
|
|
136
|
+
-- 查询指定文件(推荐,避免多格式冲突)
|
|
137
|
+
SELECT * FROM VOLUME my_oss_volume
|
|
138
|
+
USING JSON
|
|
139
|
+
FILES('user_events.json');
|
|
140
|
+
|
|
141
|
+
-- 查询 JSON 嵌套字段
|
|
142
|
+
SELECT
|
|
143
|
+
data['event_id'] AS event_id,
|
|
144
|
+
data['properties']['device'] AS device
|
|
145
|
+
FROM VOLUME my_oss_volume
|
|
146
|
+
USING JSON
|
|
147
|
+
FILES('events.json');
|
|
148
|
+
|
|
126
149
|
-- 查询 User Volume 文件
|
|
127
150
|
SELECT * FROM USER VOLUME
|
|
128
151
|
USING CSV
|
|
@@ -244,6 +267,26 @@ DROP VOLUME IF EXISTS my_oss_volume;
|
|
|
244
267
|
| 问题 | 原因 | 解决方案 |
|
|
245
268
|
|---|---|---|
|
|
246
269
|
| SHOW VOLUME DIRECTORY 无文件 | 目录未刷新 | 执行 `ALTER VOLUME name REFRESH` |
|
|
247
|
-
| SELECT FROM VOLUME 报错 | 格式不匹配 | 确认 USING
|
|
270
|
+
| SELECT FROM VOLUME 报错 | 格式不匹配 | 确认 USING 后的格式与实际文件格式一致;使用 `FILES()` 指定文件 |
|
|
271
|
+
| COPY INTO 读取多格式文件失败 | Volume 中有混合格式文件 | 使用 `FILES('xxx.csv')` 指定文件或 `SUBDIRECTORY` 指定子目录 |
|
|
248
272
|
| PUT 命令失败 | 本地路径不存在 | 确认本地文件路径正确 |
|
|
249
273
|
| COPY INTO 报错 | 权限不足 | 检查 STORAGE CONNECTION 的访问密钥权限 |
|
|
274
|
+
| `@volume` 语法报错 | ClickZetta 不支持 | 使用 `FROM VOLUME name USING format` 完整语法 |
|
|
275
|
+
| `data:key` 语法报错 | Snowflake JSON 语法不适用 | 使用 `data['key']` 语法访问 JSON 嵌套字段 |
|
|
276
|
+
| `METADATA$FILENAME` 报错 | ClickZetta 不支持此元数据字段 | 使用字符串字面量或在 INSERT 时手动添加文件路径列 |
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## Snowflake 迁移对照
|
|
281
|
+
|
|
282
|
+
| Snowflake 语法 | ClickZetta 等价语法 | 说明 |
|
|
283
|
+
|---|---|---|
|
|
284
|
+
| `@my_stage` | `VOLUME my_volume` | Stage → Volume |
|
|
285
|
+
| `SELECT * FROM @stage/path` | `SELECT * FROM VOLUME vol USING CSV SUBDIRECTORY 'path/'` | 必须指定 USING 格式 |
|
|
286
|
+
| `data:key::STRING` | `data['key']::STRING` | JSON 字段访问(类型转换同样使用 `::TYPE`) |
|
|
287
|
+
| `data:nested.key` | `data['nested']['key']` | 嵌套 JSON 访问 |
|
|
288
|
+
| `METADATA$FILENAME` | 不支持 | 需手动添加文件路径列 |
|
|
289
|
+
| `METADATA$FILE_ROW_NUMBER` | 不支持 | 无等价功能 |
|
|
290
|
+
| `FILE_FORMAT = (TYPE = CSV)` | `USING CSV OPTIONS(...)` | 导入时用 USING,导出时用 FILE_FORMAT |
|
|
291
|
+
| `COPY INTO table FROM @stage` | `COPY INTO table FROM VOLUME vol USING format` | 导入语法 |
|
|
292
|
+
| `COPY INTO @stage FROM table` | `COPY INTO VOLUME vol SUBDIRECTORY '/' FROM TABLE t FILE_FORMAT=(...)` | 导出语法 |
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
## CREATE EXTERNAL VOLUME
|
|
15
15
|
|
|
16
16
|
```sql
|
|
17
|
-
-- OSS
|
|
17
|
+
-- OSS(Connection 推荐使用小写 access_id/access_key)
|
|
18
18
|
CREATE EXTERNAL VOLUME my_oss_volume
|
|
19
19
|
LOCATION 'oss://<bucket>/<path>'
|
|
20
20
|
USING CONNECTION my_oss_conn
|
|
@@ -42,6 +42,8 @@ CREATE EXTERNAL VOLUME my_s3_volume
|
|
|
42
42
|
- `DIRECTORY`:目录功能配置,`ENABLE=TRUE` 开启目录索引,`AUTO_REFRESH=TRUE` 自动刷新
|
|
43
43
|
- `RECURSIVE`:是否递归扫描子目录
|
|
44
44
|
|
|
45
|
+
> ⚠️ 上传新文件后如果 `SHOW VOLUME DIRECTORY` 未显示,执行 `ALTER VOLUME name REFRESH` 手动刷新。
|
|
46
|
+
|
|
45
47
|
---
|
|
46
48
|
|
|
47
49
|
## ALTER VOLUME
|
|
@@ -191,4 +193,7 @@ FROM TABLE my_table
|
|
|
191
193
|
FILE_FORMAT = (TYPE = CSV);
|
|
192
194
|
```
|
|
193
195
|
|
|
194
|
-
> ⚠️
|
|
196
|
+
> ⚠️ **关键区分**:
|
|
197
|
+
> - **导入**(COPY INTO TABLE / SELECT FROM VOLUME):用 `USING CSV/PARQUET/JSON` + `OPTIONS(...)`
|
|
198
|
+
> - **导出**(COPY INTO VOLUME):用 `FILE_FORMAT = (TYPE = CSV/PARQUET/JSON)`
|
|
199
|
+
> - 两者语法不可混用!
|