@clickzetta/cz-cli-linux-x64 0.3.1 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +9 -2
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +1 -1
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +3 -3
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +27 -1
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +82 -43
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +262 -154
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +192 -54
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +1 -1
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +70 -45
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +79 -53
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +40 -28
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +133 -71
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +11 -9
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +79 -28
- package/bin/skills/clickzetta-volume-manager/SKILL.md +48 -5
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +7 -2
- package/package.json +1 -1
|
@@ -2,32 +2,43 @@
|
|
|
2
2
|
|
|
3
3
|
> 来源:https://www.yunqi.tech/documents/pipe-kafka 和 https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1
|
|
4
4
|
|
|
5
|
+
> **⚠️ ClickZetta READ_KAFKA 使用位置参数(positional parameters)**
|
|
6
|
+
> - ❌ 不支持 `=>` 命名参数语法(如 `KAFKA_BROKER => 'host:port'`)
|
|
7
|
+
> - ❌ 不支持 `TABLE(READ_KAFKA(...))` 包装
|
|
8
|
+
> - ✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))`
|
|
9
|
+
|
|
5
10
|
## CREATE PIPE(READ_KAFKA 方式)
|
|
6
11
|
|
|
7
12
|
```sql
|
|
8
13
|
CREATE [ OR REPLACE ] PIPE <pipe_name>
|
|
9
|
-
VIRTUAL_CLUSTER = <vcluster_name>
|
|
10
|
-
[ BATCH_INTERVAL_IN_SECONDS = <seconds> ]
|
|
11
|
-
[ BATCH_SIZE_PER_KAFKA_PARTITION = <count> ]
|
|
12
|
-
[ MAX_SKIP_BATCH_COUNT_ON_ERROR = <count> ]
|
|
13
|
-
[ INITIAL_DELAY_IN_SECONDS = <seconds> ]
|
|
14
|
+
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
15
|
+
[ BATCH_INTERVAL_IN_SECONDS = '<seconds>' ]
|
|
16
|
+
[ BATCH_SIZE_PER_KAFKA_PARTITION = '<count>' ]
|
|
17
|
+
[ MAX_SKIP_BATCH_COUNT_ON_ERROR = '<count>' ]
|
|
18
|
+
[ INITIAL_DELAY_IN_SECONDS = '<seconds>' ]
|
|
14
19
|
[ RESET_KAFKA_GROUP_OFFSETS = '<offset_value>' ]
|
|
20
|
+
[ COPY_JOB_HINT = '<json>' ]
|
|
15
21
|
AS
|
|
16
|
-
|
|
17
|
-
SELECT <expr> [, ...]
|
|
18
|
-
FROM
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
22
|
+
COPY INTO <target_table> FROM (
|
|
23
|
+
SELECT <expr> [, ...]
|
|
24
|
+
FROM read_kafka(
|
|
25
|
+
'<bootstrap_servers>', -- 位置 1:Kafka 集群地址(必填)
|
|
26
|
+
'<topic_name>', -- 位置 2:Topic 名称(必填)
|
|
27
|
+
'', -- 位置 3:Topic pattern(保留,填空字符串)
|
|
28
|
+
'<group_id>', -- 位置 4:消费者组 ID(必填)
|
|
29
|
+
'', -- 位置 5:starting_offsets(Pipe 中留空)
|
|
30
|
+
'', -- 位置 6:ending_offsets(Pipe 中留空)
|
|
31
|
+
'', -- 位置 7:starting_timestamp(Pipe 中留空)
|
|
32
|
+
'', -- 位置 8:ending_timestamp(Pipe 中留空)
|
|
33
|
+
'raw', -- 位置 9:key 格式(目前只支持 raw)
|
|
34
|
+
'raw', -- 位置 10:value 格式(目前只支持 raw)
|
|
35
|
+
0, -- 位置 11:max_errors
|
|
36
|
+
MAP(<kafka_config>) -- 位置 12:Kafka 配置参数
|
|
26
37
|
)
|
|
27
38
|
);
|
|
28
39
|
```
|
|
29
40
|
|
|
30
|
-
### 参数说明
|
|
41
|
+
### Pipe 参数说明
|
|
31
42
|
|
|
32
43
|
| 参数 | 必填 | 默认值 | 说明 |
|
|
33
44
|
|------|------|--------|------|
|
|
@@ -37,6 +48,7 @@ FROM TABLE(
|
|
|
37
48
|
| `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 否 | 30 | 出错时跳过批次的最大重试次数 |
|
|
38
49
|
| `INITIAL_DELAY_IN_SECONDS` | 否 | 0 | 首个作业调度延迟 |
|
|
39
50
|
| `RESET_KAFKA_GROUP_OFFSETS` | 否 | — | 启动时消费位点(仅创建时生效) |
|
|
51
|
+
| `COPY_JOB_HINT` | 否 | — | JSON 格式的作业参数 |
|
|
40
52
|
|
|
41
53
|
### RESET_KAFKA_GROUP_OFFSETS 可选值
|
|
42
54
|
|
|
@@ -50,20 +62,138 @@ FROM TABLE(
|
|
|
50
62
|
|
|
51
63
|
### READ_KAFKA 参数(在 Pipe 中 vs 独立使用)
|
|
52
64
|
|
|
53
|
-
| 特性 | 独立使用
|
|
65
|
+
| 特性 | 独立使用 read_kafka | 在 Pipe 中使用 |
|
|
54
66
|
|------|-------------------|---------------|
|
|
55
67
|
| 消费者组 | 临时,执行完即销毁 | 持久,保持消费位置 |
|
|
56
|
-
| 位置管理 |
|
|
68
|
+
| 位置管理 | 在 MAP 中设置 `kafka.auto.offset.reset` | Pipe 自动管理,read_kafka 的第 5–8 个参数(起止 offset / timestamp)**必须留空** |
|
|
57
69
|
| 执行方式 | 一次性查询 | 持续调度执行 |
|
|
58
|
-
| 默认起始位置 | earliest
|
|
70
|
+
| 默认起始位置 | latest(可在 MAP 中改为 earliest) | latest(由 RESET_KAFKA_GROUP_OFFSETS 控制) |
|
|
71
|
+
|
|
72
|
+
### MAP 配置参数
|
|
73
|
+
|
|
74
|
+
| 参数 | 说明 |
|
|
75
|
+
|------|------|
|
|
76
|
+
| `kafka.security.protocol` | 安全协议:`PLAINTEXT` 或 `SASL_PLAINTEXT` |
|
|
77
|
+
| `kafka.sasl.mechanism` | SASL 机制:`PLAIN` |
|
|
78
|
+
| `kafka.sasl.username` | SASL 用户名 |
|
|
79
|
+
| `kafka.sasl.password` | SASL 密码 |
|
|
80
|
+
| `kafka.auto.offset.reset` | 独立探查时的起始位点(`earliest` / `latest`) |
|
|
81
|
+
| `cz.kafka.fetch.retry.enable` | 启用 fetch 重试(`true`/`false`) |
|
|
82
|
+
| `cz.kafka.fetch.retry.times` | 重试次数 |
|
|
83
|
+
| `cz.kafka.fetch.retry.intervalMs` | 重试间隔(毫秒) |
|
|
59
84
|
|
|
60
85
|
### JSON 字段提取语法
|
|
61
86
|
|
|
62
87
|
```sql
|
|
63
|
-
--
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
88
|
+
-- key 和 value 都是 binary 类型,需要先转换
|
|
89
|
+
value::string -- 转为字符串
|
|
90
|
+
parse_json(value::string) -- 解析为 JSON 对象
|
|
91
|
+
parse_json(value::string)['field']::TYPE -- 提取顶层字段
|
|
92
|
+
parse_json(value::string)['nested']['key']::TYPE -- 提取嵌套字段
|
|
93
|
+
|
|
94
|
+
-- 推荐模式:在子查询中先 parse_json,外层直接用 j['field']
|
|
95
|
+
SELECT j['order_id']::STRING, j['amount']::DECIMAL(10,2)
|
|
96
|
+
FROM (
|
|
97
|
+
SELECT parse_json(value::string) AS j
|
|
98
|
+
FROM read_kafka(...)
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 完整示例
|
|
103
|
+
|
|
104
|
+
```sql
|
|
105
|
+
-- 无认证 Kafka Pipe
|
|
106
|
+
CREATE PIPE kafka_orders_pipe
|
|
107
|
+
VIRTUAL_CLUSTER = 'default'
|
|
108
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
109
|
+
AS
|
|
110
|
+
COPY INTO ods.orders FROM (
|
|
111
|
+
SELECT
|
|
112
|
+
j['order_id']::STRING AS order_id,
|
|
113
|
+
j['user_id']::STRING AS user_id,
|
|
114
|
+
j['amount']::DECIMAL(10,2) AS amount,
|
|
115
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
116
|
+
FROM (
|
|
117
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
118
|
+
FROM read_kafka(
|
|
119
|
+
'kafka.example.com:9092',
|
|
120
|
+
'orders',
|
|
121
|
+
'',
|
|
122
|
+
'lakehouse_orders',
|
|
123
|
+
'', '', '', '',
|
|
124
|
+
'raw', 'raw', 0,
|
|
125
|
+
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
126
|
+
)
|
|
127
|
+
)
|
|
128
|
+
);
|
|
129
|
+
|
|
130
|
+
-- SASL 认证 + 指定时间点消费
|
|
131
|
+
CREATE PIPE kafka_secure_pipe
|
|
132
|
+
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
133
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
134
|
+
RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
|
|
135
|
+
AS
|
|
136
|
+
COPY INTO ods.secure_events FROM (
|
|
137
|
+
SELECT
|
|
138
|
+
j['id']::STRING AS event_id,
|
|
139
|
+
j['payload']::STRING AS payload,
|
|
140
|
+
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
141
|
+
FROM (
|
|
142
|
+
SELECT `timestamp`, parse_json(value::string) AS j
|
|
143
|
+
FROM read_kafka(
|
|
144
|
+
'kafka.example.com:9092',
|
|
145
|
+
'secure_events',
|
|
146
|
+
'',
|
|
147
|
+
'cz_secure',
|
|
148
|
+
'', '', '', '',
|
|
149
|
+
'raw', 'raw', 0,
|
|
150
|
+
MAP(
|
|
151
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
152
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
153
|
+
'kafka.sasl.username', 'my_user',
|
|
154
|
+
'kafka.sasl.password', 'my_password'
|
|
155
|
+
)
|
|
156
|
+
)
|
|
157
|
+
)
|
|
158
|
+
);
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## 独立探查(验证连接和数据格式)
|
|
164
|
+
|
|
165
|
+
```sql
|
|
166
|
+
-- 无认证
|
|
167
|
+
SELECT value::string
|
|
168
|
+
FROM read_kafka(
|
|
169
|
+
'kafka.example.com:9092',
|
|
170
|
+
'orders',
|
|
171
|
+
'',
|
|
172
|
+
'test_explore',
|
|
173
|
+
'', '', '', '',
|
|
174
|
+
'raw', 'raw', 0,
|
|
175
|
+
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
176
|
+
)
|
|
177
|
+
LIMIT 10;
|
|
178
|
+
|
|
179
|
+
-- SASL 认证
|
|
180
|
+
SELECT value::string
|
|
181
|
+
FROM read_kafka(
|
|
182
|
+
'kafka.example.com:9092',
|
|
183
|
+
'orders',
|
|
184
|
+
'',
|
|
185
|
+
'test_explore',
|
|
186
|
+
'', '', '', '',
|
|
187
|
+
'raw', 'raw', 0,
|
|
188
|
+
MAP(
|
|
189
|
+
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
190
|
+
'kafka.sasl.mechanism', 'PLAIN',
|
|
191
|
+
'kafka.sasl.username', 'my_user',
|
|
192
|
+
'kafka.sasl.password', 'my_password',
|
|
193
|
+
'kafka.auto.offset.reset', 'earliest'
|
|
194
|
+
)
|
|
195
|
+
)
|
|
196
|
+
LIMIT 10;
|
|
67
197
|
```
|
|
68
198
|
|
|
69
199
|
---
|
|
@@ -76,34 +206,37 @@ PARSE_JSON($1:field::STRING) -- 将字符串字段解析为 JSON 对象
|
|
|
76
206
|
CREATE STORAGE CONNECTION IF NOT EXISTS <conn_name>
|
|
77
207
|
TYPE KAFKA
|
|
78
208
|
BOOTSTRAP_SERVERS = ['<host1>:<port1>', '<host2>:<port2>']
|
|
79
|
-
SECURITY_PROTOCOL = '
|
|
209
|
+
SECURITY_PROTOCOL = 'PLAINTEXT';
|
|
80
210
|
```
|
|
81
211
|
|
|
82
212
|
### 步骤 2:创建 Kafka 外部表
|
|
83
213
|
|
|
84
214
|
```sql
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
215
|
+
-- ⚠️ 必须显式指定列定义(不能省略)
|
|
216
|
+
-- ⚠️ offset 和 timestamp 是保留字,必须用反引号转义
|
|
217
|
+
CREATE EXTERNAL TABLE <ext_table_name> (
|
|
218
|
+
topic STRING,
|
|
219
|
+
partition INT,
|
|
220
|
+
`offset` BIGINT,
|
|
221
|
+
`timestamp` TIMESTAMP,
|
|
222
|
+
timestamp_type STRING,
|
|
223
|
+
headers STRING,
|
|
224
|
+
key BINARY,
|
|
225
|
+
value BINARY
|
|
226
|
+
)
|
|
227
|
+
USING KAFKA
|
|
228
|
+
OPTIONS (
|
|
229
|
+
'group_id' = '<consumer_group>',
|
|
230
|
+
'topics' = '<topic_name>',
|
|
231
|
+
'starting_offset' = '<earliest | latest>'
|
|
232
|
+
)
|
|
233
|
+
CONNECTION <conn_name>;
|
|
93
234
|
```
|
|
94
235
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
| topic | STRING | Kafka 主题名称 |
|
|
100
|
-
| partition | INT | 分区 ID |
|
|
101
|
-
| offset | BIGINT | 分区内偏移量 |
|
|
102
|
-
| timestamp | TIMESTAMP_LTZ | 消息时间戳 |
|
|
103
|
-
| timestamp_type | STRING | 时间戳类型 |
|
|
104
|
-
| headers | MAP<STRING, BINARY> | 消息头 |
|
|
105
|
-
| key | BINARY | 消息键 |
|
|
106
|
-
| value | BINARY | 消息体 |
|
|
236
|
+
> **注意**:
|
|
237
|
+
> - 列定义是**必须的**,省略会报错 `failed to detect columns`
|
|
238
|
+
> - `offset` 和 `timestamp` 是保留字,需要反引号转义
|
|
239
|
+
> - 删除外部表用 `DROP TABLE`(不是 `DROP EXTERNAL TABLE`)
|
|
107
240
|
|
|
108
241
|
### 步骤 3:创建 Table Stream
|
|
109
242
|
|
|
@@ -117,8 +250,8 @@ CREATE TABLE STREAM <stream_name>
|
|
|
117
250
|
|
|
118
251
|
```sql
|
|
119
252
|
CREATE PIPE <pipe_name>
|
|
120
|
-
VIRTUAL_CLUSTER = <vcluster_name>
|
|
121
|
-
|
|
253
|
+
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
254
|
+
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
122
255
|
AS
|
|
123
256
|
COPY INTO <target_table>
|
|
124
257
|
SELECT <expr> [, ...]
|
|
@@ -136,13 +269,20 @@ ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = true;
|
|
|
136
269
|
-- 恢复
|
|
137
270
|
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = false;
|
|
138
271
|
|
|
139
|
-
--
|
|
140
|
-
ALTER PIPE <pipe_name> SET BATCH_INTERVAL_IN_SECONDS = 120;
|
|
141
|
-
ALTER PIPE <pipe_name> SET BATCH_SIZE_PER_KAFKA_PARTITION = 1000000;
|
|
272
|
+
-- 修改 VCluster
|
|
142
273
|
ALTER PIPE <pipe_name> SET VIRTUAL_CLUSTER = 'new_vc';
|
|
274
|
+
|
|
275
|
+
-- 修改 COPY_JOB_HINT
|
|
143
276
|
ALTER PIPE <pipe_name> SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
144
277
|
```
|
|
145
278
|
|
|
279
|
+
> ⚠️ **ALTER PIPE 支持的属性**:
|
|
280
|
+
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
281
|
+
> - ✅ `VIRTUAL_CLUSTER`
|
|
282
|
+
> - ✅ `COPY_JOB_HINT`
|
|
283
|
+
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持,需删除重建)
|
|
284
|
+
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持,需删除重建)
|
|
285
|
+
>
|
|
146
286
|
> 不支持修改 COPY/INSERT 语句逻辑,需删除 Pipe 后重建。
|
|
147
287
|
> 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
|
|
148
288
|
|
|
@@ -151,15 +291,14 @@ ALTER PIPE <pipe_name> SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size
|
|
|
151
291
|
## 监控
|
|
152
292
|
|
|
153
293
|
```sql
|
|
154
|
-
-- 查看 Pipe
|
|
155
|
-
DESC PIPE <pipe_name>;
|
|
294
|
+
-- 查看 Pipe 详情(含延迟信息 pipe_latency)
|
|
156
295
|
DESC PIPE EXTENDED <pipe_name>;
|
|
157
296
|
|
|
158
297
|
-- 查看所有 Pipe
|
|
159
298
|
SHOW PIPES;
|
|
160
299
|
|
|
161
300
|
-- 查看加载历史
|
|
162
|
-
SELECT * FROM
|
|
301
|
+
SELECT * FROM load_history('<schema>.<table>')
|
|
163
302
|
ORDER BY last_load_time DESC LIMIT 20;
|
|
164
303
|
|
|
165
304
|
-- 通过 query_tag 查看 Pipe 作业
|
|
@@ -181,6 +320,5 @@ DROP PIPE [ IF EXISTS ] <pipe_name>;
|
|
|
181
320
|
- [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
|
|
182
321
|
- [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
|
|
183
322
|
- [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
|
|
184
|
-
- [read_kafka 函数](https://www.yunqi.tech/documents/read_kafka)
|
|
185
323
|
- [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
|
|
186
|
-
- [
|
|
324
|
+
- [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: clickzetta-lakehouse-connect
|
|
3
3
|
description: |
|
|
4
|
-
Guide for connecting to ClickZetta Lakehouse via SDK/JDBC. Covers Python SDK (clickzetta.connect), ZettaPark Session (DataFrame API), SQLAlchemy (ORM/BI tools), and JDBC (Java). Use this skill when user needs to configure a connection from external tools or code
|
|
4
|
+
Guide for connecting to ClickZetta Lakehouse via SDK/JDBC. Covers Python SDK (clickzetta.connect), ZettaPark Session (DataFrame API), SQLAlchemy (ORM/BI tools), and JDBC (Java). Use this skill when user needs to configure a connection from external tools or code. Trigger for: "Python SDK 连接", "JDBC 连接", "SQLAlchemy 配置", "ZettaPark 怎么用", "连接报错", "clickzetta-connector-python", "clickzetta-sqlalchemy".
|
|
5
5
|
Keywords: connection, Python SDK, JDBC, SQLAlchemy, ZettaPark, driver, connect
|
|
6
6
|
---
|
|
7
7
|
|
|
@@ -64,12 +64,17 @@ description: |
|
|
|
64
64
|
-- 密钥方式(LIST_PURGE 模式支持)
|
|
65
65
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
|
|
66
66
|
TYPE OSS
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
COMMENT = 'OSS connection for data pipeline';
|
|
67
|
+
access_id = '<your_access_key_id>'
|
|
68
|
+
access_key = '<your_access_key_secret>'
|
|
69
|
+
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com';
|
|
71
70
|
```
|
|
72
71
|
|
|
72
|
+
> **参数说明**:
|
|
73
|
+
> - `access_id`:对应阿里云控制台的 **AccessKey ID**
|
|
74
|
+
> - `access_key`:对应阿里云控制台的 **AccessKey Secret**
|
|
75
|
+
> - 也可使用大写形式 `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`
|
|
76
|
+
> - ⚠️ 写成 `ACCESS_KEY` / `SECRET_KEY` 会报错(正确的大写参数名是 `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`)
|
|
77
|
+
>
|
|
73
78
|
> **提示**:如果使用 Role ARN 方式(EVENT_NOTIFICATION 模式必须),参见下方"模式 B"中的 Connection 创建语法。
|
|
74
79
|
|
|
75
80
|
#### 步骤 2:创建外部 Volume
|
|
@@ -77,14 +82,17 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
|
|
|
77
82
|
```sql
|
|
78
83
|
-- 使用 LH_execute_query 执行
|
|
79
84
|
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
85
|
+
LOCATION 'oss://my-bucket/data-path/'
|
|
86
|
+
USING CONNECTION my_oss_connection
|
|
87
|
+
DIRECTORY = (enable = true, auto_refresh = true)
|
|
88
|
+
RECURSIVE = true
|
|
89
|
+
COMMENT 'Volume for OSS PIPE ingestion';
|
|
83
90
|
```
|
|
84
91
|
|
|
85
92
|
> **关键参数**:
|
|
86
|
-
> -
|
|
87
|
-
> -
|
|
93
|
+
> - `RECURSIVE = true`:递归扫描子目录
|
|
94
|
+
> - `DIRECTORY = (enable = true, auto_refresh = true)`:自动刷新目录元数据
|
|
95
|
+
> - ⚠️ COMMENT 不带等号:`COMMENT 'text'`(不是 `COMMENT = 'text'`)
|
|
88
96
|
|
|
89
97
|
#### 步骤 3:验证 COPY INTO 可独立运行
|
|
90
98
|
|
|
@@ -94,14 +102,12 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
|
|
|
94
102
|
-- 使用 LH_execute_query 执行
|
|
95
103
|
COPY INTO my_schema.target_table
|
|
96
104
|
FROM VOLUME pipe_volume
|
|
97
|
-
USING CSV
|
|
98
|
-
OPTIONS (
|
|
99
|
-
'header' = 'true',
|
|
100
|
-
'delimiter' = ','
|
|
101
|
-
);
|
|
105
|
+
USING CSV OPTIONS ('header' = 'true', 'delimiter' = ',') PURGE=true;
|
|
102
106
|
```
|
|
103
107
|
|
|
104
|
-
> **重要**:
|
|
108
|
+
> **重要**:
|
|
109
|
+
> - PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数。确保此处验证时也不使用这些参数。
|
|
110
|
+
> - OPTIONS 放在 PURGE=true **之前**:`USING CSV OPTIONS (...) PURGE=true`(注意:`PURGE=true` 会在加载成功后删除源文件,验证时请使用可删除的测试文件)
|
|
105
111
|
|
|
106
112
|
#### 步骤 4:创建 PIPE(LIST_PURGE 模式)
|
|
107
113
|
|
|
@@ -110,22 +116,32 @@ OPTIONS (
|
|
|
110
116
|
CREATE PIPE IF NOT EXISTS my_oss_pipe
|
|
111
117
|
INGEST_MODE = 'LIST_PURGE'
|
|
112
118
|
VIRTUAL_CLUSTER = 'my_vc'
|
|
113
|
-
COMMENT
|
|
119
|
+
COMMENT 'OSS data pipeline - scan mode'
|
|
114
120
|
AS
|
|
115
121
|
COPY INTO my_schema.target_table
|
|
116
122
|
FROM VOLUME pipe_volume
|
|
117
|
-
USING CSV
|
|
118
|
-
OPTIONS (
|
|
119
|
-
'header' = 'true',
|
|
120
|
-
'delimiter' = ',',
|
|
121
|
-
'purge' = 'true'
|
|
122
|
-
);
|
|
123
|
+
USING CSV OPTIONS ('header' = 'true') PURGE=true;
|
|
123
124
|
```
|
|
124
125
|
|
|
125
|
-
>
|
|
126
|
-
> - `
|
|
127
|
-
> -
|
|
126
|
+
> **⚠️ 语法关键点**:
|
|
127
|
+
> - `PURGE=true` 放在最后:`USING <format> [OPTIONS (...)] PURGE=true`
|
|
128
|
+
> - OPTIONS 在 PURGE=true **之前**(如果需要的话)
|
|
129
|
+
> - 也可以不带 OPTIONS:`USING CSV PURGE=true`(推荐简洁写法)
|
|
130
|
+
> - COMMENT 不带等号:`COMMENT 'text'`
|
|
131
|
+
> - 大写 `PURGE`,小写 `true`,中间用 `=` 连接,无空格
|
|
132
|
+
> - **LIST_PURGE 模式必须设置** `PURGE=true`,加载成功后删除源文件(避免重复导入)
|
|
133
|
+
> - 注意:LIST_PURGE 模式依靠删除已加载的源文件来避免重复导入;如需保留源文件,请在加载前自行备份——省略 `PURGE=true` 会导致同一文件被重复导入
|
|
128
134
|
> - `VIRTUAL_CLUSTER`:指定执行 PIPE 任务的虚拟集群
|
|
135
|
+
>
|
|
136
|
+
> **错误写法**(会报语法错误):
|
|
137
|
+
> ```sql
|
|
138
|
+
> -- ❌ 不要把 purge 放在 OPTIONS 里
|
|
139
|
+
> OPTIONS ('header' = 'true', 'purge' = 'true')
|
|
140
|
+
> -- ❌ OPTIONS 不能在 PURGE 之后
|
|
141
|
+
> USING CSV PURGE=true OPTIONS ('header' = 'true')
|
|
142
|
+
> -- ❌ 不要用小写或加引号
|
|
143
|
+
> 'purge'='true'
|
|
144
|
+
> ```
|
|
129
145
|
|
|
130
146
|
#### 步骤 5:验证 PIPE 状态
|
|
131
147
|
|
|
@@ -157,8 +173,7 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
|
|
|
157
173
|
TYPE OSS
|
|
158
174
|
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com'
|
|
159
175
|
ROLE_ARN = 'acs:ram::1234567890:role/clickzetta-oss-role'
|
|
160
|
-
REGION = 'cn-hangzhou'
|
|
161
|
-
COMMENT = 'OSS connection via Role ARN for event notification mode';
|
|
176
|
+
REGION = 'cn-hangzhou';
|
|
162
177
|
```
|
|
163
178
|
|
|
164
179
|
#### 步骤 2:创建外部 Volume
|
|
@@ -166,8 +181,10 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
|
|
|
166
181
|
```sql
|
|
167
182
|
-- 使用 LH_execute_query 执行
|
|
168
183
|
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
|
|
169
|
-
|
|
170
|
-
|
|
184
|
+
LOCATION 'oss://my-bucket/data-path/'
|
|
185
|
+
USING CONNECTION my_oss_role_connection
|
|
186
|
+
DIRECTORY = (enable = true, auto_refresh = true)
|
|
187
|
+
RECURSIVE = true;
|
|
171
188
|
```
|
|
172
189
|
|
|
173
190
|
#### 步骤 3:创建 PIPE(EVENT_NOTIFICATION 模式)
|
|
@@ -178,21 +195,18 @@ CREATE PIPE IF NOT EXISTS my_oss_event_pipe
|
|
|
178
195
|
INGEST_MODE = 'EVENT_NOTIFICATION'
|
|
179
196
|
VIRTUAL_CLUSTER = 'my_vc'
|
|
180
197
|
ALICLOUD_MNS_QUEUE = 'my-mns-queue-name'
|
|
181
|
-
COMMENT
|
|
198
|
+
COMMENT 'OSS data pipeline - event notification mode'
|
|
182
199
|
AS
|
|
183
200
|
COPY INTO my_schema.target_table
|
|
184
201
|
FROM VOLUME pipe_event_volume
|
|
185
|
-
USING CSV
|
|
186
|
-
OPTIONS (
|
|
187
|
-
'header' = 'true',
|
|
188
|
-
'delimiter' = ','
|
|
189
|
-
);
|
|
202
|
+
USING CSV;
|
|
190
203
|
```
|
|
191
204
|
|
|
192
205
|
> **参数说明**:
|
|
193
206
|
> - `INGEST_MODE = 'EVENT_NOTIFICATION'`:通过消息通知触发加载
|
|
194
207
|
> - `ALICLOUD_MNS_QUEUE`:阿里云 MNS 队列名称(AWS 使用 `AWS_SQS_QUEUE`)
|
|
195
|
-
> - 此模式下不需要 `
|
|
208
|
+
> - 此模式下不需要 `PURGE=true`,因为是事件驱动而非扫描
|
|
209
|
+
> - COMMENT 不带等号:`COMMENT 'text'`
|
|
196
210
|
|
|
197
211
|
---
|
|
198
212
|
|
|
@@ -222,16 +236,17 @@ CREATE TABLE IF NOT EXISTS my_schema.target_table (
|
|
|
222
236
|
|
|
223
237
|
```sql
|
|
224
238
|
-- 使用 LH_execute_query 执行
|
|
225
|
-
-- 批量导入场景使用 access_id / access_key 语法
|
|
226
239
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
|
|
227
240
|
TYPE OSS
|
|
228
241
|
ENDPOINT = 'oss-cn-shanghai-internal.aliyuncs.com'
|
|
229
|
-
access_id = '<
|
|
230
|
-
access_key = '<
|
|
231
|
-
COMMENTS = 'OSS batch import connection';
|
|
242
|
+
access_id = '<your_access_key_id>'
|
|
243
|
+
access_key = '<your_access_key_secret>';
|
|
232
244
|
```
|
|
233
245
|
|
|
234
|
-
>
|
|
246
|
+
> **Connection 参数命名**:
|
|
247
|
+
> - 小写形式:`access_id` / `access_key`(推荐)
|
|
248
|
+
> - 大写形式:`ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`(也可以)
|
|
249
|
+
> - ⚠️ 写成 `ACCESS_KEY` / `SECRET_KEY` 会报错(必须使用带后缀的 `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`)
|
|
235
250
|
|
|
236
251
|
#### 步骤 3:创建外部 Volume(启用目录自动刷新)
|
|
237
252
|
|
|
@@ -248,7 +263,10 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS my_batch_volume
|
|
|
248
263
|
> - `USING CONNECTION`:引用已创建的存储连接
|
|
249
264
|
> - `DIRECTORY = (enable=true, auto_refresh=true)`:启用目录元数据并自动刷新,便于查询 Volume 中的文件列表
|
|
250
265
|
>
|
|
251
|
-
>
|
|
266
|
+
> **Volume 创建语法统一说明**:
|
|
267
|
+
> - ✅ 推荐语法:`LOCATION '...' USING CONNECTION conn_name`(官方文档标准写法)
|
|
268
|
+
> - ⚠️ 旧语法:`STORAGE_CONNECTION = conn_name LOCATION = '...'`(部分旧文档中出现,仍可使用)
|
|
269
|
+
> - 两种语法功能等价,建议统一使用 `LOCATION ... USING CONNECTION` 形式
|
|
252
270
|
|
|
253
271
|
#### 步骤 4a:INSERT INTO 从 Volume 导入(支持过滤转换)
|
|
254
272
|
|
|
@@ -289,6 +307,7 @@ FROM VOLUME my_batch_volume (
|
|
|
289
307
|
> - `INSERT INTO`:支持 `FILES()` 指定文件、`WHERE` 过滤转换,适合精细控制
|
|
290
308
|
> - `COPY INTO`:语法更简洁,适合全量加载
|
|
291
309
|
> - 两者都支持 Schema-on-Read(在 FROM VOLUME 中定义列)
|
|
310
|
+
> - ⚠️ **load_history 差异**:只有 `COPY INTO` 会记录到 `load_history`,`INSERT INTO ... FROM VOLUME` 不会记录。如需去重保护,请使用 `COPY INTO`
|
|
292
311
|
|
|
293
312
|
#### 步骤 5:验证导入结果
|
|
294
313
|
|
|
@@ -319,7 +338,7 @@ DESC PIPE EXTENDED my_oss_pipe;
|
|
|
319
338
|
|
|
320
339
|
```sql
|
|
321
340
|
-- 使用 LH_execute_query 执行
|
|
322
|
-
SELECT * FROM
|
|
341
|
+
SELECT * FROM load_history('my_schema.target_table')
|
|
323
342
|
ORDER BY last_load_time DESC
|
|
324
343
|
LIMIT 20;
|
|
325
344
|
```
|
|
@@ -376,10 +395,13 @@ DROP PIPE IF EXISTS my_oss_pipe;
|
|
|
376
395
|
| 问题 | 排查方向 |
|
|
377
396
|
|------|---------|
|
|
378
397
|
| PIPE 创建后无数据加载 | 1. `DESC PIPE EXTENDED` 检查是否暂停 2. 确认 Volume 路径下有新文件 3. 检查 COPY INTO 是否能独立运行 |
|
|
379
|
-
| LIST_PURGE 模式文件未被删除 | 确认 `
|
|
398
|
+
| LIST_PURGE 模式文件未被删除 | 确认 `PURGE=true` 已设置(位于语句末尾:`USING <format> [OPTIONS (...)] PURGE=true`);检查 Connection 的 AccessKey 是否有删除权限 |
|
|
399
|
+
| `PURGE=true` 语法错误 | OPTIONS 必须在 PURGE 之前:`USING CSV OPTIONS (...) PURGE=true`。不要写成 `USING CSV PURGE=true OPTIONS(...)` |
|
|
380
400
|
| EVENT_NOTIFICATION 模式无触发 | 1. 检查 MNS/SQS 队列是否收到消息 2. 确认 OSS 事件通知规则配置正确 3. 检查 Role ARN 授权 |
|
|
381
401
|
| 重复加载数据 | `load_history` 去重记录仅保留 7 天,超过 7 天的同名文件会被重新加载 |
|
|
382
402
|
| COPY_JOB_HINT 修改后部分参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需在一次 ALTER 中设置全部参数 |
|
|
403
|
+
| INSERT INTO FROM VOLUME 后 load_history 无记录 | 正常行为:只有 `COPY INTO` 会记录到 load_history,`INSERT INTO` 不会 |
|
|
404
|
+
| COPY INTO 报格式错误 | Volume 中有多种格式文件,使用 `FILES('xxx.json')` 指定文件 |
|
|
383
405
|
|
|
384
406
|
## 注意事项
|
|
385
407
|
|
|
@@ -399,4 +421,7 @@ DROP PIPE IF EXISTS my_oss_pipe;
|
|
|
399
421
|
- 同地域建议使用内网 Endpoint 以提升传输速度和稳定性
|
|
400
422
|
- 推荐使用 GENERAL PURPOSE 类型虚拟集群执行批量加载任务
|
|
401
423
|
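- INSERT INTO 方式支持 `FILES()` 和 `WHERE` 过滤;COPY INTO 不支持 `WHERE`,独立执行时可用 `FILES()` / `SUBDIRECTORY` 限定文件,但 PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数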
|
|
402
|
-
- Connection
|
|
424
|
+
- Connection 参数使用 `access_id`/`access_key`(小写)或 `ACCESS_KEY_ID`/`ACCESS_KEY_SECRET`(大写),不要用 `ACCESS_KEY`/`SECRET_KEY`
|
|
425
|
+
- ⚠️ `INSERT INTO ... FROM VOLUME` 不会记录到 `load_history`,只有 `COPY INTO` 会记录
|
|
426
|
+
- ⚠️ Volume 中有多种格式文件时,不指定 `FILES()` 的 COPY INTO 会尝试读取所有文件,可能因格式不匹配而失败。建议使用 `FILES('xxx.json')` 指定文件或 `SUBDIRECTORY` 指定子目录
|
|
427
|
+
- 上传文件到 OSS 后,`SHOW VOLUME DIRECTORY` 可能需要先执行 `ALTER VOLUME name REFRESH` 刷新目录元数据
|