@clickzetta/cz-cli-linux-x64 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -457
- package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
- package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
- package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
- package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
- package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -112
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -156
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
- package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -639
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
- package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -427
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -379
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
- package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
|
@@ -1,324 +0,0 @@
|
|
|
1
|
-
# Kafka Pipe SQL 语法参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/pipe-kafka 和 https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1
|
|
4
|
-
|
|
5
|
-
> **⚠️ ClickZetta READ_KAFKA 使用位置参数(positional parameters)**
|
|
6
|
-
> - ❌ 不支持 `=>` 命名参数语法(如 `KAFKA_BROKER => 'host:port'`)
|
|
7
|
-
> - ❌ 不支持 `TABLE(READ_KAFKA(...))` 包装
|
|
8
|
-
> - ✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))`
|
|
9
|
-
|
|
10
|
-
## CREATE PIPE(READ_KAFKA 方式)
|
|
11
|
-
|
|
12
|
-
```sql
|
|
13
|
-
CREATE [ OR REPLACE ] PIPE <pipe_name>
|
|
14
|
-
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
15
|
-
[ BATCH_INTERVAL_IN_SECONDS = '<seconds>' ]
|
|
16
|
-
[ BATCH_SIZE_PER_KAFKA_PARTITION = '<count>' ]
|
|
17
|
-
[ MAX_SKIP_BATCH_COUNT_ON_ERROR = '<count>' ]
|
|
18
|
-
[ INITIAL_DELAY_IN_SECONDS = '<seconds>' ]
|
|
19
|
-
[ RESET_KAFKA_GROUP_OFFSETS = '<offset_value>' ]
|
|
20
|
-
[ COPY_JOB_HINT = '<json>' ]
|
|
21
|
-
AS
|
|
22
|
-
COPY INTO <target_table> FROM (
|
|
23
|
-
SELECT <expr> [, ...]
|
|
24
|
-
FROM read_kafka(
|
|
25
|
-
'<bootstrap_servers>', -- 位置 1:Kafka 集群地址(必填)
|
|
26
|
-
'<topic_name>', -- 位置 2:Topic 名称(必填)
|
|
27
|
-
'', -- 位置 3:Topic pattern(保留,填空字符串)
|
|
28
|
-
'<group_id>', -- 位置 4:消费者组 ID(必填)
|
|
29
|
-
'', -- 位置 5:starting_offsets(Pipe 中留空)
|
|
30
|
-
'', -- 位置 6:ending_offsets(Pipe 中留空)
|
|
31
|
-
'', -- 位置 7:starting_timestamp(Pipe 中留空)
|
|
32
|
-
'', -- 位置 8:ending_timestamp(Pipe 中留空)
|
|
33
|
-
'raw', -- 位置 9:key 格式(目前只支持 raw)
|
|
34
|
-
'raw', -- 位置 10:value 格式(目前只支持 raw)
|
|
35
|
-
0, -- 位置 11:max_errors
|
|
36
|
-
MAP(<kafka_config>) -- 位置 12:Kafka 配置参数
|
|
37
|
-
)
|
|
38
|
-
);
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
### Pipe 参数说明
|
|
42
|
-
|
|
43
|
-
| 参数 | 必填 | 默认值 | 说明 |
|
|
44
|
-
|------|------|--------|------|
|
|
45
|
-
| `VIRTUAL_CLUSTER` | 是 | — | 执行 Pipe 任务的计算集群 |
|
|
46
|
-
| `BATCH_INTERVAL_IN_SECONDS` | 否 | 60 | 批处理间隔(秒),即数据新鲜度 |
|
|
47
|
-
| `BATCH_SIZE_PER_KAFKA_PARTITION` | 否 | 500000 | 每个 Kafka 分区每批最大消息数 |
|
|
48
|
-
| `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 否 | 30 | 出错时跳过批次的最大重试次数 |
|
|
49
|
-
| `INITIAL_DELAY_IN_SECONDS` | 否 | 0 | 首个作业调度延迟 |
|
|
50
|
-
| `RESET_KAFKA_GROUP_OFFSETS` | 否 | — | 启动时消费位点(仅创建时生效) |
|
|
51
|
-
| `COPY_JOB_HINT` | 否 | — | JSON 格式的作业参数 |
|
|
52
|
-
|
|
53
|
-
### RESET_KAFKA_GROUP_OFFSETS 可选值
|
|
54
|
-
|
|
55
|
-
| 值 | 说明 |
|
|
56
|
-
|----|------|
|
|
57
|
-
| `'none'` | 无操作,使用 Kafka `auto.offset.reset`(默认 latest) |
|
|
58
|
-
| `'valid'` | 检查当前位点是否过期,将过期分区重置到 earliest |
|
|
59
|
-
| `'earliest'` | 重置到最早位点 |
|
|
60
|
-
| `'latest'` | 重置到最新位点 |
|
|
61
|
-
| `'<毫秒时间戳>'` | 重置到指定时间戳对应位点(如 `'1737789688000'`) |
|
|
62
|
-
|
|
63
|
-
### READ_KAFKA 参数(在 Pipe 中 vs 独立使用)
|
|
64
|
-
|
|
65
|
-
| 特性 | 独立使用 read_kafka | 在 Pipe 中使用 |
|
|
66
|
-
|------|-------------------|---------------|
|
|
67
|
-
| 消费者组 | 临时,执行完即销毁 | 持久,保持消费位置 |
|
|
68
|
-
| 位置管理 | 在 MAP 中设置 `kafka.auto.offset.reset` | Pipe 自动管理,read_kafka 的位点参数(位置 5–8:starting/ending offsets 与 starting/ending timestamp)**必须留空** |
|
|
69
|
-
| 执行方式 | 一次性查询 | 持续调度执行 |
|
|
70
|
-
| 默认起始位置 | latest(可在 MAP 中改为 earliest) | latest(由 RESET_KAFKA_GROUP_OFFSETS 控制) |
|
|
71
|
-
|
|
72
|
-
### MAP 配置参数
|
|
73
|
-
|
|
74
|
-
| 参数 | 说明 |
|
|
75
|
-
|------|------|
|
|
76
|
-
| `kafka.security.protocol` | 安全协议:`PLAINTEXT` 或 `SASL_PLAINTEXT` |
|
|
77
|
-
| `kafka.sasl.mechanism` | SASL 机制:`PLAIN` |
|
|
78
|
-
| `kafka.sasl.username` | SASL 用户名 |
|
|
79
|
-
| `kafka.sasl.password` | SASL 密码 |
|
|
80
|
-
| `kafka.auto.offset.reset` | 独立探查时的起始位点(`earliest` / `latest`) |
|
|
81
|
-
| `cz.kafka.fetch.retry.enable` | 启用 fetch 重试(`true`/`false`) |
|
|
82
|
-
| `cz.kafka.fetch.retry.times` | 重试次数 |
|
|
83
|
-
| `cz.kafka.fetch.retry.intervalMs` | 重试间隔(毫秒) |
|
|
84
|
-
|
|
85
|
-
### JSON 字段提取语法
|
|
86
|
-
|
|
87
|
-
```sql
|
|
88
|
-
-- key 和 value 都是 binary 类型,需要先转换
|
|
89
|
-
value::string -- 转为字符串
|
|
90
|
-
parse_json(value::string) -- 解析为 JSON 对象
|
|
91
|
-
parse_json(value::string)['field']::TYPE -- 提取顶层字段
|
|
92
|
-
parse_json(value::string)['nested']['key']::TYPE -- 提取嵌套字段
|
|
93
|
-
|
|
94
|
-
-- 推荐模式:在子查询中先 parse_json,外层直接用 j['field']
|
|
95
|
-
SELECT j['order_id']::STRING, j['amount']::DECIMAL(10,2)
|
|
96
|
-
FROM (
|
|
97
|
-
SELECT parse_json(value::string) AS j
|
|
98
|
-
FROM read_kafka(...)
|
|
99
|
-
)
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### 完整示例
|
|
103
|
-
|
|
104
|
-
```sql
|
|
105
|
-
-- 无认证 Kafka Pipe
|
|
106
|
-
CREATE PIPE kafka_orders_pipe
|
|
107
|
-
VIRTUAL_CLUSTER = 'default'
|
|
108
|
-
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
109
|
-
AS
|
|
110
|
-
COPY INTO ods.orders FROM (
|
|
111
|
-
SELECT
|
|
112
|
-
j['order_id']::STRING AS order_id,
|
|
113
|
-
j['user_id']::STRING AS user_id,
|
|
114
|
-
j['amount']::DECIMAL(10,2) AS amount,
|
|
115
|
-
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
116
|
-
FROM (
|
|
117
|
-
SELECT `timestamp`, parse_json(value::string) AS j
|
|
118
|
-
FROM read_kafka(
|
|
119
|
-
'kafka.example.com:9092',
|
|
120
|
-
'orders',
|
|
121
|
-
'',
|
|
122
|
-
'lakehouse_orders',
|
|
123
|
-
'', '', '', '',
|
|
124
|
-
'raw', 'raw', 0,
|
|
125
|
-
MAP('kafka.security.protocol', 'PLAINTEXT')
|
|
126
|
-
)
|
|
127
|
-
)
|
|
128
|
-
);
|
|
129
|
-
|
|
130
|
-
-- SASL 认证 + 指定时间点消费
|
|
131
|
-
CREATE PIPE kafka_secure_pipe
|
|
132
|
-
VIRTUAL_CLUSTER = 'pipe_vc'
|
|
133
|
-
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
134
|
-
RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
|
|
135
|
-
AS
|
|
136
|
-
COPY INTO ods.secure_events FROM (
|
|
137
|
-
SELECT
|
|
138
|
-
j['id']::STRING AS event_id,
|
|
139
|
-
j['payload']::STRING AS payload,
|
|
140
|
-
CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
|
|
141
|
-
FROM (
|
|
142
|
-
SELECT `timestamp`, parse_json(value::string) AS j
|
|
143
|
-
FROM read_kafka(
|
|
144
|
-
'kafka.example.com:9092',
|
|
145
|
-
'secure_events',
|
|
146
|
-
'',
|
|
147
|
-
'cz_secure',
|
|
148
|
-
'', '', '', '',
|
|
149
|
-
'raw', 'raw', 0,
|
|
150
|
-
MAP(
|
|
151
|
-
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
152
|
-
'kafka.sasl.mechanism', 'PLAIN',
|
|
153
|
-
'kafka.sasl.username', 'my_user',
|
|
154
|
-
'kafka.sasl.password', 'my_password'
|
|
155
|
-
)
|
|
156
|
-
)
|
|
157
|
-
)
|
|
158
|
-
);
|
|
159
|
-
```
|
|
160
|
-
|
|
161
|
-
---
|
|
162
|
-
|
|
163
|
-
## 独立探查(验证连接和数据格式)
|
|
164
|
-
|
|
165
|
-
```sql
|
|
166
|
-
-- 无认证
|
|
167
|
-
SELECT value::string
|
|
168
|
-
FROM read_kafka(
|
|
169
|
-
'kafka.example.com:9092',
|
|
170
|
-
'orders',
|
|
171
|
-
'',
|
|
172
|
-
'test_explore',
|
|
173
|
-
'', '', '', '',
|
|
174
|
-
'raw', 'raw', 0,
|
|
175
|
-
MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
|
|
176
|
-
)
|
|
177
|
-
LIMIT 10;
|
|
178
|
-
|
|
179
|
-
-- SASL 认证
|
|
180
|
-
SELECT value::string
|
|
181
|
-
FROM read_kafka(
|
|
182
|
-
'kafka.example.com:9092',
|
|
183
|
-
'orders',
|
|
184
|
-
'',
|
|
185
|
-
'test_explore',
|
|
186
|
-
'', '', '', '',
|
|
187
|
-
'raw', 'raw', 0,
|
|
188
|
-
MAP(
|
|
189
|
-
'kafka.security.protocol', 'SASL_PLAINTEXT',
|
|
190
|
-
'kafka.sasl.mechanism', 'PLAIN',
|
|
191
|
-
'kafka.sasl.username', 'my_user',
|
|
192
|
-
'kafka.sasl.password', 'my_password',
|
|
193
|
-
'kafka.auto.offset.reset', 'earliest'
|
|
194
|
-
)
|
|
195
|
-
)
|
|
196
|
-
LIMIT 10;
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
---
|
|
200
|
-
|
|
201
|
-
## CREATE PIPE(Kafka 外部表 + Table Stream 方式)
|
|
202
|
-
|
|
203
|
-
### 步骤 1:创建 Kafka Storage Connection
|
|
204
|
-
|
|
205
|
-
```sql
|
|
206
|
-
CREATE STORAGE CONNECTION IF NOT EXISTS <conn_name>
|
|
207
|
-
TYPE KAFKA
|
|
208
|
-
BOOTSTRAP_SERVERS = ['<host1>:<port1>', '<host2>:<port2>']
|
|
209
|
-
SECURITY_PROTOCOL = 'PLAINTEXT';
|
|
210
|
-
```
|
|
211
|
-
|
|
212
|
-
### 步骤 2:创建 Kafka 外部表
|
|
213
|
-
|
|
214
|
-
```sql
|
|
215
|
-
-- ⚠️ 必须显式指定列定义(不能省略)
|
|
216
|
-
-- ⚠️ offset 和 timestamp 是保留字,必须用反引号转义
|
|
217
|
-
CREATE EXTERNAL TABLE <ext_table_name> (
|
|
218
|
-
topic STRING,
|
|
219
|
-
partition INT,
|
|
220
|
-
`offset` BIGINT,
|
|
221
|
-
`timestamp` TIMESTAMP,
|
|
222
|
-
timestamp_type STRING,
|
|
223
|
-
headers STRING,
|
|
224
|
-
key BINARY,
|
|
225
|
-
value BINARY
|
|
226
|
-
)
|
|
227
|
-
USING KAFKA
|
|
228
|
-
OPTIONS (
|
|
229
|
-
'group_id' = '<consumer_group>',
|
|
230
|
-
'topics' = '<topic_name>',
|
|
231
|
-
'starting_offset' = '<earliest | latest>'
|
|
232
|
-
)
|
|
233
|
-
CONNECTION <conn_name>;
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
> **注意**:
|
|
237
|
-
> - 列定义是**必须的**,省略会报错 `failed to detect columns`
|
|
238
|
-
> - `offset` 和 `timestamp` 是保留字,需要反引号转义
|
|
239
|
-
> - 删除外部表用 `DROP TABLE`(不是 `DROP EXTERNAL TABLE`)
|
|
240
|
-
|
|
241
|
-
### 步骤 3:创建 Table Stream
|
|
242
|
-
|
|
243
|
-
```sql
|
|
244
|
-
CREATE TABLE STREAM <stream_name>
|
|
245
|
-
ON TABLE <ext_table_name>
|
|
246
|
-
WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY');
|
|
247
|
-
```
|
|
248
|
-
|
|
249
|
-
### 步骤 4:创建 Pipe
|
|
250
|
-
|
|
251
|
-
```sql
|
|
252
|
-
CREATE PIPE <pipe_name>
|
|
253
|
-
VIRTUAL_CLUSTER = '<vcluster_name>'
|
|
254
|
-
BATCH_INTERVAL_IN_SECONDS = '60'
|
|
255
|
-
AS
|
|
256
|
-
COPY INTO <target_table>
|
|
257
|
-
SELECT <expr> [, ...]
|
|
258
|
-
FROM <stream_name>;
|
|
259
|
-
```
|
|
260
|
-
|
|
261
|
-
---
|
|
262
|
-
|
|
263
|
-
## ALTER PIPE
|
|
264
|
-
|
|
265
|
-
```sql
|
|
266
|
-
-- 暂停
|
|
267
|
-
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = true;
|
|
268
|
-
|
|
269
|
-
-- 恢复
|
|
270
|
-
ALTER PIPE <pipe_name> SET PIPE_EXECUTION_PAUSED = false;
|
|
271
|
-
|
|
272
|
-
-- 修改 VCluster
|
|
273
|
-
ALTER PIPE <pipe_name> SET VIRTUAL_CLUSTER = 'new_vc';
|
|
274
|
-
|
|
275
|
-
-- 修改 COPY_JOB_HINT
|
|
276
|
-
ALTER PIPE <pipe_name> SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
|
|
277
|
-
```
|
|
278
|
-
|
|
279
|
-
> ⚠️ **ALTER PIPE 支持的属性**:
|
|
280
|
-
> - ✅ `PIPE_EXECUTION_PAUSED`
|
|
281
|
-
> - ✅ `VIRTUAL_CLUSTER`
|
|
282
|
-
> - ✅ `COPY_JOB_HINT`
|
|
283
|
-
> - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持,需删除重建)
|
|
284
|
-
> - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持,需删除重建)
|
|
285
|
-
>
|
|
286
|
-
> 不支持修改 COPY/INSERT 语句逻辑,需删除 Pipe 后重建。
|
|
287
|
-
> 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
|
|
288
|
-
|
|
289
|
-
---
|
|
290
|
-
|
|
291
|
-
## 监控
|
|
292
|
-
|
|
293
|
-
```sql
|
|
294
|
-
-- 查看 Pipe 详情(含延迟信息 pipe_latency)
|
|
295
|
-
DESC PIPE EXTENDED <pipe_name>;
|
|
296
|
-
|
|
297
|
-
-- 查看所有 Pipe
|
|
298
|
-
SHOW PIPES;
|
|
299
|
-
|
|
300
|
-
-- 查看加载历史
|
|
301
|
-
SELECT * FROM load_history('<schema>.<table>')
|
|
302
|
-
ORDER BY last_load_time DESC LIMIT 20;
|
|
303
|
-
|
|
304
|
-
-- 通过 query_tag 查看 Pipe 作业
|
|
305
|
-
-- 格式:pipe.<workspace_name>.<schema_name>.<pipe_name>
|
|
306
|
-
SHOW JOBS WHERE query_tag = 'pipe.my_workspace.ods.kafka_orders_pipe';
|
|
307
|
-
```
|
|
308
|
-
|
|
309
|
-
---
|
|
310
|
-
|
|
311
|
-
## DROP PIPE
|
|
312
|
-
|
|
313
|
-
```sql
|
|
314
|
-
DROP PIPE [ IF EXISTS ] <pipe_name>;
|
|
315
|
-
```
|
|
316
|
-
|
|
317
|
-
## 参考文档
|
|
318
|
-
|
|
319
|
-
- [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
|
|
320
|
-
- [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
|
|
321
|
-
- [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
|
|
322
|
-
- [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
|
|
323
|
-
- [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
|
|
324
|
-
- [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
|
|
@@ -1,218 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-lakehouse-connect
|
|
3
|
-
description: |
|
|
4
|
-
Guide for connecting to ClickZetta Lakehouse via SDK/JDBC. Covers Python SDK (clickzetta.connect), ZettaPark Session (DataFrame API), SQLAlchemy (ORM/BI tools), and JDBC (Java). Use this skill when user needs to configure a connection from external tools or code. Trigger for: "Python SDK 连接", "JDBC 连接", "SQLAlchemy 配置", "ZettaPark 怎么用", "连接报错", "clickzetta-connector-python", "clickzetta-sqlalchemy".
|
|
5
|
-
Keywords: connection, Python SDK, JDBC, SQLAlchemy, ZettaPark, driver, connect
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
# ClickZetta Lakehouse 连接指南
|
|
9
|
-
|
|
10
|
-
## 指令
|
|
11
|
-
|
|
12
|
-
### 步骤 0:自动获取连接参数(优先)
|
|
13
|
-
|
|
14
|
-
**在询问用户之前,先尝试从本地配置文件自动读取连接参数。**
|
|
15
|
-
|
|
16
|
-
按以下优先级查找配置文件(找到第一个即停止):
|
|
17
|
-
1. `/app/.clickzetta/lakehouse_connection/connections.json`
|
|
18
|
-
2. `config/lakehouse_connection/connections.json`
|
|
19
|
-
3. `~/.clickzetta/connections.json`
|
|
20
|
-
4. `/app/.clickzetta/connections.json`
|
|
21
|
-
|
|
22
|
-
找到配置文件后:
|
|
23
|
-
- 解析 JSON,提取 `connections` 数组
|
|
24
|
-
- 根据用户描述的区域/环境匹配对应连接(如"阿里云上海"匹配 `service` 含 `cn-shanghai-alicloud` 的连接)
|
|
25
|
-
- 若有 `is_default: true` 且用户未指定区域,使用默认连接
|
|
26
|
-
- **不要将密码或完整配置输出到对话中**,仅内部使用
|
|
27
|
-
|
|
28
|
-
若配置文件不存在或无匹配连接,再向用户询问:service、instance、workspace、username、password、schema、vcluster。
|
|
29
|
-
|
|
30
|
-
### 步骤 1:确认连接方式
|
|
31
|
-
|
|
32
|
-
根据用户场景选择连接方式,阅读对应参考文件:
|
|
33
|
-
|
|
34
|
-
| 用户需求 | 参考文件 |
|
|
35
|
-
|:--|:--|
|
|
36
|
-
| Python 脚本 / 自动化 / 执行 SQL | [references/python-sdk.md](references/python-sdk.md) |
|
|
37
|
-
| DataFrame / 数据工程 | [references/zettapark-session.md](references/zettapark-session.md) |
|
|
38
|
-
| ORM / Web 应用 / BI 工具(Superset) | [references/sqlalchemy.md](references/sqlalchemy.md) |
|
|
39
|
-
| Java 应用 / BI 工具(DBeaver) | [references/jdbc.md](references/jdbc.md) |
|
|
40
|
-
| 多环境配置文件管理 | [references/config-file.md](references/config-file.md) |
|
|
41
|
-
|
|
42
|
-
不确定时参考决策树:
|
|
43
|
-
- 需要 DataFrame 操作 → ZettaPark Session
|
|
44
|
-
- 需要 ORM / SQLAlchemy 集成 → SQLAlchemy
|
|
45
|
-
- Java 应用 → JDBC
|
|
46
|
-
- 其他 Python 场景(含直接执行 SQL)→ Python SDK
|
|
47
|
-
|
|
48
|
-
### 步骤 2:确认 service 地址
|
|
49
|
-
|
|
50
|
-
`service` 参数必须包含区域前缀,根据实例所在区域选择:
|
|
51
|
-
|
|
52
|
-
**云器 Lakehouse(国内版,`clickzetta.com`)**
|
|
53
|
-
|
|
54
|
-
| 云厂商 | 区域 | service 地址 |
|
|
55
|
-
|:--|:--|:--|
|
|
56
|
-
| 阿里云 | 华东2(上海) | `cn-shanghai-alicloud.api.clickzetta.com` |
|
|
57
|
-
| 腾讯云 | 华东(上海) | `ap-shanghai-tencentcloud.api.clickzetta.com` |
|
|
58
|
-
| 腾讯云 | 华北(北京) | `ap-beijing-tencentcloud.api.clickzetta.com` |
|
|
59
|
-
| 腾讯云 | 华南(广州) | `ap-guangzhou-tencentcloud.api.clickzetta.com` |
|
|
60
|
-
| AWS | 中国(北京) | `cn-north-1-aws.api.clickzetta.com` |
|
|
61
|
-
|
|
62
|
-
**Singdata Lakehouse(国际版,`singdata.com`)**
|
|
63
|
-
|
|
64
|
-
| 云厂商 | 区域 | service 地址 |
|
|
65
|
-
|:--|:--|:--|
|
|
66
|
-
| 阿里云 | 亚太东南1(新加坡) | `ap-southeast-1-alicloud.api.singdata.com` |
|
|
67
|
-
| AWS | 亚太(新加坡) | `ap-southeast-1-aws.api.singdata.com` |
|
|
68
|
-
|
|
69
|
-
控制台:`https://{instance}.{region}.app.clickzetta.com`
|
|
70
|
-
|
|
71
|
-
### 步骤 3:执行查询或提供可运行代码
|
|
72
|
-
|
|
73
|
-
**若用户要求执行查询(如 SHOW SCHEMAS、SELECT、SHOW TABLES 等):**
|
|
74
|
-
|
|
75
|
-
1. 确认 `clickzetta-connector-python` 已安装:
|
|
76
|
-
```bash
|
|
77
|
-
pip3 show clickzetta-connector-python
|
|
78
|
-
```
|
|
79
|
-
若未安装,执行:`pip3 install clickzetta-connector-python --user`
|
|
80
|
-
|
|
81
|
-
2. 使用步骤 0 获取的连接参数直接执行查询,将结果格式化后展示给用户。
|
|
82
|
-
|
|
83
|
-
**若用户要求生成代码:**
|
|
84
|
-
|
|
85
|
-
阅读对应参考文件后,根据参数生成完整可运行代码。生成的代码必须显式包含全部连接参数(service、instance、workspace、schema、username、password、vcluster);用户未指定 `vcluster` 时,使用默认值 `default_ap`。
|
|
86
|
-
|
|
87
|
-
密码含特殊字符时(SQLAlchemy URI),提醒用户用 `urllib.parse.quote_plus()` 编码。
|
|
88
|
-
|
|
89
|
-
## 示例
|
|
90
|
-
|
|
91
|
-
### 示例 0:自动读取配置并执行查询
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
import json, os, clickzetta
|
|
95
|
-
|
|
96
|
-
# 按优先级查找配置文件
|
|
97
|
-
config_paths = [
|
|
98
|
-
"/app/.clickzetta/lakehouse_connection/connections.json",
|
|
99
|
-
"config/lakehouse_connection/connections.json",
|
|
100
|
-
os.path.expanduser("~/.clickzetta/connections.json"),
|
|
101
|
-
"/app/.clickzetta/connections.json",
|
|
102
|
-
]
|
|
103
|
-
config = None
|
|
104
|
-
for path in config_paths:
|
|
105
|
-
if os.path.exists(path):
|
|
106
|
-
with open(path) as f:
|
|
107
|
-
config = json.load(f)
|
|
108
|
-
break
|
|
109
|
-
|
|
110
|
-
# 选择目标连接(示例:匹配阿里云上海)
|
|
111
|
-
conn_cfg = next(
|
|
112
|
-
(c for c in config["connections"] if "cn-shanghai-alicloud" in c.get("service", "")),
|
|
113
|
-
None
|
|
114
|
-
) or next((c for c in config["connections"] if c.get("is_default")), config["connections"][0])
|
|
115
|
-
|
|
116
|
-
conn = clickzetta.connect(
|
|
117
|
-
service=conn_cfg["service"],
|
|
118
|
-
instance=conn_cfg["instance"],
|
|
119
|
-
workspace=conn_cfg["workspace"],
|
|
120
|
-
schema=conn_cfg.get("schema", "public"),
|
|
121
|
-
username=conn_cfg["username"],
|
|
122
|
-
password=conn_cfg["password"],
|
|
123
|
-
vcluster=conn_cfg.get("vcluster", "default_ap")
|
|
124
|
-
)
|
|
125
|
-
cursor = conn.cursor()
|
|
126
|
-
cursor.execute("SHOW SCHEMAS")
|
|
127
|
-
for row in cursor.fetchall():
|
|
128
|
-
print(row[0])
|
|
129
|
-
cursor.close()
|
|
130
|
-
conn.close()
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
### 示例 1:Python SDK 连接并查询
|
|
134
|
-
|
|
135
|
-
```python
|
|
136
|
-
import clickzetta
|
|
137
|
-
|
|
138
|
-
conn = clickzetta.connect(
|
|
139
|
-
service="cn-shanghai-alicloud.api.clickzetta.com",
|
|
140
|
-
instance="my_instance",
|
|
141
|
-
workspace="my_workspace",
|
|
142
|
-
schema="public",
|
|
143
|
-
username="my_user",
|
|
144
|
-
password="my_password",
|
|
145
|
-
vcluster="default_ap"
|
|
146
|
-
)
|
|
147
|
-
cursor = conn.cursor()
|
|
148
|
-
cursor.execute("SELECT * FROM orders LIMIT 10")
|
|
149
|
-
for row in cursor.fetchall():
|
|
150
|
-
print(row)
|
|
151
|
-
cursor.close()
|
|
152
|
-
conn.close()
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
### 示例 2:ZettaPark 按 region 汇总 revenue
|
|
156
|
-
|
|
157
|
-
```python
|
|
158
|
-
from clickzetta.zettapark.session import Session
|
|
159
|
-
from clickzetta.zettapark import functions as F
|
|
160
|
-
|
|
161
|
-
session = Session.builder.configs({
|
|
162
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
163
|
-
"instance": "my_instance", "workspace": "my_workspace",
|
|
164
|
-
"schema": "public", "username": "my_user",
|
|
165
|
-
"password": "my_password", "vcluster": "default_ap"
|
|
166
|
-
}).create()
|
|
167
|
-
|
|
168
|
-
session.table("sales") \
|
|
169
|
-
.group_by(F.col("region")) \
|
|
170
|
-
.agg(F.sum("revenue").as_("total_revenue")) \
|
|
171
|
-
.write.save_as_table("sales_summary", mode="overwrite")
|
|
172
|
-
session.close()
|
|
173
|
-
```
|
|
174
|
-
|
|
175
|
-
## 故障排除
|
|
176
|
-
|
|
177
|
-
| 错误信息 | 原因 | 解决方案 |
|
|
178
|
-
|:--|:--|:--|
|
|
179
|
-
| `Connection refused` | service 地址不正确或网络不通 | 检查 service 是否匹配区域(参见步骤 2 区域表) |
|
|
180
|
-
| `Authentication failed` | 用户名或密码错误 | 核实 username 和 password |
|
|
181
|
-
| `Workspace not found` | 工作空间名称不存在 | 在控制台确认 workspace 拼写 |
|
|
182
|
-
| `Instance not found` | 实例名称不存在 | 在控制台确认 instance 拼写 |
|
|
183
|
-
| `Timeout` | 查询超时 | 增大 `hints` 中的 `sdk.job.timeout`(默认 300 秒) |
|
|
184
|
-
| `VCluster not available` | 虚拟集群未启动或名称错误 | 确认 vcluster 名称,检查集群状态 |
|
|
185
|
-
| SQLAlchemy URL 解析错误 | 密码含特殊字符 | 用 `urllib.parse.quote_plus()` 对密码 URL 编码 |
|
|
186
|
-
| `ClassNotFoundException` | JDBC 驱动未在 classpath | 确保 `clickzetta-java` JAR 已加入 classpath |
|
|
187
|
-
|
|
188
|
-
## 安装
|
|
189
|
-
|
|
190
|
-
> ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10)。Python 3.9 及以下不支持。
|
|
191
|
-
|
|
192
|
-
| 连接方式 | 安装命令 |
|
|
193
|
-
|:--|:--|
|
|
194
|
-
| Python SDK | `pip install clickzetta-connector-python -i https://pypi.tuna.tsinghua.edu.cn/simple` |
|
|
195
|
-
| ZettaPark | `pip install clickzetta-zettapark-python -i https://pypi.tuna.tsinghua.edu.cn/simple` |
|
|
196
|
-
| SQLAlchemy | `pip install clickzetta-connector-python clickzetta-sqlalchemy -i https://pypi.tuna.tsinghua.edu.cn/simple` |
|
|
197
|
-
| JDBC | Maven: `com.clickzetta:clickzetta-java` |
|
|
198
|
-
|
|
199
|
-
```bash
|
|
200
|
-
# 方式 1:venv(Python 内置,推荐)
|
|
201
|
-
python3.12 -m venv .venv
|
|
202
|
-
source .venv/bin/activate # macOS/Linux
|
|
203
|
-
# .venv\Scripts\activate # Windows
|
|
204
|
-
pip install clickzetta-connector-python clickzetta-zettapark-python \
|
|
205
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
206
|
-
|
|
207
|
-
# 方式 2:pyenv(需要切换 Python 版本时)
|
|
208
|
-
pyenv install 3.12.9
|
|
209
|
-
pyenv local 3.12.9
|
|
210
|
-
python -m venv .venv && source .venv/bin/activate
|
|
211
|
-
pip install clickzetta-connector-python clickzetta-zettapark-python \
|
|
212
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
213
|
-
|
|
214
|
-
# 方式 3:conda(数据科学环境)
|
|
215
|
-
conda create -n lakehouse python=3.12 -y && conda activate lakehouse
|
|
216
|
-
pip install clickzetta-connector-python clickzetta-zettapark-python \
|
|
217
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
218
|
-
```
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"skill_name": "clickzetta-lakehouse-connect",
|
|
3
|
-
"evals": [
|
|
4
|
-
{
|
|
5
|
-
"id": 1,
|
|
6
|
-
"prompt": "我需要用 Python 连接 ClickZetta,实例名是 my_instance,工作空间是 analytics,region 是上海阿里云,用户名 alice,密码 secret123。帮我写一段查询 orders 表前 10 行的代码。",
|
|
7
|
-
"expected_output": "使用 clickzetta.connect() 或 clickzetta-connector-python,包含所有必填参数(service/instance/workspace/schema/username/password/vcluster),并演示 cursor.execute + fetchall 查询",
|
|
8
|
-
"files": []
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": 2,
|
|
12
|
-
"prompt": "我想用 ZettaPark 做数据工程,需要读取 sales 表,按 region 分组求 revenue 总和,然后写回到 sales_summary 表。帮我写完整代码。",
|
|
13
|
-
"expected_output": "使用 Session.builder.configs().create(),展示 session.table() + group_by + agg + write.save_as_table(),包含连接参数配置",
|
|
14
|
-
"files": []
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
"id": 3,
|
|
18
|
-
"prompt": "我在用 Apache Superset 连接 ClickZetta,SQLAlchemy URI 应该怎么填?密码是 P@ss#2024,需要注意什么?",
|
|
19
|
-
"expected_output": "提供正确的 clickzetta:// URI 格式,指出密码特殊字符需要 quote_plus 编码,给出编码后的示例",
|
|
20
|
-
"files": []
|
|
21
|
-
},
|
|
22
|
-
{
|
|
23
|
-
"id": 4,
|
|
24
|
-
"prompt": "连接云器 Lakehouse 报错 Connection refused,我的 service 填的是 api.clickzetta.com,实例在上海腾讯云,怎么排查?",
|
|
25
|
-
"expected_output": "识别 service 地址填错,给出正确的上海腾讯云地址 ap-shanghai-tencentcloud.api.clickzetta.com,并提供排查步骤",
|
|
26
|
-
"files": []
|
|
27
|
-
},
|
|
28
|
-
{
|
|
29
|
-
"id": 5,
|
|
30
|
-
"prompt": "我有三个环境:dev/staging/prod,都在同一个 ClickZetta 实例上但不同 workspace。想用 connections.json 统一管理,并在代码里切换。怎么配置?",
|
|
31
|
-
"expected_output": "提供 connections.json 多连接配置示例(含 is_default),展示 switch_connection() 用法,说明文件放置路径",
|
|
32
|
-
"files": []
|
|
33
|
-
}
|
|
34
|
-
]
|
|
35
|
-
}
|