@clickzetta/cz-cli-darwin-arm64 0.3.92 → 0.3.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,223 +1,207 @@
1
1
  ---
2
2
  name: clickzetta-kafka-ingest-pipeline
3
3
  description: |
4
- 搭建 ClickZetta Lakehouse Kafka 数据接入管道,覆盖从连接验证、数据探查、目标表创建
5
- Pipe 持续导入的端到端工作流。支持两种接入路径:READ_KAFKA Pipe(推荐)和
6
- Kafka 外部表 + Table Stream Pipe。
7
- 当用户说"Kafka 接入"、"Kafka 导入"、"Kafka Pipe"、"read_kafka"、"Kafka 数据管道"、
8
- "Kafka 外部表"、"Kafka 消费"、"消息队列导入"、"Kafka 到 Lakehouse"、
9
- "Kafka 实时导入"、"Kafka 持续导入"、"Kafka topic 导入"、"Kafka JSON 解析"、
10
- "Kafka 延迟监控"、"Kafka 积压"时触发。
11
- 包含 READ_KAFKA 函数探查、JSON 多层嵌套解析、Kafka Pipe DDL、Kafka 外部表 + Table Stream、
12
- SASL 认证配置、生产调优(BATCH_SIZE / COPY_JOB_HINT / VCluster 规格)、
13
- 延迟监控(pipe_latency / query_tag)等 ClickZetta 特有逻辑。
14
- Keywords: Kafka, READ_KAFKA, Pipe, streaming ingestion, topic, consumer
4
+ Build Kafka-to-Lakehouse ingestion pipelines using READ_KAFKA Pipe or Kafka External Table + Table Stream.
5
+ Covers: connection validation, JSON/CSV parsing, Pipe DDL, SASL auth, VCluster sizing, latency monitoring, tuning.
6
+ Triggers: Kafka ingestion, Kafka Pipe, read_kafka, Kafka external table, Kafka consumer, message queue import, Kafka backlog.
15
7
  ---
16
8
 
17
- # Kafka 数据接入管道工作流
9
+ # Kafka Data Ingestion Pipeline Workflow
18
10
 
19
- ## 向导:收集必要信息
11
+ > **Compatibility**: Requires ClickZetta Lakehouse with Pipe support (v2.0+). All SQL executed via `cz-cli sql --sync`.
20
12
 
21
- 开始搭建 Kafka 管道前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
13
+ ## Quick Start (Simple JSON, No Auth)
14
+
15
+ For the fastest path — flat JSON topic, PLAINTEXT, default settings:
16
+
17
+ ```bash
18
+ # 1. Verify connectivity
19
+ cz-cli sql "SELECT value::string FROM read_kafka('BROKER:9092','TOPIC','','test_explore','','','','','raw','raw',0,MAP('kafka.security.protocol','PLAINTEXT','kafka.auto.offset.reset','earliest')) LIMIT 5" --sync
20
+
21
+ # 2. Create target table
22
+ cz-cli sql "CREATE TABLE IF NOT EXISTS ods.my_table (id STRING, name STRING, amount DECIMAL(10,2), __kafka_timestamp__ TIMESTAMP)" --sync
23
+
24
+ # 3. Create Pipe
25
+ cz-cli sql "CREATE PIPE my_pipe VIRTUAL_CLUSTER='default' BATCH_INTERVAL_IN_SECONDS='60' AS COPY INTO ods.my_table FROM (SELECT j['id']::STRING, j['name']::STRING, j['amount']::DECIMAL(10,2), CAST(\`timestamp\` AS TIMESTAMP) FROM (SELECT \`timestamp\`, parse_json(value::string) AS j FROM read_kafka('BROKER:9092','TOPIC','','cz_my_group','','','','','raw','raw',0,MAP('kafka.security.protocol','PLAINTEXT'))))" --sync
26
+
27
+ # 4. Verify
28
+ cz-cli sql "DESC PIPE EXTENDED my_pipe" --sync
29
+ cz-cli sql "SELECT COUNT(*) FROM ods.my_table" --sync
30
+ ```
31
+
32
+ Replace `BROKER:9092`, `TOPIC`, the field names, and the column types with your own values. The Pipe starts automatically once it is created.
33
+
34
+ > **Tip**: If backtick escaping causes issues in your shell, write the SQL to a file and run `cz-cli sql -f pipe.sql --sync` instead.
35
+
36
+ ---
37
+
38
+ ## Decision Tree
39
+
40
+ ```
41
+ User wants Kafka → Lakehouse ingestion
42
+
43
+ ├─ Message format?
44
+ │ ├─ JSON (flat) → Standard path, parse_json + field extraction
45
+ │ ├─ JSON (nested) → Layer-by-layer parse_json unwrapping (see Step 2 below)
46
+ │ ├─ CSV → split(value::string, ',')[N] extraction (see the CSV example in Step 2)
47
+ │ └─ Avro / Protobuf → NOT SUPPORTED natively; land as raw BINARY, decode downstream
48
+
49
+ ├─ Ingestion path?
50
+ │ ├─ READ_KAFKA Pipe (default)
51
+ │ │ → Simpler, fewer objects, supports complex SQL in COPY INTO
52
+ │ │ → Use for: most scenarios
53
+ │ │
54
+ │ └─ Kafka External Table + Table Stream Pipe
55
+ │ → Retains raw messages in external table
56
+ │ → Multiple downstream consumers can read same topic independently
57
+ │ → Use for: audit trail, multi-consumer fan-out
58
+
59
+ ├─ Authentication?
60
+ │ ├─ None → MAP('kafka.security.protocol','PLAINTEXT')
61
+ │ ├─ SASL_PLAINTEXT → Add sasl.mechanism, username, password to MAP
62
+ │ └─ SSL / mTLS → NOT SUPPORTED by Kafka Pipe
63
+
64
+ └─ Starting offset?
65
+ ├─ Latest (default) → Omit RESET_KAFKA_GROUP_OFFSETS
66
+ ├─ Earliest → RESET_KAFKA_GROUP_OFFSETS = 'earliest'
67
+ └─ Specific time → RESET_KAFKA_GROUP_OFFSETS = '<epoch_millis>'
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Wizard: Collect Required Information
73
+
74
+ Before building a Kafka pipeline, use an interactive Q&A tool (e.g., `question`) to collect the following information. If no such tool is available, list all of the questions at once as plain text:
22
75
 
23
76
  ```
24
77
  question({
25
78
  questions: [
26
79
  {
27
- question: "Kafka 消息格式?",
80
+ question: "Kafka message format?",
28
81
  options: [
29
- { label: "JSON(简单结构)", description: "顶层字段直接映射" },
30
- { label: "JSON(嵌套结构)", description: "需要 JSONPath 解析嵌套字段" },
31
- { label: "CSV", description: "逗号分隔文本" },
32
- { label: "Avro / 其他", description: "需要额外配置" }
82
+ { label: "JSON (flat)", description: "Top-level fields map directly" },
83
+ { label: "JSON (nested)", description: "Requires layer-by-layer parse_json" },
84
+ { label: "CSV", description: "Comma-separated, use split()" },
85
+ { label: "Avro / Protobuf / Other", description: "Land as raw binary, decode downstream" }
33
86
  ]
34
87
  },
35
88
  {
36
- question: "接入路径?",
89
+ question: "Ingestion path?",
37
90
  options: [
38
- { label: "READ_KAFKA Pipe(推荐)", description: "通用场景,支持复杂 SQL 转换" },
39
- { label: "Kafka 外部表 + Table Stream", description: "需要保留原始消息或多个下游消费同一 Topic" }
91
+ { label: "READ_KAFKA Pipe (recommended)", description: "General use case, fewer objects" },
92
+ { label: "Kafka External Table + Table Stream", description: "Retain raw messages or multi-consumer fan-out" }
93
+ ]
94
+ },
95
+ {
96
+ question: "Authentication?",
97
+ options: [
98
+ { label: "None (PLAINTEXT)", description: "No credentials needed" },
99
+ { label: "SASL_PLAINTEXT", description: "Username/password authentication" }
40
100
  ]
41
101
  }
42
102
  ]
43
103
  })
44
104
  ```
45
105
 
46
- **如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
106
+ **If the user has already provided sufficient information, skip the wizard and proceed directly.**
47
107
 
48
108
  ---
49
109
 
50
- ## 适用场景
51
-
52
- - 将 Kafka Topic 数据持续导入 ClickZetta Lakehouse 表
53
- - 需要近实时(分钟级)数据新鲜度
54
- - Kafka 消息格式为 JSON / CSV / Avro
55
- - 需要在导入前对 JSON 消息进行多层嵌套解析和转换
56
- - 关键词:Kafka Pipe、read_kafka、Kafka 外部表、消息队列导入、Kafka 持续导入
57
-
58
- ## 两种接入路径
59
-
60
- | 路径 | 适用场景 | 核心对象 |
61
- |------|---------|---------|
62
- | **READ_KAFKA Pipe**(推荐) | 通用场景,支持复杂 SQL 转换 | `CREATE PIPE ... AS COPY INTO ... FROM (SELECT ... FROM read_kafka(...))` |
63
- | **Kafka 外部表 + Table Stream Pipe** | 需要先落原始数据再增量消费 | Kafka 外部表 → Table Stream → Pipe `INSERT INTO ... SELECT` |
64
-
65
- **选择建议**:大多数场景用 READ_KAFKA Pipe 即可,更简洁高效。Kafka 外部表路径适合需要保留原始消息、多个下游消费同一 Topic 的场景。
66
-
67
- ## 前置依赖
68
-
69
- - ClickZetta Lakehouse 账户,具备创建 Pipe、表、VCluster 等权限
70
- - Kafka 集群网络可达(确认 bootstrap 地址和端口)
71
- - 已知 Kafka Topic 名称和消息格式
72
- - 认证信息(如需要):SASL 用户名/密码
73
- - **执行环境**:已安装并配置 cz-cli
74
-
75
- ## 执行环境
76
-
77
- 所有 SQL 通过 `cz-cli sql` 执行:
110
+ ## Key Constraints
78
111
 
79
- ```bash
80
- cz-cli --version # 确认 cz-cli 可用
81
- cz-cli sql "SELECT 1" --sync # 验证连接
82
- ```
83
-
84
- 需要 cz-cli,请参考官方文档安装并完成配置后重试。
85
-
86
- ## ⚠️ 关键注意事项
87
-
88
- - Kafka Pipe 仅支持 **PLAINTEXT** 和 **SASL_PLAINTEXT** 两种安全协议,不支持 SSL 证书方式
89
- - Pipe 创建后**自动启动**,无需手动 RESUME
90
- - Pipe 不支持修改 COPY 语句逻辑,需删除后重建
91
- - 建议为 Kafka Pipe 分配**专用 GP 集群**,避免与其他查询争抢资源
92
- - `RESET_KAFKA_GROUP_OFFSETS` 仅在创建时生效,会强制改写消费位点,谨慎使用
112
+ - Kafka Pipe supports only **PLAINTEXT** and **SASL_PLAINTEXT** (no SSL/mTLS)
113
+ - Pipe **starts automatically** after creation — no manual RESUME needed
114
+ - Pipe SQL logic cannot be altered — must DROP + CREATE to change the SELECT
115
+ - `CREATE OR REPLACE PIPE` is **not supported** — use DROP then CREATE
116
+ - `RESET_KAFKA_GROUP_OFFSETS` only takes effect at creation time and forcibly overwrites the consumer group's offsets — use with caution
117
+ - `topic_pattern` (position 3) is **reserved and unused** — always pass empty string `''`
118
+ - Recommend a **dedicated GP VCluster** for Kafka Pipe to avoid resource contention
93
119
 
94
120
  ---
95
121
 
96
- ## 路径一:READ_KAFKA Pipe(推荐)
122
+ ## Path One: READ_KAFKA Pipe (Recommended)
97
123
 
98
- ### 步骤 1:验证 Kafka 连接和探查数据
124
+ ### Step 1: Validate Kafka Connection
99
125
 
100
- 先用 `READ_KAFKA` 函数验证网络连通性和消息格式:
101
-
102
- > ⚠️ **READ_KAFKA 使用位置参数(positional parameters)**,不支持 `=>` 命名参数语法。参数顺序固定,不可省略。
126
+ > ⚠️ READ_KAFKA uses **positional parameters only**. No `=>` named params, no `TABLE()` wrapper.
127
+ > Full parameter reference: see `references/kafka-pipe-syntax.md`
103
128
 
104
129
  ```sql
105
- -- 无认证 Kafka(位置参数语法)
106
- SELECT *
130
+ SELECT value::string
107
131
  FROM read_kafka(
108
- 'kafka.example.com:9092', -- bootstrap_servers(必填)
109
- 'orders', -- topic(必填)
110
- '', -- topic_pattern(保留,填空字符串)
111
- 'test_explore', -- group_id(必填)
112
- '', -- starting_offsets(探查时可填 'earliest',或留空用默认 latest)
113
- '', -- ending_offsets(留空)
114
- '', -- starting_timestamp(留空)
115
- '', -- ending_timestamp(留空)
116
- 'raw', -- key_format(目前只支持 raw)
117
- 'raw', -- value_format(目前只支持 raw)
118
- 0, -- max_errors
132
+ 'kafka.example.com:9092', -- bootstrap_servers
133
+ 'orders', -- topic
134
+ '', -- reserved (always empty)
135
+ 'test_explore', -- group_id (use temp name for exploration)
136
+ '', '', '', '', -- offsets/timestamps (leave empty)
137
+ 'raw', 'raw', 0, -- key_format, value_format, max_errors
119
138
  MAP(
120
139
  'kafka.security.protocol', 'PLAINTEXT',
121
140
  'kafka.auto.offset.reset', 'earliest'
122
141
  )
123
142
  )
124
143
  LIMIT 10;
144
+ ```
125
145
 
126
- -- SASL_PLAINTEXT 认证
127
- SELECT *
128
- FROM read_kafka(
129
- 'kafka.example.com:9092',
130
- 'orders',
131
- '',
132
- 'test_explore',
133
- '', '', '', '',
134
- 'raw',
135
- 'raw',
136
- 0,
137
- MAP(
138
- 'kafka.security.protocol', 'SASL_PLAINTEXT',
139
- 'kafka.sasl.mechanism', 'PLAIN',
140
- 'kafka.sasl.username', 'my_user',
141
- 'kafka.sasl.password', 'my_password',
142
- 'kafka.auto.offset.reset', 'earliest'
143
- )
146
+ For SASL authentication, add to MAP:
147
+ ```sql
148
+ MAP(
149
+ 'kafka.security.protocol', 'SASL_PLAINTEXT',
150
+ 'kafka.sasl.mechanism', 'PLAIN',
151
+ 'kafka.sasl.username', 'my_user',
152
+ 'kafka.sasl.password', 'my_password',
153
+ 'kafka.auto.offset.reset', 'earliest'
144
154
  )
145
- LIMIT 10;
146
155
  ```
147
156
 
148
- > **参数说明**:
149
- > - 探查用的 `group_id` 建议用临时名称(如 `test_explore`),避免影响正式消费组
150
- > - `kafka.auto.offset.reset` 在 MAP 中设置为 `'earliest'` 可读取历史数据
151
- > - key 和 value 都是 binary 类型,需要 CAST 转换后使用
152
- > - **多 Broker 地址格式**:用逗号分隔多个 broker,Pipe 会自动故障转移
153
- > - ✅ 推荐:`'broker1:9092,broker2:9092,broker3:9092'`(高可用)
154
- > - ⚠️ 单 broker:`'broker1:9092'`(无故障转移,不推荐生产使用)
157
+ > **Multi-broker format**: `'broker1:9092,broker2:9092,broker3:9092'` (recommended for HA)
155
158
 
156
- ### 步骤 2:探查 JSON 结构并确定目标表 Schema
157
-
158
- Kafka 的 key 和 value 都是 binary 类型。用 `value::string` 转换后查看内容,用 `parse_json()` 解析 JSON:
159
+ ### Step 2: Explore Schema and Parse Messages
159
160
 
161
+ **JSON (flat)**:
160
162
  ```sql
161
- -- 将 value 转为字符串查看原始内容
162
- SELECT key::string, value::string
163
- FROM read_kafka(
164
- 'kafka.example.com:9092',
165
- 'orders',
166
- '',
167
- 'test_schema',
168
- '', '', '', '',
169
- 'raw', 'raw', 0,
170
- MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
171
- )
172
- LIMIT 5;
173
-
174
- -- 解析 JSON 字段(使用 parse_json)
175
163
  SELECT
176
164
  j['order_id']::STRING AS order_id,
177
- j['user_id']::STRING AS user_id,
178
165
  j['amount']::DECIMAL(10,2) AS amount,
179
- j['status']::STRING AS status,
180
166
  timestamp_millis(j['created_at']::BIGINT) AS created_at
181
167
  FROM (
182
168
  SELECT parse_json(value::string) AS j
183
- FROM read_kafka(
184
- 'kafka.example.com:9092',
185
- 'orders',
186
- '',
187
- 'test_schema',
188
- '', '', '', '',
189
- 'raw', 'raw', 0,
190
- MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
191
- )
169
+ FROM read_kafka('kafka:9092','orders','','test_schema','','','','','raw','raw',0,
170
+ MAP('kafka.security.protocol','PLAINTEXT','kafka.auto.offset.reset','earliest'))
192
171
  LIMIT 5
193
172
  );
173
+ ```
194
174
 
195
- -- 多层嵌套 JSON 解析(逐层 parse_json 展开)
175
+ **JSON (nested)** — unwrap layer by layer:
176
+ ```sql
196
177
  SELECT
197
178
  j['id']::STRING AS id,
198
- j['type']::STRING AS event_type,
199
179
  parse_json(j['event']::STRING)['action']::STRING AS action,
200
180
  parse_json(parse_json(j['event']::STRING)['payload']::STRING)['ref']::STRING AS ref
201
181
  FROM (
202
182
  SELECT parse_json(value::string) AS j
203
- FROM read_kafka(
204
- 'kafka.example.com:9092',
205
- 'events',
206
- '',
207
- 'test_nested',
208
- '', '', '', '',
209
- 'raw', 'raw', 0,
210
- MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
211
- )
183
+ FROM read_kafka('kafka:9092','events','','test_nested','','','','','raw','raw',0,
184
+ MAP('kafka.security.protocol','PLAINTEXT','kafka.auto.offset.reset','earliest'))
212
185
  LIMIT 5
213
186
  );
214
187
  ```
215
188
 
216
- > **最佳实践**:在 SELECT 中将所有嵌套 JSON 字符串都 `parse_json` 展开后再落表,避免下游查询重复计算。
189
+ **CSV** — use `split()`:
190
+ ```sql
191
+ SELECT
192
+ split(value::string, ',')[0] AS id,
193
+ split(value::string, ',')[1] AS name,
194
+ CAST(split(value::string, ',')[2] AS DECIMAL(10,2)) AS amount
195
+ FROM read_kafka('kafka:9092','csv_topic','','test_csv','','','','','raw','raw',0,
196
+ MAP('kafka.security.protocol','PLAINTEXT','kafka.auto.offset.reset','earliest'))
197
+ LIMIT 5;
198
+ ```
199
+
200
+ **Avro / Protobuf**: Not natively supported for parsing. Land as raw binary (`value` column) and decode in a downstream Dynamic Table or external process.
217
201
 
218
- ### 步骤 3:创建目标表
202
+ > **Best Practice**: Unwrap all nested JSON with `parse_json` in the Pipe SELECT to avoid repeated computation downstream.
219
203
 
220
- 根据探查结果创建目标表:
204
+ ### Step 3: Create Target Table
221
205
 
222
206
  ```sql
223
207
  CREATE TABLE IF NOT EXISTS ods.kafka_orders (
@@ -226,28 +210,27 @@ CREATE TABLE IF NOT EXISTS ods.kafka_orders (
226
210
  amount DECIMAL(10,2),
227
211
  status STRING,
228
212
  created_at TIMESTAMP,
229
- __kafka_timestamp__ TIMESTAMP COMMENT 'Kafka 消息时间戳,用于端到端延迟监控'
213
+ __kafka_timestamp__ TIMESTAMP COMMENT 'Kafka message timestamp for e2e latency monitoring'
230
214
  );
231
215
  ```
232
216
 
233
- > 建议额外添加 `__kafka_timestamp__` 字段记录 Kafka 消息时间戳,用于后续端到端延迟监控。
217
+ > Always add `__kafka_timestamp__` for end-to-end latency monitoring.
234
218
 
235
- ### 步骤 4:创建专用 VCluster(推荐)
219
+ ### Step 4: Create Dedicated VCluster (Recommended)
236
220
 
237
221
  ```sql
238
222
  CREATE VCLUSTER IF NOT EXISTS pipe_kafka_vc
239
223
  VCLUSTER_TYPE = GENERAL
240
224
  VCLUSTER_SIZE = 4
241
225
  AUTO_SUSPEND_IN_SECOND = 0
242
- COMMENT 'Kafka Pipe 专用集群,常驻运行';
226
+ COMMENT 'Dedicated always-on cluster for Kafka Pipe';
243
227
  ```
244
228
 
245
- > 数据新鲜度要求 1 分钟时,建议 VCluster 常驻(`AUTO_SUSPEND_IN_SECOND = 0`),避免冷启动延迟。
229
+ > When the freshness target is about one minute, set `AUTO_SUSPEND_IN_SECOND = 0` to keep the VCluster always on and avoid cold-start latency.
246
230
 
247
- ### 步骤 5:创建 Kafka Pipe
231
+ ### Step 5: Create Kafka Pipe
248
232
 
249
233
  ```sql
250
- -- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,需用 CREATE PIPE 或先 DROP 再 CREATE
251
234
  CREATE PIPE kafka_orders_pipe
252
235
  VIRTUAL_CLUSTER = 'pipe_kafka_vc'
253
236
  BATCH_INTERVAL_IN_SECONDS = '60'
@@ -264,91 +247,51 @@ COPY INTO ods.kafka_orders FROM (
264
247
  FROM (
265
248
  SELECT `timestamp`, parse_json(value::string) AS j
266
249
  FROM read_kafka(
267
- 'kafka.example.com:9092', -- bootstrap_servers
268
- 'orders', -- topic
269
- '', -- reserved
270
- 'lakehouse_orders', -- group_id(正式消费组名)
271
- '', '', '', '', -- 位置参数留空,由 Pipe 自动管理
272
- 'raw', -- key_format
273
- 'raw', -- value_format
274
- 0, -- max_errors
250
+ 'kafka.example.com:9092',
251
+ 'orders',
252
+ '',
253
+ 'lakehouse_orders', -- production group_id
254
+ '', '', '', '', -- must be empty in Pipe
255
+ 'raw', 'raw', 0,
275
256
  MAP('kafka.security.protocol', 'PLAINTEXT')
276
257
  )
277
258
  )
278
259
  );
279
260
  ```
280
261
 
281
- > ⚠️ **Pipe READ_KAFKA 的关键区别**:
282
- > - 位置参数(starting_offsets 等)**必须留空**,由 Pipe 自动管理消费位点
283
- > - 不要设置 `kafka.auto.offset.reset`(由 Pipe 的 `RESET_KAFKA_GROUP_OFFSETS` 参数控制)
284
- > - group_id 使用正式名称(如 `lakehouse_orders`),Pipe 会持久化消费位点
262
+ > **Inside a Pipe**: positional offset params MUST be empty (Pipe manages offsets). Do NOT set `kafka.auto.offset.reset` in MAP — use `RESET_KAFKA_GROUP_OFFSETS` Pipe parameter instead.
285
263
 
286
- **关键参数说明:**
264
+ For full parameter reference, see `references/kafka-pipe-syntax.md`.
287
265
 
288
- | 参数 | 默认值 | 说明 |
289
- |------|--------|------|
290
- | `VIRTUAL_CLUSTER` | — | 必填,指定执行 Pipe 的计算集群 |
291
- | `BATCH_INTERVAL_IN_SECONDS` | 60 | 批处理间隔(秒),即数据新鲜度 |
292
- | `BATCH_SIZE_PER_KAFKA_PARTITION` | 500000 | 每个 Kafka 分区每批最大消息数 |
293
- | `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 30 | 出错时跳过批次的最大重试次数 |
294
- | `INITIAL_DELAY_IN_SECONDS` | 0 | 首个作业调度延迟 |
295
- | `RESET_KAFKA_GROUP_OFFSETS` | — | 可选,指定起始消费位点(仅创建时生效) |
296
-
297
- **RESET_KAFKA_GROUP_OFFSETS 可选值:**
298
-
299
- | 值 | 说明 |
300
- |----|------|
301
- | `'none'` | 无操作,使用 Kafka 的 `auto.offset.reset` 配置(默认 latest) |
302
- | `'valid'` | 检查当前位点是否过期,将过期分区重置到 earliest |
303
- | `'earliest'` | 重置到最早位点(消费全部历史数据) |
304
- | `'latest'` | 重置到最新位点(仅消费新数据) |
305
- | `'1737789688000'` | 重置到指定毫秒时间戳对应的位点 |
306
-
307
- > **注意**:Pipe 中的 read_kafka 位置参数(starting_offsets 等)必须留空,由 Pipe 自动管理消费位点。与独立使用 read_kafka 探查时不同。
308
-
309
- ### 步骤 6:验证 Pipe 运行状态
266
+ ### Step 6: Verify
310
267
 
311
268
  ```sql
312
- -- 查看 Pipe 详情
313
269
  DESC PIPE EXTENDED kafka_orders_pipe;
314
- -- 关键字段:pipe_execution_paused(是否暂停)、pipe_latency(延迟信息)
315
-
316
- -- 查看目标表数据
317
270
  SELECT COUNT(*) FROM ods.kafka_orders;
318
- SELECT * FROM ods.kafka_orders LIMIT 10;
319
-
320
- -- 查看加载历史(保留 7 天)
321
- SELECT * FROM load_history('ods.kafka_orders')
322
- ORDER BY last_load_time DESC
323
- LIMIT 20;
324
-
325
- -- 通过 query_tag 查看 Pipe 作业
271
+ SELECT * FROM load_history('ods.kafka_orders') ORDER BY last_load_time DESC LIMIT 10;
326
272
  SHOW JOBS WHERE query_tag = 'pipe.my_workspace.ods.kafka_orders_pipe';
327
273
  ```
328
274
 
329
275
  ---
330
276
 
331
- ## 路径二:Kafka 外部表 + Table Stream Pipe
277
+ ## Path Two: Kafka External Table + Table Stream Pipe
332
278
 
333
- 适合需要保留原始消息、或多个下游消费同一 Topic 的场景。
279
+ Use when: raw message retention needed, or multiple independent downstream consumers on the same topic.
334
280
 
335
- ### 步骤 1:创建 Kafka Storage Connection
281
+ ### Step 1: Create Kafka Storage Connection
336
282
 
337
283
  ```sql
338
284
  CREATE STORAGE CONNECTION IF NOT EXISTS kafka_conn
339
285
  TYPE KAFKA
340
286
  BOOTSTRAP_SERVERS = ['kafka.example.com:9092']
341
287
  SECURITY_PROTOCOL = 'PLAINTEXT';
342
-
343
- -- 删除 Connection(⚠️ 注意:用 DROP CONNECTION,不是 DROP STORAGE CONNECTION)
344
- DROP CONNECTION IF EXISTS kafka_conn;
345
288
  ```
346
289
 
347
- ### 步骤 2:创建 Kafka 外部表
290
+ > Drop with `DROP CONNECTION IF EXISTS kafka_conn` (not `DROP STORAGE CONNECTION`).
291
+
292
+ ### Step 2: Create Kafka External Table
348
293
 
349
294
  ```sql
350
- -- ⚠️ 必须显式指定列定义,不能省略
351
- -- ⚠️ offset 是保留字,必须用反引号转义
352
295
  CREATE EXTERNAL TABLE kafka_orders_ext (
353
296
  topic STRING,
354
297
  partition INT,
@@ -368,13 +311,11 @@ OPTIONS (
368
311
  CONNECTION kafka_conn;
369
312
  ```
370
313
 
371
- > **注意**:
372
- > - 列定义是**必须的**,不指定会报错 `failed to detect columns`
373
- > - `offset` `timestamp` 是保留字,定义和查询时都需要反引号转义
374
- > - 删除外部表用 `DROP TABLE`(❌ `DROP EXTERNAL TABLE` 会报语法错误)
375
- > - 删除 Connection 用 `DROP CONNECTION`(❌ `DROP STORAGE CONNECTION` 会报语法错误)
314
+ > - Column definitions are **required** (omitting causes `failed to detect columns`)
315
+ > - `offset` and `timestamp` are reserved words — always backtick-escape
316
+ > - Drop with `DROP TABLE` (not `DROP EXTERNAL TABLE`)
376
317
 
377
- ### 步骤 3:创建 Table Stream
318
+ ### Step 3: Create Table Stream
378
319
 
379
320
  ```sql
380
321
  CREATE TABLE STREAM kafka_orders_stream
@@ -382,99 +323,76 @@ CREATE TABLE STREAM kafka_orders_stream
382
323
  WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY');
383
324
  ```
384
325
 
385
- ### 步骤 4:创建目标表和 Pipe
326
+ ### Step 4: Create Target Table and Pipe
386
327
 
387
328
  ```sql
388
- -- 目标表
389
329
  CREATE TABLE IF NOT EXISTS ods.kafka_orders_from_ext (
390
- order_id STRING,
391
- user_id STRING,
392
- amount DECIMAL(10,2),
393
- kafka_ts TIMESTAMP
330
+ order_id STRING, user_id STRING, amount DECIMAL(10,2), kafka_ts TIMESTAMP
394
331
  );
395
332
 
396
- -- Pipe(从 Table Stream 消费)
397
- -- ⚠️ 注意:Table Stream Pipe 使用 INSERT INTO ... SELECT 语法,不是 COPY INTO
333
+ -- Table Stream Pipe uses INSERT INTO ... SELECT (not COPY INTO)
398
334
  CREATE PIPE kafka_ext_orders_pipe
399
335
  VIRTUAL_CLUSTER = 'pipe_kafka_vc'
400
336
  BATCH_INTERVAL_IN_SECONDS = '60'
401
337
  AS
402
338
  INSERT INTO ods.kafka_orders_from_ext
403
339
  SELECT
404
- GET_JSON_OBJECT(CAST(value AS STRING), '$.order_id') AS order_id,
405
- GET_JSON_OBJECT(CAST(value AS STRING), '$.user_id') AS user_id,
406
- CAST(GET_JSON_OBJECT(CAST(value AS STRING), '$.amount') AS DECIMAL(10,2)) AS amount,
407
- CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
408
- FROM kafka_orders_stream;
340
+ j['order_id']::STRING,
341
+ j['user_id']::STRING,
342
+ j['amount']::DECIMAL(10,2),
343
+ CAST(`timestamp` AS TIMESTAMP)
344
+ FROM (
345
+ SELECT `timestamp`, parse_json(CAST(value AS STRING)) AS j
346
+ FROM kafka_orders_stream
347
+ );
409
348
  ```
410
349
 
411
- > **清理外部表**:使用 `DROP TABLE kafka_orders_ext`(不是 `DROP EXTERNAL TABLE`)
350
+ > **Note**: `GET_JSON_OBJECT(str, '$.path')` also works but `parse_json(str)['field']::TYPE` is preferred — it's more composable for nested structures and consistent with Path One.
412
351
 
413
352
  ---
414
353
 
415
- ## 监控与运维
354
+ ## Monitoring & Operations
416
355
 
417
- ### 查看 Kafka 消费延迟
356
+ ### Check Pipe Status and Lag
418
357
 
419
358
  ```sql
420
359
  DESC PIPE EXTENDED kafka_orders_pipe;
421
360
  ```
422
361
 
423
- 关键字段 `pipe_latency`(JSON 格式):
424
- - `lastConsumeTimestamp`:上一次消费的位点时间
425
- - `offsetLag`:Kafka 数据堆积量
426
- - `timeLag`:消费延迟(毫秒),当前时间减去上一次消费位点。异常时为 -1
362
+ Key field `pipe_latency` (JSON):
363
+ - `lastConsumeTimestamp` — last consumed offset time
364
+ - `offsetLag` — message backlog count
365
+ - `timeLag` — consumer lag in ms (shows -1 when abnormal)
427
366
 
428
- > 当数据新鲜度为 60 秒且算力冗余一倍时,`timeLag` 应在 0~90 秒之间波动。持续上涨说明 Pipe 积压。
367
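+ > Normal: `timeLag` fluctuates between roughly 0 and 90 seconds (0–90000 in the field, which is reported in ms) with a 60 s batch interval and 2x compute headroom. A continuously increasing value means the Pipe is backlogged.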
429
368
 
430
- ### 端到端延迟监控(需要 `__kafka_timestamp__` 字段)
369
+ ### End-to-End Latency (requires `__kafka_timestamp__`)
431
370
 
432
371
  ```sql
433
- -- 查看最近 1 小时的端到端延迟
434
372
  SELECT
435
- MAX(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS max_delay_seconds,
436
- AVG(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS avg_delay_seconds
373
+ MAX(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS max_delay_s,
374
+ AVG(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS avg_delay_s
437
375
  FROM ods.kafka_orders
438
376
  WHERE __kafka_timestamp__ >= CURRENT_TIMESTAMP() - INTERVAL 1 HOUR;
439
377
  ```
440
378
 
441
- ### 暂停 / 恢复 Pipe
379
+ ### Pause / Resume
442
380
 
443
381
  ```sql
444
- -- 暂停
445
- ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = true;
446
-
447
- -- 恢复
448
- ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = false;
382
+ ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = true; -- pause
383
+ ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = false; -- resume
449
384
  ```
450
385
 
451
- ### 修改 Pipe 属性
452
-
453
- ```sql
454
- -- 修改 VCluster
455
- ALTER PIPE kafka_orders_pipe SET VIRTUAL_CLUSTER = 'new_vc';
456
-
457
- -- 修改 COPY_JOB_HINT
458
- ALTER PIPE kafka_orders_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
459
- ```
386
+ ### Modify Pipe Properties
460
387
 
461
- > ⚠️ **ALTER PIPE 支持的属性**(经验证):
462
- > - ✅ `PIPE_EXECUTION_PAUSED`
463
- > - ✅ `VIRTUAL_CLUSTER`
464
- > - ✅ `COPY_JOB_HINT`
465
- > - ❌ `BATCH_INTERVAL_IN_SECONDS`(不支持修改,需删除重建 Pipe)
466
- > - ❌ `BATCH_SIZE_PER_KAFKA_PARTITION`(不支持修改,需删除重建 Pipe)
467
- >
468
- > 每次 ALTER 只能修改一个属性。不支持修改 COPY/INSERT 语句逻辑,需删除重建。
388
+ Only `PIPE_EXECUTION_PAUSED`, `VIRTUAL_CLUSTER`, and `COPY_JOB_HINT` are alterable (one per ALTER call). Everything else — including `BATCH_INTERVAL_IN_SECONDS`, `BATCH_SIZE_PER_KAFKA_PARTITION`, and SELECT logic — requires drop + recreate. See `references/kafka-pipe-syntax.md` § ALTER PIPE for the full support matrix.
469
389
 
470
- ### 修改 Pipe SQL 逻辑(需删除重建)
390
+ ### Modify Pipe SQL Logic (Drop + Recreate)
471
391
 
472
392
  ```sql
473
- -- 1. 删除当前 Pipe
474
393
  DROP PIPE kafka_orders_pipe;
475
394
 
476
- -- 2. 重建 Pipe(不要设置 RESET_KAFKA_GROUP_OFFSETS,保持从上次位点继续)
477
- -- ⚠️ 注意:ClickZetta 不支持 CREATE OR REPLACE PIPE,使用 CREATE PIPE
395
+ -- Recreate with same group_id, do NOT set RESET_KAFKA_GROUP_OFFSETS → continues from last offset
478
396
  CREATE PIPE kafka_orders_pipe
479
397
  VIRTUAL_CLUSTER = 'pipe_kafka_vc'
480
398
  BATCH_INTERVAL_IN_SECONDS = '60'
@@ -484,286 +402,134 @@ COPY INTO ods.kafka_orders FROM (
484
402
  j['order_id']::STRING,
485
403
  j['user_id']::STRING,
486
404
  j['amount']::DECIMAL(10,2),
487
- UPPER(j['status']::STRING), -- 修改了转换逻辑
405
+ UPPER(j['status']::STRING), -- changed logic
488
406
  j['created_at']::TIMESTAMP,
489
407
  CAST(`timestamp` AS TIMESTAMP) AS __kafka_timestamp__
490
408
  FROM (
491
409
  SELECT `timestamp`, parse_json(value::string) AS j
492
- FROM read_kafka(
493
- 'kafka.example.com:9092',
494
- 'orders',
495
- '',
496
- 'lakehouse_orders', -- 保持相同 group_id
497
- '', '', '', '',
498
- 'raw', 'raw', 0,
499
- MAP('kafka.security.protocol', 'PLAINTEXT')
500
- )
410
+ FROM read_kafka('kafka.example.com:9092','orders','','lakehouse_orders',
411
+ '','','','','raw','raw',0,MAP('kafka.security.protocol','PLAINTEXT'))
501
412
  )
502
413
  );
503
414
  ```
504
415
 
505
- > **关键**:重建时保持相同的 `group_id`,且不设置 `RESET_KAFKA_GROUP_OFFSETS`,Pipe 会从上次消费位点继续。
506
-
507
416
  ---
508
417
 
509
- ## 生产调优
510
-
511
- ### 判断是否积压
418
+ ## Production Tuning
512
419
 
513
- 多次执行 `DESC PIPE EXTENDED` 查看 `pipe_latency` 中的 `timeLag`:
514
- - 在 0~90 秒波动 → 正常(60 秒新鲜度 + 一倍冗余)
515
- - 持续上涨 → 积压,需调优
420
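+ Run `DESC PIPE EXTENDED` several times and watch `timeLag` in `pipe_latency`: fluctuating between roughly 0 and 90 seconds is normal (60-second freshness plus one interval of slack); if it keeps increasing, the Pipe is backlogged and needs tuning.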
516
421
 
517
- ### 调优参数
422
+ | Problem | Fix |
423
+ |---------|-----|
424
+ | Batch can't consume a full interval's data | Increase `BATCH_SIZE_PER_KAFKA_PARTITION` (drop + recreate, e.g., `'1000000'`) |
425
+ | Job needs multiple rounds | Increase VCluster size so cores ≥ partitions: `ALTER VCLUSTER ... SET VCLUSTER_SIZE = 16` |
426
+ | Few partitions, large volume | Split tasks by message count instead of one task per partition: `ALTER PIPE ... SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}'` |
518
427
 
519
- | 问题 | 调优方向 | 操作 |
520
- |------|---------|------|
521
- | 每批读取不完一个周期的数据 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION` | 删除重建 Pipe 时设置更大的值(如 `BATCH_SIZE_PER_KAFKA_PARTITION = '1000000'`) |
522
- | 作业需要多轮才能完成 | 增大 VCluster 规格(使 core 数 ≥ partition 数) | `ALTER VCLUSTER ... SET VCLUSTER_SIZE = 16` |
523
- | partition 少但数据量大 | 按条数切分 task | `ALTER PIPE ... SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}'` |
524
-
525
- ### COPY_JOB_HINT 参数
526
-
527
- | Key | 默认值 | 说明 |
528
- |-----|--------|------|
529
- | `cz.sql.split.kafka.strategy` | `simple` | `simple`=每 partition 一个 task;`size`=按条数切分 |
530
- | `cz.mapper.kafka.message.size` | `1000000` | 当 strategy=size 时,每个 task 处理的消息条数 |
428
+ > **VCluster size-to-core mapping** (GENERAL type, 1 CRU = 8 cores):
429
+ > | VCLUSTER_SIZE (CRU) | Cores | Suitable for |
430
+ > |---------------------|-------|--------------|
431
+ > | 4 | 32 | 32 partitions, moderate throughput |
432
+ > | 8 | 64 | 64 partitions, high throughput |
433
+ > | 16 | 128 | Large-scale ingestion |
434
+ > | 32 | 256 | Very high partition count / throughput |
435
+ >
436
+ > Rule of thumb: set cores ≥ Kafka partition count so each partition gets a dedicated task slot.
531
437
 
532
- > ⚠️ **格式要求**:`COPY_JOB_HINT` 必须是合法 JSON,键值都要用双引号包围:
533
- > ```sql
534
- > -- ✅ 正确
535
- > ALTER PIPE my_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
536
- > -- ❌ 错误(非 JSON 格式)
537
- > ALTER PIPE my_pipe SET COPY_JOB_HINT = 'cz.sql.split.kafka.strategy=size';
538
- > ```
539
- > 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
438
+ > `COPY_JOB_HINT` must be valid JSON with double-quoted keys/values. Setting it overwrites all previous hints.
540
439
 
541
440
  ---
542
441
 
543
- ## 典型场景
442
+ ## Schema Evolution
544
443
 
545
- ### 场景 A:简单 JSON Topic 接入
546
-
547
- ```sql
548
- -- 1. 探查
549
- SELECT parse_json(value::string)['id']::STRING, parse_json(value::string)['name']::STRING
550
- FROM read_kafka(
551
- 'kafka:9092', 'metrics', '', 'test',
552
- '', '', '', '', 'raw', 'raw', 0,
553
- MAP('kafka.security.protocol', 'PLAINTEXT', 'kafka.auto.offset.reset', 'earliest')
554
- ) LIMIT 5;
555
-
556
- -- 2. 建表
557
- CREATE TABLE ods.metrics (id STRING, name STRING, value DOUBLE, kafka_ts TIMESTAMP);
558
-
559
- -- 3. 建 Pipe
560
- CREATE PIPE metrics_pipe
561
- VIRTUAL_CLUSTER = 'pipe_vc'
562
- BATCH_INTERVAL_IN_SECONDS = '60'
563
- AS
564
- COPY INTO ods.metrics FROM (
565
- SELECT
566
- j['id']::STRING, j['name']::STRING, j['value']::DOUBLE,
567
- CAST(`timestamp` AS TIMESTAMP)
568
- FROM (
569
- SELECT `timestamp`, parse_json(value::string) AS j
570
- FROM read_kafka(
571
- 'kafka:9092', 'metrics', '', 'cz_metrics',
572
- '', '', '', '', 'raw', 'raw', 0,
573
- MAP('kafka.security.protocol', 'PLAINTEXT')
574
- )
575
- )
576
- );
577
- ```
578
-
579
- ### 场景 B:Kafka → ODS → DWD 实时 ETL
580
-
581
- ```sql
582
- -- 1. Pipe 接入 ODS 层
583
- CREATE PIPE kafka_events_pipe
584
- VIRTUAL_CLUSTER = 'pipe_vc'
585
- BATCH_INTERVAL_IN_SECONDS = '60'
586
- AS
587
- COPY INTO ods.events FROM (
588
- SELECT
589
- j['event_id']::STRING, j['user_id']::STRING, j['action']::STRING, j['ts']::TIMESTAMP
590
- FROM (
591
- SELECT parse_json(value::string) AS j
592
- FROM read_kafka(
593
- 'kafka:9092', 'user_events', '', 'cz_events',
594
- '', '', '', '', 'raw', 'raw', 0,
595
- MAP('kafka.security.protocol', 'PLAINTEXT')
596
- )
597
- )
598
- );
599
-
600
- -- 2. Dynamic Table 清洗到 DWD 层
601
- -- ⚠️ 注意:Dynamic Table 支持 CREATE OR REPLACE,与 Pipe 不同
602
- CREATE OR REPLACE DYNAMIC TABLE dwd.events_clean
603
- REFRESH INTERVAL 1 MINUTE vcluster default
604
- AS
605
- SELECT event_id, user_id, UPPER(action) AS action, ts, DATE(ts) AS dt
606
- FROM ods.events
607
- WHERE event_id IS NOT NULL AND action IS NOT NULL;
608
-
609
- -- 3. Dynamic Table 聚合到 DWS 层
610
- CREATE OR REPLACE DYNAMIC TABLE dws.events_hourly
611
- REFRESH INTERVAL 5 MINUTE vcluster default
612
- AS
613
- SELECT DATE_TRUNC('hour', ts) AS hour, action, COUNT(*) AS cnt, COUNT(DISTINCT user_id) AS uv
614
- FROM dwd.events_clean
615
- GROUP BY 1, 2;
616
- ```
444
+ When the Kafka topic adds new fields:
617
445
 
618
- ### 场景 C:SASL 认证 + 指定时间点消费
446
+ 1. **Add columns** to the target table:
447
+ ```sql
448
+ ALTER TABLE ods.kafka_orders ADD COLUMN new_field STRING;
449
+ ```
619
450
 
620
- ```sql
621
- CREATE PIPE kafka_auth_pipe
622
- VIRTUAL_CLUSTER = 'pipe_vc'
623
- BATCH_INTERVAL_IN_SECONDS = '60'
624
- RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
625
- AS
626
- COPY INTO ods.secure_events FROM (
627
- SELECT
628
- j['id']::STRING AS event_id,
629
- j['payload']::STRING AS payload,
630
- CAST(`timestamp` AS TIMESTAMP) AS kafka_ts
631
- FROM (
632
- SELECT `timestamp`, parse_json(value::string) AS j
633
- FROM read_kafka(
634
- 'kafka.example.com:9092',
635
- 'secure_events',
636
- '',
637
- 'cz_secure',
638
- '', '', '', '',
639
- 'raw', 'raw', 0,
640
- MAP(
641
- 'kafka.security.protocol', 'SASL_PLAINTEXT',
642
- 'kafka.sasl.mechanism', 'PLAIN',
643
- 'kafka.sasl.username', 'my_user',
644
- 'kafka.sasl.password', 'my_password'
645
- )
646
- )
647
- )
648
- );
649
- ```
451
+ 2. **Drop and recreate Pipe** with updated SELECT (keep same `group_id`, omit `RESET_KAFKA_GROUP_OFFSETS`):
452
+ ```sql
453
+ DROP PIPE kafka_orders_pipe;
454
+ CREATE PIPE kafka_orders_pipe ... -- add j['new_field']::STRING to SELECT
455
+ ```
650
456
 
651
- ---
457
+ 3. Existing rows will have `NULL` in the new column. New messages will populate it.
652
458
 
653
- ## 故障排除
654
-
655
- | 问题 | 排查方向 |
656
- |------|---------|
657
- | READ_KAFKA 语法报错 `Syntax error at or near '('` | ❌ 不要用 `TABLE(READ_KAFKA(...))` 或 `=>` 命名参数。✅ 正确:`FROM read_kafka('broker', 'topic', '', 'group', '', '', '', '', 'raw', 'raw', 0, MAP(...))` |
658
- | READ_KAFKA 报错 `cannot resolve column` | 使用了 `=` 赋值语法(如 `KAFKA_BROKER = 'xxx'`)。READ_KAFKA 只支持位置参数 |
659
- | READ_KAFKA 探查无数据 | 检查 broker 地址/端口、topic 名称、网络连通性;在 MAP 中设置 `'kafka.auto.offset.reset', 'earliest'` |
660
- | Pipe 创建后无数据加载 | `DESC PIPE EXTENDED` 检查是否暂停;确认 group_id 的消费位点(默认 latest,新数据才会消费) |
661
- | Table Stream Pipe 语法报错 `Syntax error at or near 'SELECT'` | ❌ 不要用 `COPY INTO ... SELECT`。✅ 正确:`INSERT INTO ... SELECT FROM stream` |
662
- | `CREATE OR REPLACE PIPE` 报错 AlreadyExist | ❌ ClickZetta 不支持 `CREATE OR REPLACE PIPE`。Pipe 不存在时 `CREATE OR REPLACE` 会创建成功,但 Pipe 已存在时报 AlreadyExist 错误。✅ 正确:用 `DROP PIPE` + `CREATE PIPE` 重建(与 Dynamic Table 不同,DT 支持 `CREATE OR REPLACE`) |
663
- | JSON 解析报错 | 使用 `parse_json(value::string)['field']::TYPE` 语法;嵌套 JSON 需逐层 `parse_json()` 展开 |
664
- | SASL 认证失败 | 确认安全协议为 SASL_PLAINTEXT(不支持 SSL);在 MAP 中设置 `kafka.sasl.mechanism`、`kafka.sasl.username`、`kafka.sasl.password` |
665
- | 消费延迟持续增大 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION`;增大 VCluster 规格;使用 `COPY_JOB_HINT` 切分 task |
666
- | 重建 Pipe 后数据重复 | 保持相同 group_id 且不设置 `RESET_KAFKA_GROUP_OFFSETS` |
667
- | 重建 Pipe 后数据丢失 | 检查 group_id 的位点是否过期;如需回溯用 `RESET_KAFKA_GROUP_OFFSETS` 指定时间戳 |
668
- | `COPY_JOB_HINT` 修改后参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数 |
669
- | Pipe 作业 Failover | 查看作业详情;通常为 Kafka 连接中断或 Lakehouse 服务升级,会自动恢复 |
459
+ > There is no ALTER PIPE to change the SELECT — always drop + recreate. Keep the same `group_id` to avoid reprocessing.
670
460
 
671
461
  ---
672
462
 
673
- ## 参考文档
463
+ ## Error Recovery Playbook
674
464
 
675
- - [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
676
- - [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
677
- - [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
678
- - [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
679
- - [read_kafka 函数](https://www.yunqi.tech/documents/read_kafka)
680
- - [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
681
- - [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
682
- - [PIPE 导入语法](https://www.yunqi.tech/documents/pipe-syntax)
465
+ | Scenario | Recovery |
466
+ |----------|----------|
467
+ | **Kafka broker failover** | Pipe auto-retries. If stuck > 5 min, pause then resume: `ALTER PIPE ... SET PIPE_EXECUTION_PAUSED = true` then `false` |
468
+ | **Consumer group offset expired** (data loss on resume) | Recreate Pipe with `RESET_KAFKA_GROUP_OFFSETS = '<epoch_millis>'` to replay from a known timestamp |
469
+ | **Pipe job keeps failing** (bad message) | Check `MAX_SKIP_BATCH_COUNT_ON_ERROR` (default 30). If exceeded, Pipe pauses. Fix data or increase skip count via drop + recreate |
470
+ | **Duplicate data after recreate** | Usually caused by setting `RESET_KAFKA_GROUP_OFFSETS` unnecessarily. Keep the same `group_id` and omit `RESET_KAFKA_GROUP_OFFSETS` so the Pipe continues from the last committed offset |
471
+ | **Target table schema mismatch** | Pipe will fail if SELECT output doesn't match table columns. ALTER TABLE + recreate Pipe |
472
+ | **Lakehouse service upgrade** | Pipe jobs may failover temporarily. Auto-recovers. No action needed |
473
+ | **VCluster suspended** | Set `AUTO_SUSPEND_IN_SECOND = 0` for Pipe VClusters, or resume manually: `ALTER VCLUSTER ... RESUME` |
683
474
 
684
475
  ---
685
476
 
686
- ## cz-cli 执行路径
687
-
477
+ ## Troubleshooting
688
478
 
689
- ### 路径一:READ_KAFKA Pipe(cz-cli 版)
479
+ | Error | Cause & Fix |
480
+ |-------|-------------|
481
+ | `Syntax error at or near '('` | Using `TABLE(READ_KAFKA(...))` or `=>` named params. Use positional: `FROM read_kafka(...)` |
482
+ | `cannot resolve column` | Using `=` assignment (e.g., `KAFKA_BROKER = 'x'`). READ_KAFKA is positional only |
483
+ | No data from exploration | Wrong broker/port/topic, or offset is `latest`. Add `'kafka.auto.offset.reset','earliest'` to MAP |
484
+ | Pipe created, no data loading | Check `DESC PIPE EXTENDED` — may be paused, or group offset is at latest with no new messages |
485
+ | `Syntax error at or near 'SELECT'` (Table Stream Pipe) | Using `COPY INTO ... SELECT`. Table Stream Pipe must use `INSERT INTO ... SELECT` |
486
+ | `AlreadyExist` on CREATE OR REPLACE PIPE | Not supported. Use `DROP PIPE` + `CREATE PIPE` |
487
+ | SASL auth failure | Confirm the security protocol is `SASL_PLAINTEXT` (SSL is not supported). Check `kafka.sasl.mechanism`, `kafka.sasl.username`, and `kafka.sasl.password` in MAP |
488
+ | `COPY_JOB_HINT` params lost | SET overwrites all hints. Include all keys in one JSON string |
690
489
 
691
- #### 步骤 1-2:验证 Kafka 连接和探查数据结构
490
+ ---
692
491
 
693
- ```bash
694
- cz-cli agent run "验证 Kafka 连接并探查数据结构:broker 地址 <kafka-host:9092>,topic <topic-name>,消费组 test_explore,从 earliest 开始读取 10 条消息,展示原始 JSON 内容和字段结构" \
695
- --format a2a --dangerously-skip-permissions
696
- ```
492
+ ## Execution via cz-cli
697
493
 
698
- #### 步骤 3:创建目标表
494
+ All operations use `cz-cli sql --sync`. Examples:
699
495
 
700
496
  ```bash
701
- cz-cli agent run "在 schema <my_schema> 下创建目标表 <table_name>,字段包括:<field1> <type1>, <field2> <type2>,以及 __kafka_timestamp__ TIMESTAMP 字段用于延迟监控" \
702
- --format a2a --dangerously-skip-permissions
703
- ```
497
+ # Explore topic
498
+ cz-cli sql "SELECT value::string FROM read_kafka('broker:9092','topic','','test','','','','','raw','raw',0,MAP('kafka.security.protocol','PLAINTEXT','kafka.auto.offset.reset','earliest')) LIMIT 5" --sync
704
499
 
705
- #### 步骤 4:创建专用 VCluster(可选)
500
+ # Create table
501
+ cz-cli sql "CREATE TABLE IF NOT EXISTS ods.my_table (id STRING, ts TIMESTAMP)" --sync
706
502
 
707
- ```bash
708
- cz-cli agent run "创建名为 pipe_kafka_vc GENERAL 类型 VCluster,大小 4,AUTO_SUSPEND_IN_SECOND 设为 0(常驻运行),用于 Kafka Pipe 专用" \
709
- --format a2a --dangerously-skip-permissions
710
- ```
503
+ # Create Pipe
504
+ cz-cli sql "CREATE PIPE my_pipe VIRTUAL_CLUSTER='pipe_vc' BATCH_INTERVAL_IN_SECONDS='60' AS COPY INTO ods.my_table FROM (SELECT j['id']::STRING, CAST(\`timestamp\` AS TIMESTAMP) FROM (SELECT \`timestamp\`, parse_json(value::string) AS j FROM read_kafka('broker:9092','topic','','cz_group','','','','','raw','raw',0,MAP('kafka.security.protocol','PLAINTEXT'))))" --sync
711
505
 
712
- #### 步骤 5:创建 Kafka Pipe
506
+ # Check status
507
+ cz-cli sql "DESC PIPE EXTENDED my_pipe" --sync
713
508
 
714
- ```bash
715
- cz-cli agent run "创建 Kafka Pipe,名称 <pipe_name>,使用 VCluster pipe_kafka_vc,BATCH_INTERVAL_IN_SECONDS=60,从 Kafka broker <host:port> 的 topic <topic> 消费数据(消费组 <group_id>,JSON 格式),将字段 <field1>, <field2> 写入目标表 <schema>.<table>" \
716
- --format a2a --dangerously-skip-permissions
717
- ```
509
+ # Pause
510
+ cz-cli sql "ALTER PIPE my_pipe SET PIPE_EXECUTION_PAUSED = true" --sync
718
511
 
719
- #### 步骤 6:验证 Pipe 运行状态
512
+ # Resume
513
+ cz-cli sql "ALTER PIPE my_pipe SET PIPE_EXECUTION_PAUSED = false" --sync
720
514
 
721
- ```bash
722
- cz-cli agent run "查看 Pipe <pipe_name> 的详细状态,包括是否暂停、延迟信息,以及目标表 <schema>.<table> 的数据量和最近加载历史" \
723
- --format a2a --dangerously-skip-permissions
515
+ # Drop and recreate (to change logic)
516
+ cz-cli sql "DROP PIPE my_pipe" --sync
517
+ cz-cli sql "CREATE PIPE my_pipe ..." --sync
724
518
  ```
725
519
 
726
- ---
727
-
728
- ### 路径二:Kafka 外部表 + Table Stream Pipe(cz-cli 版)
729
-
730
- #### 步骤 1-4:完整创建流程
731
-
732
- ```bash
733
- # 步骤 1:创建 Kafka Storage Connection
734
- cz-cli agent run "创建 Kafka Storage Connection,名称 kafka_conn,bootstrap servers 为 <kafka-host:9092>,安全协议 PLAINTEXT" \
735
- --format a2a --dangerously-skip-permissions
736
-
737
- # 步骤 2:创建 Kafka 外部表
738
- cz-cli agent run "创建 Kafka 外部表 kafka_<topic>_ext,使用 Connection kafka_conn,消费组 lakehouse_ext_<topic>,topic 为 <topic>,从 earliest 开始" \
739
- --format a2a --dangerously-skip-permissions
740
-
741
- # 步骤 3:创建 Table Stream
742
- cz-cli agent run "在 Kafka 外部表 kafka_<topic>_ext 上创建 APPEND_ONLY 模式的 Table Stream,名称 kafka_<topic>_stream" \
743
- --format a2a --dangerously-skip-permissions
744
-
745
- # 步骤 4:创建目标表和 Pipe
746
- cz-cli agent run "创建目标表 <schema>.<target_table>,然后创建 Pipe kafka_ext_<topic>_pipe,使用 VCluster pipe_kafka_vc,BATCH_INTERVAL_IN_SECONDS=60,从 Table Stream kafka_<topic>_stream 消费数据,解析 JSON value 字段写入目标表" \
747
- --format a2a --dangerously-skip-permissions
748
- ```
520
+ > For multi-statement workflows, chain `cz-cli sql` calls in a shell script. Each statement must be a separate invocation.
749
521
 
750
522
  ---
751
523
 
752
- ### 监控与运维(cz-cli 版)
524
+ ## Reference Documentation
753
525
 
754
- ```bash
755
- # 查看 Pipe 延迟状态
756
- cz-cli agent run "查看 Pipe <pipe_name> 的延迟信息,包括 timeLag 和 offsetLag,判断是否有积压" \
757
- --format a2a --dangerously-skip-permissions
758
-
759
- # 暂停/恢复 Pipe
760
- cz-cli agent run "暂停 Pipe <pipe_name>" \
761
- --format a2a --dangerously-skip-permissions
762
-
763
- cz-cli agent run "恢复 Pipe <pipe_name>" \
764
- --format a2a --dangerously-skip-permissions
526
+ - [Pipe Overview](https://www.yunqi.tech/documents/pipe-summary)
527
+ - [Continuous Import with read_kafka](https://www.yunqi.tech/documents/pipe-kafka)
528
+ - [Kafka External Table + Table Stream](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
529
+ - [Best Practice: Kafka Pipe Tuning](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
530
+ - [read_kafka Function](https://www.yunqi.tech/documents/read_kafka)
531
+ - [Kafka External Table](https://www.yunqi.tech/documents/kafka-external-table)
532
+ - [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
533
+ - [PIPE Syntax](https://www.yunqi.tech/documents/pipe-syntax)
765
534
 
766
- # 修改 Pipe 属性
767
- cz-cli agent run "修改 Pipe <pipe_name> 的 BATCH_INTERVAL_IN_SECONDS 为 120" \
768
- --format a2a --dangerously-skip-permissions
769
- ```
535
+ > **Syntax details** (parameter tables, DDL templates, MAP options): see `references/kafka-pipe-syntax.md`