@clickzetta/cz-cli-linux-x64 0.3.2 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/bin/cz-cli +0 -0
  2. package/package.json +1 -1
  3. package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
  4. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
  5. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
  6. package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
  7. package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
  8. package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
  9. package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
  11. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
  12. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
  13. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
  14. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
  15. package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
  16. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
  17. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -450
  18. package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
  19. package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
  20. package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
  21. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
  22. package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
  23. package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
  24. package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
  25. package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
  26. package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
  27. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
  28. package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
  29. package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
  30. package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
  31. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
  32. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
  33. package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
  34. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
  35. package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
  36. package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
  37. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
  38. package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -86
  39. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
  40. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
  41. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
  42. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
  43. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
  44. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  45. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
  46. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
  47. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
  48. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
  49. package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
  50. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
  51. package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
  52. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
  53. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -117
  54. package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
  55. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
  56. package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
  57. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
  58. package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
  59. package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
  60. package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
  61. package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
  62. package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
  63. package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
  64. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
  65. package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
  66. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -531
  67. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -186
  68. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
  69. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
  70. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
  71. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
  72. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
  73. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
  74. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
  75. package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
  76. package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
  77. package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
  78. package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
  79. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
  80. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
  81. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -402
  82. package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
  83. package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
  84. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
  85. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
  86. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
  87. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
  88. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
  89. package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
  90. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
  91. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
  92. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
  93. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
  94. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -353
  95. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
  96. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -173
  97. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
  98. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -160
  99. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -123
  100. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
  101. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  102. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  103. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
  104. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  105. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  106. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
  107. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  108. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  109. package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
  110. package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
  111. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -155
  112. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
  113. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
  114. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
  115. package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -249
  116. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -194
  117. package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
  118. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
@@ -1,531 +0,0 @@
1
- ---
2
- name: clickzetta-kafka-ingest-pipeline
3
- description: |
4
- 搭建 ClickZetta Lakehouse Kafka 数据接入管道,覆盖从连接验证、数据探查、目标表创建
5
- 到 Pipe 持续导入的端到端工作流。支持两种接入路径:READ_KAFKA Pipe(推荐)和
6
- Kafka 外部表 + Table Stream Pipe。
7
- 当用户说"Kafka 接入"、"Kafka 导入"、"Kafka Pipe"、"read_kafka"、"Kafka 数据管道"、
8
- "Kafka 外部表"、"Kafka 消费"、"消息队列导入"、"Kafka 到 Lakehouse"、
9
- "Kafka 实时导入"、"Kafka 持续导入"、"Kafka topic 导入"、"Kafka JSON 解析"、
10
- "Kafka 延迟监控"、"Kafka 积压"时触发。
11
- 包含 READ_KAFKA 函数探查、JSON 多层嵌套解析、Kafka Pipe DDL、Kafka 外部表 + Table Stream、
12
- SASL 认证配置、生产调优(BATCH_SIZE / COPY_JOB_HINT / VCluster 规格)、
13
- 延迟监控(pipe_latency / query_tag)等 ClickZetta 特有逻辑。
14
- Keywords: Kafka, READ_KAFKA, Pipe, streaming ingestion, topic, consumer
15
- ---
16
-
17
- # Kafka 数据接入管道工作流
18
-
19
- ## 适用场景
20
-
21
- - 将 Kafka Topic 数据持续导入 ClickZetta Lakehouse 表
22
- - 需要近实时(分钟级)数据新鲜度
23
- - Kafka 消息格式为 JSON / CSV / Avro
24
- - 需要在导入前对 JSON 消息进行多层嵌套解析和转换
25
- - 关键词:Kafka Pipe、read_kafka、Kafka 外部表、消息队列导入、Kafka 持续导入
26
-
27
- ## 两种接入路径
28
-
29
- | 路径 | 适用场景 | 核心对象 |
30
- |------|---------|---------|
31
- | **READ_KAFKA Pipe**(推荐) | 通用场景,支持复杂 SQL 转换 | `CREATE PIPE ... AS INSERT INTO ... FROM TABLE(READ_KAFKA(...))` |
32
- | **Kafka 外部表 + Table Stream Pipe** | 需要先落原始数据再增量消费 | Kafka 外部表 → Table Stream → Pipe COPY INTO |
33
-
34
- **选择建议**:大多数场景用 READ_KAFKA Pipe 即可,更简洁高效。Kafka 外部表路径适合需要保留原始消息、多个下游消费同一 Topic 的场景。
35
-
36
- ## 前置依赖
37
-
38
- - ClickZetta Lakehouse 账户,具备创建 Pipe、表、VCluster 等权限
39
- - Kafka 集群网络可达(确认 bootstrap 地址和端口)
40
- - 已知 Kafka Topic 名称和消息格式
41
- - 认证信息(如需要):SASL 用户名/密码
42
-
43
- ## ⚠️ 关键注意事项
44
-
45
- - Kafka Pipe 仅支持 **PLAINTEXT** 和 **SASL_PLAINTEXT** 两种安全协议,不支持 SSL 证书方式
46
- - Pipe 创建后**自动启动**,无需手动 RESUME
47
- - Pipe 不支持修改 COPY/INSERT 语句逻辑(即 `AS` 之后的 SQL),需删除后重建
48
- - 建议为 Kafka Pipe 分配**专用 GP 集群**(通用型,`VCLUSTER_TYPE = GENERAL`),避免与其他查询争抢资源
49
- - `RESET_KAFKA_GROUP_OFFSETS` 仅在创建时生效,会强制改写消费位点,谨慎使用
50
-
51
- ---
52
-
53
- ## 路径一:READ_KAFKA Pipe(推荐)
54
-
55
- ### 步骤 1:验证 Kafka 连接和探查数据
56
-
57
- 先用 `READ_KAFKA` 函数验证网络连通性和消息格式:
58
-
59
- ```sql
60
- -- 无认证 Kafka
61
- SELECT *
62
- FROM TABLE(
63
- READ_KAFKA(
64
- KAFKA_BROKER => 'kafka.example.com:9092',
65
- KAFKA_TOPIC => 'orders',
66
- KAFKA_GROUP_ID => 'test_explore',
67
- KAFKA_OFFSET => 'earliest',
68
- KAFKA_DATA_FORMAT => 'json'
69
- )
70
- )
71
- LIMIT 10;
72
-
73
- -- SASL_PLAINTEXT 认证
74
- SELECT *
75
- FROM TABLE(
76
- READ_KAFKA(
77
- KAFKA_BROKER => 'kafka.example.com:9092',
78
- KAFKA_TOPIC => 'orders',
79
- KAFKA_GROUP_ID => 'test_explore',
80
- KAFKA_OFFSET => 'earliest',
81
- KAFKA_DATA_FORMAT => 'json',
82
- KAFKA_SASL_USERNAME => 'my_user',
83
- KAFKA_SASL_PASSWORD => 'my_password'
84
- )
85
- )
86
- LIMIT 10;
87
- ```
88
-
89
- > 探查用的 `KAFKA_GROUP_ID` 建议用临时名称(如 `test_explore`),避免影响正式消费组。
90
-
91
- ### 步骤 2:探查 JSON 结构并确定目标表 Schema
92
-
93
- Kafka 的 key 和 value 都是 binary 类型。用 `$1` 引用整行 JSON,用 `$1:field::TYPE` 提取字段:
94
-
95
- ```sql
96
- -- 将 value 转为字符串查看原始内容
97
- SELECT CAST(value AS STRING) AS raw_value
98
- FROM TABLE(
99
- READ_KAFKA(
100
- KAFKA_BROKER => 'kafka.example.com:9092',
101
- KAFKA_TOPIC => 'orders',
102
- KAFKA_GROUP_ID => 'test_schema',
103
- KAFKA_OFFSET => 'earliest',
104
- KAFKA_DATA_FORMAT => 'json'
105
- )
106
- )
107
- LIMIT 5;
108
-
109
- -- 提取 JSON 字段(单层)
110
- SELECT
111
- $1:order_id::STRING AS order_id,
112
- $1:user_id::STRING AS user_id,
113
- $1:amount::DECIMAL(10,2) AS amount,
114
- $1:status::STRING AS status,
115
- $1:created_at::TIMESTAMP AS created_at
116
- FROM TABLE(
117
- READ_KAFKA(
118
- KAFKA_BROKER => 'kafka.example.com:9092',
119
- KAFKA_TOPIC => 'orders',
120
- KAFKA_GROUP_ID => 'test_schema',
121
- KAFKA_OFFSET => 'earliest',
122
- KAFKA_DATA_FORMAT => 'json'
123
- )
124
- )
125
- LIMIT 5;
126
-
127
- -- 多层嵌套 JSON 解析(使用 PARSE_JSON 逐层展开)
128
- SELECT
129
- $1:id::STRING AS id,
130
- $1:type::STRING AS event_type,
131
- PARSE_JSON($1:event::STRING):action::STRING AS action,
132
- PARSE_JSON(PARSE_JSON($1:event::STRING):payload::STRING):ref::STRING AS ref
133
- FROM TABLE(
134
- READ_KAFKA(
135
- KAFKA_BROKER => 'kafka.example.com:9092',
136
- KAFKA_TOPIC => 'events',
137
- KAFKA_GROUP_ID => 'test_nested',
138
- KAFKA_OFFSET => 'earliest',
139
- KAFKA_DATA_FORMAT => 'json'
140
- )
141
- )
142
- LIMIT 5;
143
- ```
144
-
145
- > **最佳实践**:在 SELECT 中将所有嵌套 JSON 字符串都 `PARSE_JSON` 展开后再落表,避免下游查询重复计算。
146
-
147
- ### 步骤 3:创建目标表
148
-
149
- 根据探查结果创建目标表:
150
-
151
- ```sql
152
- CREATE TABLE IF NOT EXISTS ods.kafka_orders (
153
- order_id STRING,
154
- user_id STRING,
155
- amount DECIMAL(10,2),
156
- status STRING,
157
- created_at TIMESTAMP,
158
- __kafka_timestamp__ TIMESTAMP COMMENT 'Kafka 消息时间戳,用于端到端延迟监控'
159
- );
160
- ```
161
-
162
- > 建议额外添加 `__kafka_timestamp__` 字段记录 Kafka 消息时间戳,用于后续端到端延迟监控。
163
-
164
- ### 步骤 4:创建专用 VCluster(推荐)
165
-
166
- ```sql
167
- CREATE VCLUSTER IF NOT EXISTS pipe_kafka_vc
168
- VCLUSTER_TYPE = GENERAL
169
- VCLUSTER_SIZE = 4
170
- AUTO_SUSPEND_IN_SECOND = 0
171
- COMMENT 'Kafka Pipe 专用集群,常驻运行';
172
- ```
173
-
174
- > 数据新鲜度要求 1 分钟时,建议 VCluster 常驻(`AUTO_SUSPEND_IN_SECOND = 0`),避免冷启动延迟。
175
-
176
- ### 步骤 5:创建 Kafka Pipe
177
-
178
- ```sql
179
- CREATE OR REPLACE PIPE kafka_orders_pipe
180
- VIRTUAL_CLUSTER = pipe_kafka_vc
181
- BATCH_INTERVAL_IN_SECONDS = 60
182
- BATCH_SIZE_PER_KAFKA_PARTITION = 500000
183
- AS
184
- INSERT INTO ods.kafka_orders (order_id, user_id, amount, status, created_at, __kafka_timestamp__)
185
- SELECT
186
- $1:order_id::STRING,
187
- $1:user_id::STRING,
188
- $1:amount::DECIMAL(10,2),
189
- $1:status::STRING,
190
- $1:created_at::TIMESTAMP,
191
- CAST(timestamp AS TIMESTAMP)
192
- FROM TABLE(
193
- READ_KAFKA(
194
- KAFKA_BROKER => 'kafka.example.com:9092',
195
- KAFKA_TOPIC => 'orders',
196
- KAFKA_GROUP_ID => 'lakehouse_orders',
197
- KAFKA_DATA_FORMAT => 'json'
198
- )
199
- );
200
- ```
201
-
202
- **关键参数说明:**
203
-
204
- | 参数 | 默认值 | 说明 |
205
- |------|--------|------|
206
- | `VIRTUAL_CLUSTER` | — | 必填,指定执行 Pipe 的计算集群 |
207
- | `BATCH_INTERVAL_IN_SECONDS` | 60 | 批处理间隔(秒),即数据新鲜度 |
208
- | `BATCH_SIZE_PER_KAFKA_PARTITION` | 500000 | 每个 Kafka 分区每批最大消息数 |
209
- | `MAX_SKIP_BATCH_COUNT_ON_ERROR` | 30 | 出错时跳过批次的最大重试次数 |
210
- | `INITIAL_DELAY_IN_SECONDS` | 0 | 首个作业调度延迟 |
211
- | `RESET_KAFKA_GROUP_OFFSETS` | — | 可选,指定起始消费位点(仅创建时生效) |
212
-
213
- **RESET_KAFKA_GROUP_OFFSETS 可选值:**
214
-
215
- | 值 | 说明 |
216
- |----|------|
217
- | `'none'` | 无操作,使用 Kafka 的 `auto.offset.reset` 配置(默认 latest) |
218
- | `'valid'` | 检查当前位点是否过期,将过期分区重置到 earliest |
219
- | `'earliest'` | 重置到最早位点(消费全部历史数据) |
220
- | `'latest'` | 重置到最新位点(仅消费新数据) |
221
- | `'1737789688000'` | 重置到指定毫秒时间戳对应的位点 |
222
-
223
- > **注意**:Pipe 中的 READ_KAFKA 不要设置 `KAFKA_OFFSET` 参数(由 Pipe 自动管理消费位点),与独立使用 READ_KAFKA 探查时不同。
224
-
225
- ### 步骤 6:验证 Pipe 运行状态
226
-
227
- ```sql
228
- -- 查看 Pipe 详情
229
- DESC PIPE EXTENDED kafka_orders_pipe;
230
- -- 关键字段:pipe_execution_paused(是否暂停)、pipe_latency(延迟信息)
231
-
232
- -- 查看目标表数据
233
- SELECT COUNT(*) FROM ods.kafka_orders;
234
- SELECT * FROM ods.kafka_orders LIMIT 10;
235
-
236
- -- 查看加载历史(保留 7 天)
237
- SELECT * FROM TABLE(load_history('ods.kafka_orders'))
238
- ORDER BY last_load_time DESC
239
- LIMIT 20;
240
-
241
- -- 通过 query_tag 查看 Pipe 作业
242
- SHOW JOBS WHERE query_tag = 'pipe.my_workspace.ods.kafka_orders_pipe';
243
- ```
244
-
245
- ---
246
-
247
- ## 路径二:Kafka 外部表 + Table Stream Pipe
248
-
249
- 适合需要保留原始消息、或多个下游消费同一 Topic 的场景。
250
-
251
- ### 步骤 1:创建 Kafka Storage Connection
252
-
253
- ```sql
254
- CREATE STORAGE CONNECTION IF NOT EXISTS kafka_conn
255
- TYPE KAFKA
256
- BOOTSTRAP_SERVERS = ['kafka.example.com:9092']
257
- SECURITY_PROTOCOL = 'PLAINTEXT';
258
- ```
259
-
260
- ### 步骤 2:创建 Kafka 外部表
261
-
262
- ```sql
263
- CREATE EXTERNAL TABLE kafka_orders_ext
264
- USING KAFKA
265
- OPTIONS (
266
- 'group_id' = 'lakehouse_ext_orders',
267
- 'topics' = 'orders',
268
- 'starting_offset' = 'earliest'
269
- )
270
- CONNECTION kafka_conn;
271
- ```
272
-
273
- 外部表固定字段:`topic`、`partition`、`offset`、`timestamp`、`timestamp_type`、`headers`、`key`(BINARY)、`value`(BINARY)
274
-
275
- ### 步骤 3:创建 Table Stream
276
-
277
- ```sql
278
- CREATE TABLE STREAM kafka_orders_stream
279
- ON TABLE kafka_orders_ext
280
- WITH PROPERTIES ('TABLE_STREAM_MODE' = 'APPEND_ONLY');
281
- ```
282
-
283
- ### 步骤 4:创建目标表和 Pipe
284
-
285
- ```sql
286
- -- 目标表
287
- CREATE TABLE IF NOT EXISTS ods.kafka_orders_from_ext (
288
- order_id STRING,
289
- user_id STRING,
290
- amount DECIMAL(10,2),
291
- kafka_ts TIMESTAMP
292
- );
293
-
294
- -- Pipe(从 Table Stream 消费)
295
- CREATE PIPE kafka_ext_orders_pipe
296
- VIRTUAL_CLUSTER = pipe_kafka_vc
297
- BATCH_INTERVAL_IN_SECONDS = 60
298
- AS
299
- COPY INTO ods.kafka_orders_from_ext
300
- SELECT
301
- GET_JSON_OBJECT(CAST(value AS STRING), '$.order_id') AS order_id,
302
- GET_JSON_OBJECT(CAST(value AS STRING), '$.user_id') AS user_id,
303
- CAST(GET_JSON_OBJECT(CAST(value AS STRING), '$.amount') AS DECIMAL(10,2)) AS amount,
304
- CAST(timestamp AS TIMESTAMP) AS kafka_ts
305
- FROM kafka_orders_stream;
306
- ```
307
-
308
- ---
309
-
310
- ## 监控与运维
311
-
312
- ### 查看 Kafka 消费延迟
313
-
314
- ```sql
315
- DESC PIPE EXTENDED kafka_orders_pipe;
316
- ```
317
-
318
- 关键字段 `pipe_latency`(JSON 格式):
319
- - `lastConsumeTimestamp`:上一次消费的位点时间
320
- - `offsetLag`:Kafka 数据堆积量
321
- - `timeLag`:消费延迟(毫秒),即当前时间减去上一次消费位点的时间(`lastConsumeTimestamp`)。异常时为 -1
322
-
323
- > 当数据新鲜度为 60 秒且算力冗余一倍时,`timeLag` 应在 0~90 秒(字段单位为毫秒,即 0~90000)之间波动。持续上涨说明 Pipe 积压。
324
-
325
- ### 端到端延迟监控(需要 `__kafka_timestamp__` 字段)
326
-
327
- ```sql
328
- -- 查看最近 1 小时的端到端延迟
329
- SELECT
330
- MAX(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS max_delay_seconds,
331
- AVG(DATEDIFF('second', __kafka_timestamp__, CURRENT_TIMESTAMP())) AS avg_delay_seconds
332
- FROM ods.kafka_orders
333
- WHERE __kafka_timestamp__ >= CURRENT_TIMESTAMP() - INTERVAL 1 HOUR;
334
- ```
335
-
336
- ### 暂停 / 恢复 Pipe
337
-
338
- ```sql
339
- -- 暂停
340
- ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = true;
341
-
342
- -- 恢复
343
- ALTER PIPE kafka_orders_pipe SET PIPE_EXECUTION_PAUSED = false;
344
- ```
345
-
346
- ### 修改 Pipe 属性
347
-
348
- ```sql
349
- -- 修改批处理间隔
350
- ALTER PIPE kafka_orders_pipe SET BATCH_INTERVAL_IN_SECONDS = 120;
351
-
352
- -- 修改每分区批大小
353
- ALTER PIPE kafka_orders_pipe SET BATCH_SIZE_PER_KAFKA_PARTITION = 1000000;
354
-
355
- -- 修改 VCluster
356
- ALTER PIPE kafka_orders_pipe SET VIRTUAL_CLUSTER = 'new_vc';
357
- ```
358
-
359
- > 每次 ALTER 只能修改一个属性。不支持修改 COPY/INSERT 语句逻辑,需删除重建。
360
-
361
- ### 修改 Pipe SQL 逻辑(需删除重建)
362
-
363
- ```sql
364
- -- 1. 删除当前 Pipe
365
- DROP PIPE kafka_orders_pipe;
366
-
367
- -- 2. 重建 Pipe(不要设置 RESET_KAFKA_GROUP_OFFSETS,保持从上次位点继续)
368
- CREATE PIPE kafka_orders_pipe
369
- VIRTUAL_CLUSTER = pipe_kafka_vc
370
- BATCH_INTERVAL_IN_SECONDS = 60
371
- AS
372
- INSERT INTO ods.kafka_orders (order_id, user_id, amount, status, created_at, __kafka_timestamp__)
373
- SELECT
374
- $1:order_id::STRING,
375
- $1:user_id::STRING,
376
- $1:amount::DECIMAL(10,2),
377
- UPPER($1:status::STRING), -- 修改了转换逻辑
378
- $1:created_at::TIMESTAMP,
379
- CAST(timestamp AS TIMESTAMP)
380
- FROM TABLE(
381
- READ_KAFKA(
382
- KAFKA_BROKER => 'kafka.example.com:9092',
383
- KAFKA_TOPIC => 'orders',
384
- KAFKA_GROUP_ID => 'lakehouse_orders', -- 保持相同 group_id
385
- KAFKA_DATA_FORMAT => 'json'
386
- )
387
- );
388
- ```
389
-
390
- > **关键**:重建时保持相同的 `KAFKA_GROUP_ID`,且不设置 `RESET_KAFKA_GROUP_OFFSETS`,Pipe 会从上次消费位点继续。
391
-
392
- ---
393
-
394
- ## 生产调优
395
-
396
- ### 判断是否积压
397
-
398
- 多次执行 `DESC PIPE EXTENDED` 查看 `pipe_latency` 中的 `timeLag`:
399
- - 在 0~90 秒(`timeLag` 单位为毫秒,即 0~90000)波动 → 正常(60 秒新鲜度 + 一倍冗余)
400
- - 持续上涨 → 积压,需调优
401
-
402
- ### 调优参数
403
-
404
- | 问题 | 调优方向 | 操作 |
405
- |------|---------|------|
406
- | 每批读取不完一个周期的数据 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION` | `ALTER PIPE ... SET BATCH_SIZE_PER_KAFKA_PARTITION = 1000000` |
407
- | 作业需要多轮才能完成 | 增大 VCluster 规格(使 core 数 ≥ partition 数) | `ALTER VCLUSTER ... SET VCLUSTER_SIZE = 16` |
408
- | partition 少但数据量大 | 按条数切分 task | `ALTER PIPE ... SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}'` |
409
-
410
- ### COPY_JOB_HINT 参数
411
-
412
- | Key | 默认值 | 说明 |
413
- |-----|--------|------|
414
- | `cz.sql.split.kafka.strategy` | `simple` | `simple`=每 partition 一个 task;`size`=按条数切分 |
415
- | `cz.mapper.kafka.message.size` | `1000000` | 当 strategy=size 时,每个 task 处理的消息条数 |
416
-
417
- > ⚠️ **格式要求**:`COPY_JOB_HINT` 必须是合法 JSON,键值都要用双引号包围:
418
- > ```sql
419
- > -- ✅ 正确
420
- > ALTER PIPE my_pipe SET COPY_JOB_HINT = '{"cz.sql.split.kafka.strategy":"size","cz.mapper.kafka.message.size":"200000"}';
421
- > -- ❌ 错误(非 JSON 格式)
422
- > ALTER PIPE my_pipe SET COPY_JOB_HINT = 'cz.sql.split.kafka.strategy=size';
423
- > ```
424
- > 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数。
425
-
426
- ---
427
-
428
- ## 典型场景
429
-
430
- ### 场景 A:简单 JSON Topic 接入
431
-
432
- ```sql
433
- -- 1. 探查
434
- SELECT $1:id::STRING, $1:name::STRING, $1:value::DOUBLE
435
- FROM TABLE(READ_KAFKA(
436
- KAFKA_BROKER => 'kafka:9092', KAFKA_TOPIC => 'metrics',
437
- KAFKA_GROUP_ID => 'test', KAFKA_OFFSET => 'earliest', KAFKA_DATA_FORMAT => 'json'
438
- )) LIMIT 5;
439
-
440
- -- 2. 建表
441
- CREATE TABLE ods.metrics (id STRING, name STRING, value DOUBLE, kafka_ts TIMESTAMP);
442
-
443
- -- 3. 建 Pipe
444
- CREATE PIPE metrics_pipe VIRTUAL_CLUSTER = pipe_vc AS
445
- INSERT INTO ods.metrics
446
- SELECT $1:id::STRING, $1:name::STRING, $1:value::DOUBLE, CAST(timestamp AS TIMESTAMP)
447
- FROM TABLE(READ_KAFKA(
448
- KAFKA_BROKER => 'kafka:9092', KAFKA_TOPIC => 'metrics',
449
- KAFKA_GROUP_ID => 'cz_metrics', KAFKA_DATA_FORMAT => 'json'
450
- ));
451
- ```
452
-
453
- ### 场景 B:Kafka → ODS → DWD → DWS 实时 ETL
454
-
455
- ```sql
456
- -- 1. Pipe 接入 ODS 层
457
- CREATE PIPE kafka_events_pipe VIRTUAL_CLUSTER = pipe_vc AS
458
- INSERT INTO ods.events (event_id, user_id, action, ts)
459
- SELECT $1:event_id::STRING, $1:user_id::STRING, $1:action::STRING, $1:ts::TIMESTAMP
460
- FROM TABLE(READ_KAFKA(
461
- KAFKA_BROKER => 'kafka:9092', KAFKA_TOPIC => 'user_events',
462
- KAFKA_GROUP_ID => 'cz_events', KAFKA_DATA_FORMAT => 'json'
463
- ));
464
-
465
- -- 2. Dynamic Table 清洗到 DWD 层
466
- CREATE OR REPLACE DYNAMIC TABLE dwd.events_clean
467
- REFRESH interval 1 MINUTE VCLUSTER default_ap
468
- AS
469
- SELECT event_id, user_id, UPPER(action) AS action, ts, DATE(ts) AS dt
470
- FROM ods.events
471
- WHERE event_id IS NOT NULL AND action IS NOT NULL;
472
-
473
- -- 3. Dynamic Table 聚合到 DWS 层
474
- CREATE OR REPLACE DYNAMIC TABLE dws.events_hourly
475
- REFRESH interval 5 MINUTE VCLUSTER default_ap
476
- AS
477
- SELECT DATE_TRUNC('hour', ts) AS hour, action, COUNT(*) AS cnt, COUNT(DISTINCT user_id) AS uv
478
- FROM dwd.events_clean
479
- GROUP BY 1, 2;
480
- ```
481
-
482
- ### 场景 C:SASL 认证 + 指定时间点消费
483
-
484
- ```sql
485
- CREATE PIPE kafka_auth_pipe
486
- VIRTUAL_CLUSTER = pipe_vc
487
- BATCH_INTERVAL_IN_SECONDS = 60
488
- RESET_KAFKA_GROUP_OFFSETS = '1737789688000'
489
- AS
490
- INSERT INTO ods.secure_events (event_id, payload, kafka_ts)
491
- SELECT $1:id::STRING, $1:payload::STRING, CAST(timestamp AS TIMESTAMP)
492
- FROM TABLE(
493
- READ_KAFKA(
494
- KAFKA_BROKER => 'kafka.example.com:9092',
495
- KAFKA_TOPIC => 'secure_events',
496
- KAFKA_GROUP_ID => 'cz_secure',
497
- KAFKA_DATA_FORMAT => 'json',
498
- KAFKA_SASL_USERNAME => 'my_user',
499
- KAFKA_SASL_PASSWORD => 'my_password'
500
- )
501
- );
502
- ```
503
-
504
- ---
505
-
506
- ## 故障排除
507
-
508
- | 问题 | 排查方向 |
509
- |------|---------|
510
- | READ_KAFKA 探查无数据 | 检查 broker 地址/端口、topic 名称、网络连通性;尝试 `KAFKA_OFFSET => 'earliest'` |
511
- | Pipe 创建后无数据加载 | `DESC PIPE EXTENDED` 检查是否暂停;确认 group_id 的消费位点(默认 latest,新数据才会消费) |
512
- | JSON 解析报错 | 检查 `$1:field::TYPE` 语法;嵌套 JSON 需先 `PARSE_JSON()` 展开 |
513
- | SASL 认证失败 | 确认安全协议为 SASL_PLAINTEXT(不支持 SSL);检查用户名密码 |
514
- | 消费延迟持续增大 | 增大 `BATCH_SIZE_PER_KAFKA_PARTITION`;增大 VCluster 规格;使用 `COPY_JOB_HINT` 切分 task |
515
- | 重建 Pipe 后数据重复 | 保持相同 `KAFKA_GROUP_ID` 且不设置 `RESET_KAFKA_GROUP_OFFSETS` |
516
- | 重建 Pipe 后数据丢失 | 检查 group_id 的位点是否过期;如需回溯用 `RESET_KAFKA_GROUP_OFFSETS` 指定时间戳 |
517
- | `COPY_JOB_HINT` 修改后参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数 |
518
- | Pipe 作业 Failover | 查看作业详情;通常为 Kafka 连接中断或 Lakehouse 服务升级,会自动恢复 |
519
-
520
- ---
521
-
522
- ## 参考文档
523
-
524
- - [Pipe 简介](https://www.yunqi.tech/documents/pipe-summary)
525
- - [借助 read_kafka 函数持续导入](https://www.yunqi.tech/documents/pipe-kafka)
526
- - [借助 Kafka 外表 Table Stream 持续导入](https://www.yunqi.tech/documents/pipe-kafka-table-stream)
527
- - [最佳实践:使用 Pipe 高效接入 Kafka 数据](https://www.yunqi.tech/documents/pipe-kafka-bestpractice-1)
528
- - [read_kafka 函数](https://www.yunqi.tech/documents/read_kafka)
529
- - [Kafka 外部表](https://www.yunqi.tech/documents/kafka-external-table)
530
- - [Kafka Storage Connection](https://www.yunqi.tech/documents/Kafka_connection)
531
- - [PIPE 导入语法](https://www.yunqi.tech/documents/pipe-syntax)