@clickzetta/cz-cli-darwin-x64 0.3.39 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +96 -11
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +38 -20
- package/bin/skills/clickzetta-metadata/SKILL.md +51 -32
- package/bin/skills/clickzetta-monitoring/SKILL.md +18 -2
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +2 -2
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +63 -38
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +63 -16
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/bin/skills/cz-cli-inner/SKILL.md +5 -4
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
|
@@ -10,6 +10,40 @@ description: |
|
|
|
10
10
|
|
|
11
11
|
# URL/文件数据导入工作流
|
|
12
12
|
|
|
13
|
+
## 向导:收集必要信息
|
|
14
|
+
|
|
15
|
+
开始导入前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
question({
|
|
19
|
+
questions: [
|
|
20
|
+
{
|
|
21
|
+
question: "文件来源?",
|
|
22
|
+
options: [
|
|
23
|
+
{ label: "HTTP/HTTPS URL", description: "提供完整链接,自动下载" },
|
|
24
|
+
{ label: "本地文件", description: "本地路径,上传到 User Volume" },
|
|
25
|
+
{ label: "已在 Volume 上", description: "提供 Volume 名称和文件路径" },
|
|
26
|
+
{ label: "外部 Volume(OSS/S3/COS)", description: "提供外部存储路径" }
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
question: "写入模式?",
|
|
31
|
+
options: [
|
|
32
|
+
{ label: "create(自动建表)", description: "表不存在,推断 schema 后建表" },
|
|
33
|
+
{ label: "append(追加)", description: "追加到已有表,不删除历史数据" },
|
|
34
|
+
{ label: "overwrite(覆盖)", description: "清空已有表再写入" }
|
|
35
|
+
]
|
|
36
|
+
}
|
|
37
|
+
]
|
|
38
|
+
})
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
**如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
|
|
42
|
+
|
|
43
|
+
**如果用户已经提供了足够信息(如"把这个 URL 的 CSV 导入到 ods.orders 表"),直接进入步骤 1,不再重复询问。**
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
13
47
|
## 指令
|
|
14
48
|
|
|
15
49
|
### 步骤 1:获取源文件并上传到 Volume
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-java-sdk
|
|
3
|
+
description: |
|
|
4
|
+
使用 ClickZetta Java SDK 将数据批量或实时写入 Lakehouse 表。
|
|
5
|
+
覆盖 BulkloadStream(本地文件/数据库批量上传)和 RealtimeStream(Kafka 实时消费写入)
|
|
6
|
+
两种接口的完整使用模式,包括 Maven 依赖、连接 URL 格式、行写入 API、
|
|
7
|
+
状态监控、Options 调优和常见错误处理。
|
|
8
|
+
当用户说"Java SDK"、"BulkloadStream"、"RealtimeStream"、
|
|
9
|
+
"Java 写入 Lakehouse"、"Java 批量上传"、"Kafka Java 写入"、
|
|
10
|
+
"clickzetta-java"、"Maven 依赖"、"Java 数据导入"时触发。
|
|
11
|
+
Keywords: Java SDK, BulkloadStream, RealtimeStream, Kafka consumer, batch write, real-time write
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ClickZetta Java SDK
|
|
15
|
+
|
|
16
|
+
Java SDK 提供两种写入接口:
|
|
17
|
+
- **BulkloadStream** — 批量写入,适合定时 ETL、本地文件导入(不支持主键表,不适合 5 分钟以内的高频写入)
|
|
18
|
+
- **RealtimeStream** — 实时写入,适合 Kafka 消费、流式数据接入(秒级可查)
|
|
19
|
+
|
|
20
|
+
阅读 [references/bulkload.md](references/bulkload.md) 了解批量写入,[references/realtime.md](references/realtime.md) 了解实时写入。
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Maven 依赖
|
|
25
|
+
|
|
26
|
+
```xml
|
|
27
|
+
<!-- clickzetta-java 最新版本见 https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java -->
|
|
28
|
+
<dependency>
|
|
29
|
+
<groupId>com.clickzetta</groupId>
|
|
30
|
+
<artifactId>clickzetta-java</artifactId>
|
|
31
|
+
<version>2.0.0</version>
|
|
32
|
+
</dependency>
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
RealtimeStream + Kafka 还需要:
|
|
36
|
+
|
|
37
|
+
```xml
|
|
38
|
+
<dependency>
|
|
39
|
+
<groupId>org.apache.kafka</groupId>
|
|
40
|
+
<artifactId>kafka-clients</artifactId>
|
|
41
|
+
<version>3.2.0</version>
|
|
42
|
+
</dependency>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 连接 URL 格式
|
|
48
|
+
|
|
49
|
+
```java
|
|
50
|
+
// 推荐:显式参数方式(2.0.0+ 支持,不依赖 URL 解析)
|
|
51
|
+
ClickZettaClient client = ClickZettaClient.newBuilder()
|
|
52
|
+
.service("cn-shanghai-alicloud.api.clickzetta.com")
|
|
53
|
+
.instance("your_instance")
|
|
54
|
+
.workspace("your_workspace")
|
|
55
|
+
.schema("public")
|
|
56
|
+
.username("your_user")
|
|
57
|
+
.password("your_password")
|
|
58
|
+
.vcluster("default")
|
|
59
|
+
.build();
|
|
60
|
+
|
|
61
|
+
// 兼容:URL 方式(BulkloadStream 用 virtualcluster=,RealtimeStream 用 vcluster=)
|
|
62
|
+
String bulkUrl = MessageFormat.format(
|
|
63
|
+
"jdbc:clickzetta://{0}.{1}/{2}?schema={3}&username={4}&password={5}&virtualcluster={6}",
|
|
64
|
+
instance, region_endpoint, workspace, schema, username, password, vcluster
|
|
65
|
+
);
|
|
66
|
+
String rtUrl = MessageFormat.format(
|
|
67
|
+
"jdbc:clickzetta://{0}.{1}/{2}?schema={3}&username={4}&password={5}&vcluster={6}",
|
|
68
|
+
instance, region_endpoint, workspace, schema, username, password, vcluster
|
|
69
|
+
);
|
|
70
|
+
ClickZettaClient client = ClickZettaClient.newBuilder().url(bulkUrl).build(); // BulkloadStream 传 bulkUrl;RealtimeStream 场景改传 rtUrl
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
JDBC 连接(DDL / 查询):
|
|
74
|
+
|
|
75
|
+
```java
|
|
76
|
+
// 2.0.0+ 驱动类:com.clickzetta.client.jdbc.ClickZettaDriver
|
|
77
|
+
// 1.x 驱动类:com.clickzetta.jdbc.ClickZettaDriver
|
|
78
|
+
Class.forName("com.clickzetta.client.jdbc.ClickZettaDriver");
|
|
79
|
+
Connection conn = DriverManager.getConnection(jdbcUrl);
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## BulkloadStream 快速示例
|
|
85
|
+
|
|
86
|
+
```java
|
|
87
|
+
// 创建 BulkloadStream
|
|
88
|
+
BulkloadStream stream = client.newBulkloadStreamBuilder()
|
|
89
|
+
.schema("public")
|
|
90
|
+
.table("orders")
|
|
91
|
+
.operate(RowStream.BulkLoadOperate.APPEND)
|
|
92
|
+
.build();
|
|
93
|
+
|
|
94
|
+
// 写入数据(列索引从 0 开始,顺序与建表 DDL 一致)
|
|
95
|
+
Row row = stream.createRow();
|
|
96
|
+
row.setValue(0, "order-001"); // STRING
|
|
97
|
+
row.setValue(1, 1); // INT
|
|
98
|
+
row.setValue(2, 299.99); // DOUBLE
|
|
99
|
+
stream.apply(row); // ⚠️ 必须调用,否则数据不发送到服务端
|
|
100
|
+
|
|
101
|
+
// 关闭并等待完成
|
|
102
|
+
stream.close();
|
|
103
|
+
while (stream.getState() == StreamState.RUNNING) {
|
|
104
|
+
Thread.sleep(1000);
|
|
105
|
+
}
|
|
106
|
+
if (stream.getState() == StreamState.FAILED) {
|
|
107
|
+
throw new RuntimeException(stream.getErrorMessage());
|
|
108
|
+
}
|
|
109
|
+
client.close();
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## RealtimeStream 快速示例
|
|
115
|
+
|
|
116
|
+
```java
|
|
117
|
+
// Options 调优
|
|
118
|
+
Options options = Options.builder()
|
|
119
|
+
.withMutationBufferLinesNum(10) // 缓冲行数
|
|
120
|
+
.build();
|
|
121
|
+
|
|
122
|
+
// 创建 RealtimeStream(普通表,APPEND_ONLY)
|
|
123
|
+
RealtimeStream stream = client.newRealtimeStreamBuilder()
|
|
124
|
+
.operate(RowStream.RealTimeOperate.APPEND_ONLY)
|
|
125
|
+
.options(options)
|
|
126
|
+
.schema("public")
|
|
127
|
+
.table("events")
|
|
128
|
+
.build();
|
|
129
|
+
|
|
130
|
+
// 写入数据(用列名,不用索引)
|
|
131
|
+
Row row = stream.createRow(Stream.Operator.INSERT);
|
|
132
|
+
row.setValue("id", 1);
|
|
133
|
+
row.setValue("event", "{\"type\":\"click\"}");
|
|
134
|
+
stream.apply(row);
|
|
135
|
+
stream.close();
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## RealtimeStream CDC 示例(主键表 UPSERT / DELETE)
|
|
139
|
+
|
|
140
|
+
```java
|
|
141
|
+
// 建表:CREATE TABLE orders (txid STRING NOT NULL PRIMARY KEY, amount DOUBLE, status STRING);
|
|
142
|
+
|
|
143
|
+
RealtimeStream stream = client.newRealtimeStreamBuilder()
|
|
144
|
+
.operate(RowStream.RealTimeOperate.CDC) // 主键表必须用 CDC
|
|
145
|
+
.options(options)
|
|
146
|
+
.schema("public")
|
|
147
|
+
.table("orders")
|
|
148
|
+
.build();
|
|
149
|
+
|
|
150
|
+
// UPSERT:存在则更新,不存在则插入
|
|
151
|
+
Row row = stream.createRow(Stream.Operator.UPSERT);
|
|
152
|
+
row.setValue("txid", "order-001");
|
|
153
|
+
row.setValue("amount", 299.99);
|
|
154
|
+
row.setValue("status", "paid");
|
|
155
|
+
stream.apply(row);
|
|
156
|
+
|
|
157
|
+
// DELETE_IGNORE:删除,目标行不存在时自动忽略
|
|
158
|
+
Row del = stream.createRow(Stream.Operator.DELETE_IGNORE);
|
|
159
|
+
del.setValue("txid", "order-001");
|
|
160
|
+
stream.apply(del);
|
|
161
|
+
|
|
162
|
+
stream.close();
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## 选择指南
|
|
168
|
+
|
|
169
|
+
| 场景 | 推荐接口 |
|
|
170
|
+
|---|---|
|
|
171
|
+
| 定时批量 ETL(每小时/每天) | BulkloadStream |
|
|
172
|
+
| Kafka 实时消费 | RealtimeStream |
|
|
173
|
+
| 5 分钟以内高频写入 | RealtimeStream |
|
|
174
|
+
| 主键表写入(UPSERT / DELETE) | RealtimeStream CDC 模式 |
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## 使用限制
|
|
179
|
+
|
|
180
|
+
| 限制 | BulkloadStream | RealtimeStream |
|
|
181
|
+
|---|---|---|
|
|
182
|
+
| 主键表 | ❌ 不支持 | ✅ CDC 模式支持 |
|
|
183
|
+
| 高频写入(< 5 分钟) | ❌ 不适合 | ✅ 支持 |
|
|
184
|
+
| 数据可见延迟 | 写完 close() 后可见 | 秒级可查 |
|
|
185
|
+
| Table Stream/Dynamic Table 可见 | close() 后 | ~1 分钟后 |
|
|
186
|
+
| 表结构变更 | 重建 Stream | 停止任务,变更后等待约 90 分钟再重启 |
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"用 Java SDK BulkloadStream 批量写入数据到 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"Java 怎么消费 Kafka 实时写入 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["RealtimeStream"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"clickzetta-java 的 Maven 依赖怎么配","expected_skill":"clickzetta-java-sdk","expected_output_contains":["groupId","clickzetta-java"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"BulkloadStream 和 RealtimeStream 有什么区别","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream","RealtimeStream"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"Java SDK 连接 URL 格式是什么","expected_skill":"clickzetta-java-sdk","expected_output_contains":["URL"]}
|
|
6
|
+
{"case_id":"006","type":"should_call","user_input":"Java 批量上传本地文件到 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
|
|
7
|
+
{"case_id":"007","type":"should_call","user_input":"RealtimeStream 的 setValue 怎么用","expected_skill":"clickzetta-java-sdk","expected_output_contains":["setValue"]}
|
|
8
|
+
{"case_id":"008","type":"should_not_call","user_input":"Python SDK 怎么连接 Lakehouse","forbidden_skill":"clickzetta-java-sdk"}
|
|
9
|
+
{"case_id":"009","type":"should_not_call","user_input":"帮我写一个 Spring Boot 应用","forbidden_skill":"clickzetta-java-sdk"}
|
|
10
|
+
{"case_id":"010","type":"should_not_call","user_input":"Flink 怎么写入 Lakehouse","forbidden_skill":"clickzetta-java-sdk"}
|
|
11
|
+
{"case_id":"011","type":"should_not_call","user_input":"怎么创建 VCluster","forbidden_skill":"clickzetta-java-sdk"}
|
|
12
|
+
{"case_id":"012","type":"should_not_call","user_input":"MySQL JDBC 连接怎么配置","forbidden_skill":"clickzetta-java-sdk"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# BulkloadStream 详细参考
|
|
2
|
+
|
|
3
|
+
> 适合:定时 ETL、本地文件导入、数据库迁移
|
|
4
|
+
> 不适合:主键表、5 分钟以内高频写入
|
|
5
|
+
|
|
6
|
+
## Maven 依赖
|
|
7
|
+
|
|
8
|
+
```xml
|
|
9
|
+
<!-- 最新版本见 https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java -->
|
|
10
|
+
<dependency>
|
|
11
|
+
<groupId>com.clickzetta</groupId>
|
|
12
|
+
<artifactId>clickzetta-java</artifactId>
|
|
13
|
+
<version>2.0.0</version>
|
|
14
|
+
</dependency>
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
最新版本见 [Maven Central](https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java)
|
|
18
|
+
|
|
19
|
+
## 使用限制
|
|
20
|
+
|
|
21
|
+
- **不支持主键(pk)表写入**
|
|
22
|
+
- **不适合时间间隔小于 5 分钟的高频写入**
|
|
23
|
+
- 写入完成 `close()` 后数据才可见
|
|
24
|
+
|
|
25
|
+
## 完整示例:读取本地 CSV 写入 Lakehouse
|
|
26
|
+
|
|
27
|
+
### 建表
|
|
28
|
+
|
|
29
|
+
```sql
|
|
30
|
+
CREATE TABLE bulk_order_items (
|
|
31
|
+
order_id STRING,
|
|
32
|
+
order_item_id INT,
|
|
33
|
+
product_id STRING,
|
|
34
|
+
seller_id STRING,
|
|
35
|
+
shipping_limit_date STRING,
|
|
36
|
+
price DOUBLE,
|
|
37
|
+
freight_value DOUBLE
|
|
38
|
+
);
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Java 代码(BulkloadFile 类)
|
|
42
|
+
|
|
43
|
+
```java
|
|
44
|
+
import com.clickzetta.client.BulkloadStream;
|
|
45
|
+
import com.clickzetta.client.ClickZettaClient;
|
|
46
|
+
import com.clickzetta.client.RowStream;
|
|
47
|
+
import com.clickzetta.client.StreamState;
|
|
48
|
+
import com.clickzetta.platform.client.api.Row;
|
|
49
|
+
|
|
50
|
+
import java.io.BufferedReader;
|
|
51
|
+
import java.io.File;
|
|
52
|
+
import java.io.FileReader;
|
|
53
|
+
import java.text.MessageFormat;
|
|
54
|
+
|
|
55
|
+
public class BulkloadFile {
|
|
56
|
+
private static ClickZettaClient client;
|
|
57
|
+
private static final String password = "";
|
|
58
|
+
private static final String table = "bulk_order_items";
|
|
59
|
+
private static final String workspace = "";
|
|
60
|
+
private static final String schema = "public";
|
|
61
|
+
private static final String vc = "default";
|
|
62
|
+
private static final String user = "";
|
|
63
|
+
static BulkloadStream bulkloadStream;
|
|
64
|
+
|
|
65
|
+
public static void main(String[] args) throws Exception {
|
|
66
|
+
initialize();
|
|
67
|
+
File csvFile = new File("olist_order_items_dataset.csv");
|
|
68
|
+
BufferedReader reader = new BufferedReader(new FileReader(csvFile));
|
|
69
|
+
reader.readLine(); // 跳过 header 行
|
|
70
|
+
|
|
71
|
+
String line;
|
|
72
|
+
while ((line = reader.readLine()) != null) {
|
|
73
|
+
String[] values = line.split(",");
|
|
74
|
+
// 类型转换必须与建表 DDL 一致
|
|
75
|
+
String orderId = values[0];
|
|
76
|
+
int orderItemId = Integer.parseInt(values[1]);
|
|
77
|
+
String productId = values[2];
|
|
78
|
+
String sellerId = values[3];
|
|
79
|
+
String shippingLimitDate = values[4];
|
|
80
|
+
double price = Double.parseDouble(values[5]);
|
|
81
|
+
double freightValue = Double.parseDouble(values[6]);
|
|
82
|
+
|
|
83
|
+
Row row = bulkloadStream.createRow();
|
|
84
|
+
// ⚠️ BulkloadStream 用列索引(从 0 开始),顺序与建表 DDL 一致
|
|
85
|
+
row.setValue(0, orderId);
|
|
86
|
+
row.setValue(1, orderItemId);
|
|
87
|
+
row.setValue(2, productId);
|
|
88
|
+
row.setValue(3, sellerId);
|
|
89
|
+
row.setValue(4, shippingLimitDate);
|
|
90
|
+
row.setValue(5, price);
|
|
91
|
+
row.setValue(6, freightValue);
|
|
92
|
+
// ⚠️ 必须调用 apply(),否则数据不发送到服务端
|
|
93
|
+
bulkloadStream.apply(row);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
reader.close();
|
|
97
|
+
bulkloadStream.close();
|
|
98
|
+
waitForBulkloadCompletion();
|
|
99
|
+
client.close();
|
|
100
|
+
System.out.println("Data inserted successfully!");
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
private static void initialize() throws Exception {
|
|
104
|
+
// 推荐:显式参数方式(2.0.0+ 支持)
|
|
105
|
+
client = ClickZettaClient.newBuilder()
|
|
106
|
+
.service("cn-shanghai-alicloud.api.clickzetta.com")
|
|
107
|
+
.instance("your_instance")
|
|
108
|
+
.workspace(workspace)
|
|
109
|
+
.schema(schema)
|
|
110
|
+
.username(user)
|
|
111
|
+
.password(password)
|
|
112
|
+
.vcluster(vc)
|
|
113
|
+
.build();
|
|
114
|
+
bulkloadStream = client.newBulkloadStreamBuilder()
|
|
115
|
+
.schema(schema)
|
|
116
|
+
.table(table)
|
|
117
|
+
.operate(RowStream.BulkLoadOperate.APPEND)
|
|
118
|
+
.build();
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
private static void waitForBulkloadCompletion() throws InterruptedException {
|
|
122
|
+
while (bulkloadStream.getState() == StreamState.RUNNING) {
|
|
123
|
+
Thread.sleep(1000);
|
|
124
|
+
}
|
|
125
|
+
if (bulkloadStream.getState() == StreamState.FAILED) {
|
|
126
|
+
throw new RuntimeException(bulkloadStream.getErrorMessage());
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## 关键 API
|
|
133
|
+
|
|
134
|
+
| API | 说明 |
|
|
135
|
+
|---|---|
|
|
136
|
+
| `bulkloadStream.createRow()` | 创建行对象(无参数) |
|
|
137
|
+
| `row.setValue(int index, Object value)` | 按列索引设值(从 0 开始) |
|
|
138
|
+
| `bulkloadStream.apply(row)` | 发送行到服务端(必须调用) |
|
|
139
|
+
| `bulkloadStream.close()` | 关闭并触发提交 |
|
|
140
|
+
| `bulkloadStream.getState()` | 获取状态:RUNNING / SUCCEEDED / FAILED |
|
|
141
|
+
| `bulkloadStream.getErrorMessage()` | 获取失败原因 |
|
|
142
|
+
|
|
143
|
+
## 类型映射
|
|
144
|
+
|
|
145
|
+
| Java 类型 | Lakehouse 类型 |
|
|
146
|
+
|---|---|
|
|
147
|
+
| `Long` / `long` | BIGINT |
|
|
148
|
+
| `Integer` / `int` | INT |
|
|
149
|
+
| `Double` / `double` | DOUBLE |
|
|
150
|
+
| `String` | STRING / VARCHAR |
|
|
151
|
+
| `Boolean` | BOOLEAN |
|
|
152
|
+
| `java.sql.Timestamp` | TIMESTAMP |
|
|
153
|
+
| `java.sql.Date` | DATE |
|
|
154
|
+
| `BigDecimal` | DECIMAL |
|
|
155
|
+
|
|
156
|
+
## 常见问题
|
|
157
|
+
|
|
158
|
+
| 问题 | 原因 | 解决方案 |
|
|
159
|
+
|---|---|---|
|
|
160
|
+
| 数据写入后查不到 | 未调用 `apply()` 或未等待 RUNNING 结束 | 确认每行都调用 `apply()`,等待状态变为 SUCCEEDED |
|
|
161
|
+
| 主键表写入报错 | BulkloadStream 不支持主键表 | 改用 RealtimeStream CDC 模式(见 realtime.md),或 JDBC + MERGE、Flink igs-dynamic-table |
|
|
162
|
+
| 列值类型不匹配 | Java 类型与建表 DDL 不一致 | 写入前做类型转换(parseInt、parseDouble 等) |
|
|
163
|
+
| 连接失败 | URL 参数名错误 | BulkloadStream 用 `virtualcluster=`,不是 `vcluster=` |
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# RealtimeStream 实时写入参考
|
|
2
|
+
|
|
3
|
+
> 适合:Kafka 消费写入、高频实时数据接入(秒级可查)、主键表 CDC 写入
|
|
4
|
+
|
|
5
|
+
## Maven 依赖
|
|
6
|
+
|
|
7
|
+
```xml
|
|
8
|
+
<!-- 最新版本见 https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java -->
|
|
9
|
+
<dependency>
|
|
10
|
+
<groupId>com.clickzetta</groupId>
|
|
11
|
+
<artifactId>clickzetta-java</artifactId>
|
|
12
|
+
<version>2.0.0</version>
|
|
13
|
+
</dependency>
|
|
14
|
+
<dependency>
|
|
15
|
+
<groupId>org.apache.kafka</groupId>
|
|
16
|
+
<artifactId>kafka-clients</artifactId>
|
|
17
|
+
<version>3.2.0</version>
|
|
18
|
+
</dependency>
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## 使用限制
|
|
22
|
+
|
|
23
|
+
- 实时写入的数据可以秒级查询
|
|
24
|
+
- table stream、dynamic table 需等待约 **1 分钟**才能看到写入数据
|
|
25
|
+
- 表结构变更时,需先停止写入任务;变更完成后等待约 **90 分钟**再重新启动
|
|
26
|
+
|
|
27
|
+
## 操作模式
|
|
28
|
+
|
|
29
|
+
| 模式 | 适用表 | 可用 Operator |
|
|
30
|
+
|---|---|---|
|
|
31
|
+
| `RealTimeOperate.APPEND_ONLY` | 普通表 | `Stream.Operator.INSERT` |
|
|
32
|
+
| `RealTimeOperate.CDC` | 主键表 | `Stream.Operator.UPSERT`、`Stream.Operator.DELETE_IGNORE` |
|
|
33
|
+
|
|
34
|
+
## 普通表写入(APPEND_ONLY)
|
|
35
|
+
|
|
36
|
+
```java
|
|
37
|
+
// 推荐:显式参数方式(2.0.0+ 支持,不依赖 URL 解析)
|
|
38
|
+
ClickZettaClient client = ClickZettaClient.newBuilder()
|
|
39
|
+
.service("cn-shanghai-alicloud.api.clickzetta.com")
|
|
40
|
+
.instance("your_instance")
|
|
41
|
+
.workspace(workspace)
|
|
42
|
+
.schema(schema)
|
|
43
|
+
.username(user)
|
|
44
|
+
.password(password)
|
|
45
|
+
.vcluster(vc)
|
|
46
|
+
.build();
|
|
47
|
+
Options options = Options.builder().withMutationBufferLinesNum(10).build();
|
|
48
|
+
|
|
49
|
+
RealtimeStream stream = client.newRealtimeStreamBuilder()
|
|
50
|
+
.operate(RowStream.RealTimeOperate.APPEND_ONLY)
|
|
51
|
+
.options(options)
|
|
52
|
+
.schema(schema)
|
|
53
|
+
.table("events")
|
|
54
|
+
.build();
|
|
55
|
+
|
|
56
|
+
// ⚠️ RealtimeStream 用列名(不是索引)
|
|
57
|
+
Row row = stream.createRow(Stream.Operator.INSERT);
|
|
58
|
+
row.setValue("id", 1);
|
|
59
|
+
row.setValue("event", "{\"type\":\"click\"}");
|
|
60
|
+
stream.apply(row);
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## 主键表写入(CDC 模式)
|
|
64
|
+
|
|
65
|
+
```java
|
|
66
|
+
// 建表(主键表)
|
|
67
|
+
// CREATE TABLE orders (`txid` STRING PRIMARY KEY, `amount` DOUBLE, `status` STRING);
|
|
68
|
+
|
|
69
|
+
RealtimeStream stream = client.newRealtimeStreamBuilder()
|
|
70
|
+
.operate(RowStream.RealTimeOperate.CDC)
|
|
71
|
+
.options(options)
|
|
72
|
+
.schema(schema)
|
|
73
|
+
.table("orders")
|
|
74
|
+
.build();
|
|
75
|
+
|
|
76
|
+
// UPSERT:存在则更新,不存在则插入
|
|
77
|
+
Row row = stream.createRow(Stream.Operator.UPSERT);
|
|
78
|
+
row.setValue("txid", "order-001");
|
|
79
|
+
row.setValue("amount", 299.99);
|
|
80
|
+
row.setValue("status", "paid");
|
|
81
|
+
stream.apply(row);
|
|
82
|
+
|
|
83
|
+
// DELETE_IGNORE:删除,目标行不存在时自动忽略
|
|
84
|
+
Row delRow = stream.createRow(Stream.Operator.DELETE_IGNORE);
|
|
85
|
+
delRow.setValue("txid", "order-001");
|
|
86
|
+
stream.apply(delRow);
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## 完整示例:Kafka → Lakehouse
|
|
90
|
+
|
|
91
|
+
### KafkaReader 类
|
|
92
|
+
|
|
93
|
+
```java
|
|
94
|
+
import org.apache.kafka.clients.consumer.ConsumerConfig;
|
|
95
|
+
import org.apache.kafka.clients.consumer.KafkaConsumer;
|
|
96
|
+
import java.util.Collections;
|
|
97
|
+
import java.util.Properties;
|
|
98
|
+
|
|
99
|
+
public class KafkaReader {
|
|
100
|
+
private KafkaConsumer<String, String> consumer;
|
|
101
|
+
|
|
102
|
+
public KafkaReader() {
|
|
103
|
+
Properties props = new Properties();
|
|
104
|
+
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
|
|
105
|
+
props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-group");
|
|
106
|
+
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
|
|
107
|
+
"org.apache.kafka.common.serialization.StringDeserializer");
|
|
108
|
+
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
|
|
109
|
+
"org.apache.kafka.common.serialization.StringDeserializer");
|
|
110
|
+
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
|
|
111
|
+
props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000");
|
|
112
|
+
consumer = new KafkaConsumer<>(props);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
public KafkaConsumer<String, String> readFromTopic(String topic) {
|
|
116
|
+
consumer.subscribe(Collections.singleton(topic));
|
|
117
|
+
return consumer;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Kafka2Lakehouse 主类
|
|
123
|
+
|
|
124
|
+
```java
|
|
125
|
+
import com.clickzetta.client.ClickZettaClient;
|
|
126
|
+
import com.clickzetta.client.RealtimeStream;
|
|
127
|
+
import com.clickzetta.client.RowStream;
|
|
128
|
+
import com.clickzetta.platform.client.api.Options;
|
|
129
|
+
import com.clickzetta.platform.client.api.Row;
|
|
130
|
+
import com.clickzetta.platform.client.api.Stream;
|
|
131
|
+
import org.apache.kafka.clients.consumer.ConsumerRecord;
|
|
132
|
+
import org.apache.kafka.clients.consumer.ConsumerRecords;
|
|
133
|
+
import org.apache.kafka.clients.consumer.KafkaConsumer;
|
|
134
|
+
import java.time.Duration;
|
|
135
|
+
|
|
136
|
+
public class Kafka2Lakehouse {
|
|
137
|
+
private static ClickZettaClient client;
|
|
138
|
+
private static final String password = "";
|
|
139
|
+
private static final String table = "realtime_stream";
|
|
140
|
+
private static final String workspace = "";
|
|
141
|
+
private static final String schema = "public";
|
|
142
|
+
private static final String user = "";
|
|
143
|
+
private static final String vc = "default";
|
|
144
|
+
static RealtimeStream realtimeStream;
|
|
145
|
+
|
|
146
|
+
public static void main(String[] args) throws Exception {
|
|
147
|
+
initialize();
|
|
148
|
+
KafkaReader kafkaReader = new KafkaReader();
|
|
149
|
+
final KafkaConsumer<String, String> consumer = kafkaReader.readFromTopic("lakehouse-stream");
|
|
150
|
+
int i = 1;
|
|
151
|
+
while (true) {
|
|
152
|
+
ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
|
|
153
|
+
for (ConsumerRecord<String, String> record : records) {
|
|
154
|
+
Row row = realtimeStream.createRow(Stream.Operator.INSERT);
|
|
155
|
+
row.setValue("id", i++);
|
|
156
|
+
row.setValue("event", record.value());
|
|
157
|
+
realtimeStream.apply(row);
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
private static void initialize() throws Exception {
|
|
163
|
+
Options options = Options.builder().withMutationBufferLinesNum(10).build();
|
|
164
|
+
client = ClickZettaClient.newBuilder()
|
|
165
|
+
.service("cn-shanghai-alicloud.api.clickzetta.com")
|
|
166
|
+
.instance("your_instance")
|
|
167
|
+
.workspace(workspace)
|
|
168
|
+
.schema(schema)
|
|
169
|
+
.username(user)
|
|
170
|
+
.password(password)
|
|
171
|
+
.vcluster(vc)
|
|
172
|
+
.build();
|
|
173
|
+
realtimeStream = client.newRealtimeStreamBuilder()
|
|
174
|
+
.operate(RowStream.RealTimeOperate.APPEND_ONLY)
|
|
175
|
+
.options(options)
|
|
176
|
+
.schema(schema)
|
|
177
|
+
.table(table)
|
|
178
|
+
.build();
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## 关键 API
|
|
184
|
+
|
|
185
|
+
| API | 说明 |
|
|
186
|
+
|---|---|
|
|
187
|
+
| `realtimeStream.createRow(Stream.Operator.INSERT)` | 普通表插入行 |
|
|
188
|
+
| `realtimeStream.createRow(Stream.Operator.UPSERT)` | 主键表 upsert 行 |
|
|
189
|
+
| `realtimeStream.createRow(Stream.Operator.DELETE_IGNORE)` | 主键表删除行 |
|
|
190
|
+
| `row.setValue(String columnName, Object value)` | 按列名设值(不是索引) |
|
|
191
|
+
| `realtimeStream.apply(row)` | 发送行到服务端 |
|
|
192
|
+
| `Options.builder().withMutationBufferLinesNum(n)` | 设置缓冲行数(默认 10) |
|
|
193
|
+
|
|
194
|
+
## BulkloadStream vs RealtimeStream 对比
|
|
195
|
+
|
|
196
|
+
| 维度 | BulkloadStream | RealtimeStream |
|
|
197
|
+
|---|---|---|
|
|
198
|
+
| 列设值方式 | `setValue(int index, value)` | `setValue(String name, value)` |
|
|
199
|
+
| URL 参数 | `virtualcluster=` | `vcluster=` |
|
|
200
|
+
| createRow 参数 | 无参数 | `Stream.Operator.INSERT/UPSERT/DELETE_IGNORE` |
|
|
201
|
+
| 适用频率 | 低频(≥5 分钟/批) | 高频(秒级) |
|
|
202
|
+
| 数据可见延迟 | close() 后可见 | 秒级可查(Table Stream / Dynamic Table 约 1 分钟后可见) |
|
|
203
|
+
| 主键表 | ❌ | ✅ CDC 模式 |
|
|
204
|
+
|
|
205
|
+
## 常见问题
|
|
206
|
+
|
|
207
|
+
| 问题 | 原因 | 解决方案 |
|
|
208
|
+
|---|---|---|
|
|
209
|
+
| 连接失败 | URL 参数名错误 | RealtimeStream 用 `vcluster=`,不是 `virtualcluster=` |
|
|
210
|
+
| 列名找不到 | 列名拼写错误 | 列名区分大小写,与建表 DDL 保持一致 |
|
|
211
|
+
| 表结构变更后写入失败 | 旧 Stream 实例缓存了旧 schema | 停止任务,变更后等约 90 分钟再重启 |
|
|
212
|
+
| dynamic table 看不到数据 | 实时写入有 ~1 分钟确认延迟 | 等待 1 分钟后再查询 |
|