@clickzetta/cz-cli-darwin-x64 0.3.91 → 0.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,18 +1,20 @@
1
- # External Function DDL 参考
1
+ # External Function DDL Reference
2
2
 
3
- > 来源:https://www.yunqi.tech/documents/CREATE_EXTERNATL_FUNCTION
3
+ > Source: https://www.yunqi.tech/documents/CREATE_EXTERNATL_FUNCTION
4
4
 
5
- ## 概念
5
+ ## Concepts
6
6
 
7
- External Function(外部函数)是通过 Python/Java 编写、在云函数服务(阿里云 FC / 腾讯云 SCF / AWS Lambda)上执行的自定义 UDF。可调用:
8
- - **在线服务**:LLM API、图像识别 API
9
- - **离线模型**:打包上传的 Hugging Face 模型等
7
+ An External Function is a custom UDF written in Python or Java and executed on a cloud function service (Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda). It can call:
8
+ - **Online services**: image recognition APIs, custom REST services, etc.
9
+ - **Offline models**: Hugging Face models packaged and uploaded
10
10
 
11
- 支持函数类型:UDF(标量)、UDAF(聚合,仅 Java)、UDTF(表函数,仅 Java)
11
+ Supported function types: UDF (scalar), UDAF (aggregate, Java only), UDTF (table function, Java only)
12
+
13
+ > For built-in LLM functions (AI_COMPLETE, AI_EMBEDDING), see the `clickzetta-ai-function` skill.
12
14
 
13
15
  ---
14
16
 
15
- ## CREATE API CONNECTION(云函数连接)
17
+ ## CREATE API CONNECTION (Cloud Function)
16
18
 
17
19
  ```sql
18
20
  CREATE API CONNECTION IF NOT EXISTS my_fc_conn
@@ -20,17 +22,17 @@ CREATE API CONNECTION IF NOT EXISTS my_fc_conn
20
22
  PROVIDER = 'aliyun' -- 'aliyun' | 'tencent' | 'aws'
21
23
  REGION = 'cn-shanghai'
22
24
  ROLE_ARN = 'acs:ram::1234567890:role/CzUDFRole'
23
- NAMESPACE = 'default' -- 腾讯云必填,其他填 'default'
25
+ NAMESPACE = 'default' -- Required for Tencent Cloud; use 'default' for others
24
26
  CODE_BUCKET = 'my-oss-bucket';
25
27
  ```
26
28
 
27
- | 参数 | 说明 |
29
+ | Parameter | Description |
28
30
  |---|---|
29
31
  | PROVIDER | `'aliyun'` / `'tencent'` / `'aws'` |
30
- | REGION | 阿里云:`cn-shanghai`;腾讯云:`ap-beijing`;AWS:`cn-northwest-1` |
31
- | ROLE_ARN | 授权给 Lakehouse RAM 角色 ARN |
32
- | NAMESPACE | 腾讯云命名空间(必填);其他填 `'default'` |
33
- | CODE_BUCKET | 存放函数代码包的 OSS/COS/S3 bucket 名称 |
32
+ | REGION | Alibaba Cloud: `cn-shanghai`; Tencent Cloud: `ap-beijing`; AWS: `cn-northwest-1` |
33
+ | ROLE_ARN | RAM role ARN granted to Lakehouse |
34
+ | NAMESPACE | Tencent Cloud namespace (required); use `'default'` for others |
35
+ | CODE_BUCKET | OSS/COS/S3 bucket name where the function code package is stored |
34
36
 
35
37
  ---
36
38
 
@@ -44,10 +46,10 @@ CREATE EXTERNAL FUNCTION IF NOT EXISTS my_schema.my_udf
44
46
  WITH PROPERTIES (
45
47
  'remote.udf.api' = 'python3.mc.v0' -- Python: python3.mc.v0 | Java: java8.hive2.v0
46
48
  )
47
- COMMENT '自定义函数说明';
49
+ COMMENT 'Custom function description';
48
50
  ```
49
51
 
50
- ### 资源文件地址格式
52
+ ### Resource File Path Formats
51
53
 
52
54
  ```
53
55
  -- OSS/COS/S3
@@ -55,24 +57,24 @@ oss://bucket-name/path/to/code.zip
55
57
  cos://bucket-name/path/to/code.zip
56
58
  s3://bucket-name/path/to/code.zip
57
59
 
58
- -- User Volume(无需开通对象存储)
60
+ -- User Volume (no object storage required)
59
61
  volume:user://~/code.zip
60
62
 
61
63
  -- External Volume
62
64
  volume://workspace.schema.volume_name/code.zip
63
65
  ```
64
66
 
65
- ### WITH PROPERTIES 参数
67
+ ### WITH PROPERTIES Parameters
66
68
 
67
- | 参数 | 值 | 说明 |
69
+ | Parameter | Value | Description |
68
70
  |---|---|---|
69
- | `remote.udf.api` | `python3.mc.v0` | Python 3.10 运行时 |
70
- | `remote.udf.api` | `java8.hive2.v0` | Java 8 Hive 风格 UDF |
71
- | `remote.udf.protocol` | `http.arrow.v0` | 默认,访问云函数的协议 |
71
+ | `remote.udf.api` | `python3.mc.v0` | Python 3.10 runtime |
72
+ | `remote.udf.api` | `java8.hive2.v0` | Java 8 Hive-style UDF |
73
+ | `remote.udf.protocol` | `http.arrow.v0` | Default protocol for accessing the cloud function |
72
74
 
73
75
  ---
74
76
 
75
- ## Python UDF 代码结构
77
+ ## Python UDF Code Structure
76
78
 
77
79
  ```python
78
80
  #!/usr/bin/env python
@@ -81,7 +83,7 @@ try:
81
83
  except ImportError:
82
84
  annotate = lambda _: lambda _: _
83
85
 
84
- @annotate("string->string") # 函数签名:输入类型->返回类型
86
+ @annotate("string->string") # Function signature: input_type->return_type
85
87
  class Upper(object):
86
88
  def evaluate(self, arg):
87
89
  if arg is None:
@@ -89,83 +91,43 @@ class Upper(object):
89
91
  return arg.upper()
90
92
  ```
91
93
 
92
- ### 函数签名格式
94
+ ### Function Signature Format
93
95
 
94
96
  ```
95
97
  "input_type1,input_type2->return_type"
96
98
 
97
- # 示例
98
- "string->string" # 字符串转字符串
99
- "string,int->double" # 两个输入,返回 double
100
- "string->array<string>" # 返回数组
99
+ # Examples
100
+ "string->string" # String in, string out
101
+ "string,int->double" # Two inputs, returns double
102
+ "string->array<string>" # Returns an array
101
103
  ```
102
104
 
103
- 支持类型:`string`、`int`、`bigint`、`double`、`float`、`boolean`、`array<T>`、`map<K,V>`
105
+ Supported types: `string`, `int`, `bigint`, `double`, `float`, `boolean`, `array<T>`, `map<K,V>`
104
106
 
105
- ### 打包上传
107
+ ### Packaging and Upload
106
108
 
107
109
  ```bash
108
- # 安装依赖到当前目录
110
+ # Install dependencies into the current directory
109
111
  pip3 install httpx pydantic -t .
110
112
 
111
- # 打包(< 500MB)
113
+ # Package (must be < 500 MB)
112
114
  zip -rq code.zip ./*
113
115
  ```
114
116
 
115
117
  ```sql
116
- -- 上传到 User Volume(在 ClickZetta Studio 或 CLI 中执行,source_path 使用绝对路径)
118
+ -- Upload to User Volume (run in ClickZetta Studio or CLI; source_path must be an absolute path)
117
119
  PUT '/path/to/code.zip' TO USER VOLUME;
118
120
  ```
119
121
 
120
122
  ---
121
123
 
122
- ## 管理操作
124
+ ## Management
123
125
 
124
126
  ```sql
125
- -- 查看外部函数列表
127
+ -- List external functions
126
128
  SHOW EXTERNAL FUNCTIONS;
127
129
  SHOW EXTERNAL FUNCTIONS LIKE 'my_%';
128
130
 
129
- -- 删除外部函数
131
+ -- Drop an external function
130
132
  DROP FUNCTION IF EXISTS my_schema.my_udf;
131
133
  ```
132
-
133
- ---
134
-
135
- ## 内置 AI 函数(无需部署云函数)
136
-
137
- ### AI_COMPLETE(调用 LLM)
138
-
139
- ```sql
140
- -- 通过 API Connection 调用(需先创建连接)
141
- CREATE API CONNECTION conn_bailian
142
- TYPE ai_function
143
- PROVIDER = 'bailian'
144
- BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
145
- API_KEY = '<key>';
146
-
147
- -- 调用 LLM 生成文本
148
- SELECT AI_COMPLETE('connection:conn_bailian', '请用一句话总结:' || content) AS summary
149
- FROM articles
150
- LIMIT 10;
151
-
152
- -- 通过平台 Endpoint 调用(管理员预配置)
153
- SELECT AI_COMPLETE('endpoint:my_llm_endpoint', prompt_col) AS result
154
- FROM my_table;
155
- ```
156
-
157
- ### AI_EMBEDDING(文本向量化)
158
-
159
- ```sql
160
- -- 将文本转为向量(用于语义搜索)
161
- SELECT id, content,
162
- AI_EMBEDDING('connection:conn_bailian', content) AS embedding
163
- FROM documents;
164
-
165
- -- 结合向量索引做语义搜索
166
- SELECT id, content,
167
- cosine_distance(embedding, AI_EMBEDDING('connection:conn_bailian', '查询文本')) AS dist
168
- FROM doc_embeddings
169
- ORDER BY dist
170
- LIMIT 10;
171
- ```
@@ -1,30 +1,31 @@
1
1
  ---
2
2
  name: clickzetta-java-sdk
3
3
  description: |
4
- 使用 ClickZetta Java SDK 将数据批量或实时写入 Lakehouse 表。
5
- 覆盖 BulkloadStream(本地文件/数据库批量上传)和 RealtimeStream(Kafka 实时消费写入)
6
- 两种接口的完整使用模式,包括 Maven 依赖、连接 URL 格式、行写入 API、
7
- 状态监控、Options 调优和常见错误处理。
8
- 当用户说"Java SDK""BulkloadStream""RealtimeStream"
9
- "Java 写入 Lakehouse""Java 批量上传""Kafka Java 写入"
10
- "clickzetta-java""Maven 依赖""Java 数据导入"时触发。
4
+ Use the ClickZetta Java SDK to write data to Lakehouse tables in batch or in real time.
5
+ Covers complete usage patterns for BulkloadStream (local file/database batch uploads)
6
+ and RealtimeStream (Kafka real-time consumption and writes), including Maven dependencies,
7
+ connection URL formats, row write APIs, status monitoring, Options tuning, and common error handling.
8
+ Trigger when users say "Java SDK", "BulkloadStream", "RealtimeStream",
9
+ "write to Lakehouse with Java", "Java batch upload", "Kafka Java write",
10
+ "clickzetta-java", "Maven dependency", "Java data import",
11
+ "Java 写入 Lakehouse", "Java 批量上传", or "Kafka Java 写入".
11
12
  Keywords: Java SDK, BulkloadStream, RealtimeStream, Kafka consumer, batch write, real-time write
12
13
  ---
13
14
 
14
15
  # ClickZetta Java SDK
15
16
 
16
- Java SDK 提供两种写入接口:
17
- - **BulkloadStream** 批量写入,适合定时 ETL、本地文件导入(不支持主键表,不适合 5 分钟以内的高频写入)
18
- - **RealtimeStream** 实时写入,适合 Kafka 消费、流式数据接入(秒级可查)
17
+ The Java SDK provides two write interfaces:
18
+ - **BulkloadStream** - batch writes for scheduled ETL and local file imports. It does not support primary-key tables and is not recommended for high-frequency writes under 5 minutes.
19
+ - **RealtimeStream** - real-time writes for Kafka consumption and streaming ingestion. Data can be queried within seconds.
19
20
 
20
- 阅读 [references/bulkload.md](references/bulkload.md) 了解批量写入,[references/realtime.md](references/realtime.md) 了解实时写入。
21
+ Read [references/bulkload.md](references/bulkload.md) for batch writes and [references/realtime.md](references/realtime.md) for real-time writes.
21
22
 
22
23
  ---
23
24
 
24
- ## Maven 依赖
25
+ ## Maven Dependency
25
26
 
26
27
  ```xml
27
- <!-- clickzetta-java 最新版本见 https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java -->
28
+ <!-- See https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java for the latest clickzetta-java version. -->
28
29
  <dependency>
29
30
  <groupId>com.clickzetta</groupId>
30
31
  <artifactId>clickzetta-java</artifactId>
@@ -32,7 +33,7 @@ Java SDK 提供两种写入接口:
32
33
  </dependency>
33
34
  ```
34
35
 
35
- RealtimeStream + Kafka 还需要:
36
+ RealtimeStream with Kafka also requires:
36
37
 
37
38
  ```xml
38
39
  <dependency>
@@ -44,10 +45,10 @@ RealtimeStream + Kafka 还需要:
44
45
 
45
46
  ---
46
47
 
47
- ## 连接 URL 格式
48
+ ## Connection URL Format
48
49
 
49
50
  ```java
50
- // 推荐:显式参数方式(2.0.0+ 支持,不依赖 URL 解析)
51
+ // Recommended: explicit parameters. Supported in 2.0.0+ and does not depend on URL parsing.
51
52
  ClickZettaClient client = ClickZettaClient.newBuilder()
52
53
  .service("cn-shanghai-alicloud.api.clickzetta.com")
53
54
  .instance("your_instance")
@@ -58,7 +59,7 @@ ClickZettaClient client = ClickZettaClient.newBuilder()
58
59
  .vcluster("default")
59
60
  .build();
60
61
 
61
- // 兼容:URL 方式(BulkloadStream 用 virtualcluster=,RealtimeStream 用 vcluster=)
62
+ // Compatible URL-based mode. BulkloadStream uses virtualcluster=, while RealtimeStream uses vcluster=.
62
63
  String bulkUrl = MessageFormat.format(
63
64
  "jdbc:clickzetta://{0}.{1}/{2}?schema={3}&username={4}&password={5}&virtualcluster={6}",
64
65
  instance, region_endpoint, workspace, schema, username, password, vcluster
@@ -70,35 +71,35 @@ String rtUrl = MessageFormat.format(
70
71
  ClickZettaClient client = ClickZettaClient.newBuilder().url(url).build();
71
72
  ```
72
73
 
73
- JDBC 连接(DDL / 查询):
74
+ JDBC connection for DDL and queries:
74
75
 
75
76
  ```java
76
- // 2.0.0+ 驱动类:com.clickzetta.client.jdbc.ClickZettaDriver
77
- // 1.x 驱动类:com.clickzetta.jdbc.ClickZettaDriver
77
+ // Driver class for 2.0.0+: com.clickzetta.client.jdbc.ClickZettaDriver
78
+ // Driver class for 1.x: com.clickzetta.jdbc.ClickZettaDriver
78
79
  Class.forName("com.clickzetta.client.jdbc.ClickZettaDriver");
79
80
  Connection conn = DriverManager.getConnection(jdbcUrl);
80
81
  ```
81
82
 
82
83
  ---
83
84
 
84
- ## BulkloadStream 快速示例
85
+ ## BulkloadStream Quick Example
85
86
 
86
87
  ```java
87
- // 创建 BulkloadStream
88
+ // Create a BulkloadStream.
88
89
  BulkloadStream stream = client.newBulkloadStreamBuilder()
89
90
  .schema("public")
90
91
  .table("orders")
91
92
  .operate(RowStream.BulkLoadOperate.APPEND)
92
93
  .build();
93
94
 
94
- // 写入数据(列索引从 0 开始,顺序与建表 DDL 一致)
95
+ // Write data. Column indexes start at 0 and must match the table DDL order.
95
96
  Row row = stream.createRow();
96
97
  row.setValue(0, "order-001"); // STRING
97
98
  row.setValue(1, 1); // INT
98
99
  row.setValue(2, 299.99); // DOUBLE
99
- stream.apply(row); // ⚠️ 必须调用,否则数据不发送到服务端
100
+ stream.apply(row); // Required. Otherwise the row is not sent to the server.
100
101
 
101
- // 关闭并等待完成
102
+ // Close and wait for completion.
102
103
  stream.close();
103
104
  while (stream.getState() == StreamState.RUNNING) {
104
105
  Thread.sleep(1000);
@@ -111,15 +112,15 @@ client.close();
111
112
 
112
113
  ---
113
114
 
114
- ## RealtimeStream 快速示例
115
+ ## RealtimeStream Quick Example
115
116
 
116
117
  ```java
117
- // Options 调优
118
+ // Options tuning.
118
119
  Options options = Options.builder()
119
- .withMutationBufferLinesNum(10) // 缓冲行数
120
+ .withMutationBufferLinesNum(10) // Number of buffered rows.
120
121
  .build();
121
122
 
122
- // 创建 RealtimeStream(普通表,APPEND_ONLY)
123
+ // Create a RealtimeStream for a regular table in APPEND_ONLY mode.
123
124
  RealtimeStream stream = client.newRealtimeStreamBuilder()
124
125
  .operate(RowStream.RealTimeOperate.APPEND_ONLY)
125
126
  .options(options)
@@ -127,7 +128,7 @@ RealtimeStream stream = client.newRealtimeStreamBuilder()
127
128
  .table("events")
128
129
  .build();
129
130
 
130
- // 写入数据(用列名,不用索引)
131
+ // Write data by column name, not by index.
131
132
  Row row = stream.createRow(Stream.Operator.INSERT);
132
133
  row.setValue("id", 1);
133
134
  row.setValue("event", "{\"type\":\"click\"}");
@@ -135,26 +136,26 @@ stream.apply(row);
135
136
  stream.close();
136
137
  ```
137
138
 
138
- ## RealtimeStream CDC 示例(主键表 UPSERT / DELETE)
139
+ ## RealtimeStream CDC Example for Primary-Key Tables
139
140
 
140
141
  ```java
141
- // 建表:CREATE TABLE orders (txid STRING NOT NULL PRIMARY KEY, amount DOUBLE, status STRING);
142
+ // Table DDL: CREATE TABLE orders (txid STRING NOT NULL PRIMARY KEY, amount DOUBLE, status STRING);
142
143
 
143
144
  RealtimeStream stream = client.newRealtimeStreamBuilder()
144
- .operate(RowStream.RealTimeOperate.CDC) // 主键表必须用 CDC
145
+ .operate(RowStream.RealTimeOperate.CDC) // Primary-key tables must use CDC.
145
146
  .options(options)
146
147
  .schema("public")
147
148
  .table("orders")
148
149
  .build();
149
150
 
150
- // UPSERT:存在则更新,不存在则插入
151
+ // UPSERT: update an existing row or insert a new row.
151
152
  Row row = stream.createRow(Stream.Operator.UPSERT);
152
153
  row.setValue("txid", "order-001");
153
154
  row.setValue("amount", 299.99);
154
155
  row.setValue("status", "paid");
155
156
  stream.apply(row);
156
157
 
157
- // DELETE_IGNORE:删除,目标行不存在时自动忽略
158
+ // DELETE_IGNORE: delete the row and ignore the operation if the target row does not exist.
158
159
  Row del = stream.createRow(Stream.Operator.DELETE_IGNORE);
159
160
  del.setValue("txid", "order-001");
160
161
  stream.apply(del);
@@ -164,23 +165,23 @@ stream.close();
164
165
 
165
166
  ---
166
167
 
167
- ## 选择指南
168
+ ## Selection Guide
168
169
 
169
- | 场景 | 推荐接口 |
170
+ | Scenario | Recommended interface |
170
171
  |---|---|
171
- | 定时批量 ETL(每小时/每天) | BulkloadStream |
172
- | Kafka 实时消费 | RealtimeStream |
173
- | 5 分钟以内高频写入 | RealtimeStream |
174
- | 主键表写入(UPSERT / DELETE) | RealtimeStream CDC 模式 |
172
+ | Scheduled batch ETL, hourly or daily | BulkloadStream |
173
+ | Kafka real-time consumption | RealtimeStream |
174
+ | High-frequency writes under 5 minutes | RealtimeStream |
175
+ | Primary-key table writes with UPSERT or DELETE | RealtimeStream CDC mode |
175
176
 
176
177
  ---
177
178
 
178
- ## 使用限制
179
+ ## Usage Limits
179
180
 
180
- | 限制 | BulkloadStream | RealtimeStream |
181
+ | Limit | BulkloadStream | RealtimeStream |
181
182
  |---|---|---|
182
- | 主键表 | 不支持 | CDC 模式支持 |
183
- | 高频写入(< 5 分钟) | 不适合 | 支持 |
184
- | 数据可见延迟 | 写完 close() 后可见 | ~1 分钟后可见 |
185
- | Table Stream/Dynamic Table 可见 | close() 后 | ~1 分钟后 |
186
- | 表结构变更 | 重建 Stream | 停止任务,变更后约 90 分钟重启 |
183
+ | Primary-key tables | Not supported | Supported in CDC mode |
184
+ | High-frequency writes under 5 minutes | Not recommended | Supported |
185
+ | Data visibility latency | Visible after `close()` | Visible after about 1 minute |
186
+ | Table Stream/Dynamic Table visibility | After `close()` | After about 1 minute |
187
+ | Schema changes | Recreate the stream | Stop the task and restart about 90 minutes after the schema change |
@@ -1,12 +1,12 @@
1
- {"case_id":"001","type":"should_call","user_input":"用 Java SDK BulkloadStream 批量写入数据到 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
2
- {"case_id":"002","type":"should_call","user_input":"Java 怎么消费 Kafka 实时写入 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["RealtimeStream"]}
3
- {"case_id":"003","type":"should_call","user_input":"clickzetta-java Maven 依赖怎么配","expected_skill":"clickzetta-java-sdk","expected_output_contains":["groupId","clickzetta-java"]}
4
- {"case_id":"004","type":"should_call","user_input":"BulkloadStream 和 RealtimeStream 有什么区别","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream","RealtimeStream"]}
5
- {"case_id":"005","type":"should_call","user_input":"Java SDK 连接 URL 格式是什么","expected_skill":"clickzetta-java-sdk","expected_output_contains":["URL"]}
6
- {"case_id":"006","type":"should_call","user_input":"Java 批量上传本地文件到 Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
7
- {"case_id":"007","type":"should_call","user_input":"RealtimeStream setValue 怎么用","expected_skill":"clickzetta-java-sdk","expected_output_contains":["setValue"]}
8
- {"case_id":"008","type":"should_not_call","user_input":"Python SDK 怎么连接 Lakehouse","forbidden_skill":"clickzetta-java-sdk"}
9
- {"case_id":"009","type":"should_not_call","user_input":"帮我写一个 Spring Boot 应用","forbidden_skill":"clickzetta-java-sdk"}
10
- {"case_id":"010","type":"should_not_call","user_input":"Flink 怎么写入 Lakehouse","forbidden_skill":"clickzetta-java-sdk"}
11
- {"case_id":"011","type":"should_not_call","user_input":"怎么创建 VCluster","forbidden_skill":"clickzetta-java-sdk"}
12
- {"case_id":"012","type":"should_not_call","user_input":"MySQL JDBC 连接怎么配置","forbidden_skill":"clickzetta-java-sdk"}
1
+ {"case_id":"001","type":"should_call","user_input":"Use Java SDK BulkloadStream to batch write data to Lakehouse","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
2
+ {"case_id":"002","type":"should_call","user_input":"How can Java consume Kafka and write to Lakehouse in real time","expected_skill":"clickzetta-java-sdk","expected_output_contains":["RealtimeStream"]}
3
+ {"case_id":"003","type":"should_call","user_input":"How do I configure the Maven dependency for clickzetta-java","expected_skill":"clickzetta-java-sdk","expected_output_contains":["groupId","clickzetta-java"]}
4
+ {"case_id":"004","type":"should_call","user_input":"What is the difference between BulkloadStream and RealtimeStream","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream","RealtimeStream"]}
5
+ {"case_id":"005","type":"should_call","user_input":"What is the Java SDK connection URL format","expected_skill":"clickzetta-java-sdk","expected_output_contains":["URL"]}
6
+ {"case_id":"006","type":"should_call","user_input":"Batch upload a local file to Lakehouse with Java","expected_skill":"clickzetta-java-sdk","expected_output_contains":["BulkloadStream"]}
7
+ {"case_id":"007","type":"should_call","user_input":"How do I use setValue with RealtimeStream","expected_skill":"clickzetta-java-sdk","expected_output_contains":["setValue"]}
8
+ {"case_id":"008","type":"should_not_call","user_input":"How do I connect to Lakehouse with the Python SDK","forbidden_skill":"clickzetta-java-sdk"}
9
+ {"case_id":"009","type":"should_not_call","user_input":"Help me write a Spring Boot application","forbidden_skill":"clickzetta-java-sdk"}
10
+ {"case_id":"010","type":"should_not_call","user_input":"How does Flink write to Lakehouse","forbidden_skill":"clickzetta-java-sdk"}
11
+ {"case_id":"011","type":"should_not_call","user_input":"How do I create a VCluster","forbidden_skill":"clickzetta-java-sdk"}
12
+ {"case_id":"012","type":"should_not_call","user_input":"How do I configure a MySQL JDBC connection","forbidden_skill":"clickzetta-java-sdk"}
@@ -1,12 +1,12 @@
1
- # BulkloadStream 详细参考
1
+ # BulkloadStream Detailed Reference
2
2
 
3
- > 适合:定时 ETL、本地文件导入、数据库迁移
4
- > 不适合:主键表、5 分钟以内高频写入
3
+ > Best for: scheduled ETL, local file imports, and database migration.
4
+ > Not for: primary-key tables or high-frequency writes under 5 minutes.
5
5
 
6
- ## Maven 依赖
6
+ ## Maven Dependency
7
7
 
8
8
  ```xml
9
- <!-- 最新版本见 https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java -->
9
+ <!-- See https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java for the latest version. -->
10
10
  <dependency>
11
11
  <groupId>com.clickzetta</groupId>
12
12
  <artifactId>clickzetta-java</artifactId>
@@ -14,17 +14,17 @@
14
14
  </dependency>
15
15
  ```
16
16
 
17
- 最新版本见 [Maven Central](https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java)
17
+ See [Maven Central](https://central.sonatype.com/artifact/com.clickzetta/clickzetta-java) for the latest version.
18
18
 
19
- ## 使用限制
19
+ ## Usage Limits
20
20
 
21
- - **不支持主键(pk)表写入**
22
- - **不适合时间间隔小于 5 分钟的高频写入**
23
- - 写入完成 `close()` 后数据才可见
21
+ - **Primary-key table writes are not supported.**
22
+ - **High-frequency writes at intervals shorter than 5 minutes are not recommended.**
23
+ - Data becomes visible only after writing is complete and `close()` has been called.
24
24
 
25
- ## 完整示例:读取本地 CSV 写入 Lakehouse
25
+ ## Complete Example: Read a Local CSV and Write to Lakehouse
26
26
 
27
- ### 建表
27
+ ### Create the Table
28
28
 
29
29
  ```sql
30
30
  CREATE TABLE bulk_order_items (
@@ -38,7 +38,7 @@ CREATE TABLE bulk_order_items (
38
38
  );
39
39
  ```
40
40
 
41
- ### Java 代码(BulkloadFile 类)
41
+ ### Java Code: BulkloadFile Class
42
42
 
43
43
  ```java
44
44
  import com.clickzetta.client.BulkloadStream;
@@ -66,12 +66,12 @@ public class BulkloadFile {
66
66
  initialize();
67
67
  File csvFile = new File("olist_order_items_dataset.csv");
68
68
  BufferedReader reader = new BufferedReader(new FileReader(csvFile));
69
- reader.readLine(); // 跳过 header
69
+ reader.readLine(); // Skip the header row.
70
70
 
71
71
  String line;
72
72
  while ((line = reader.readLine()) != null) {
73
73
  String[] values = line.split(",");
74
- // 类型转换必须与建表 DDL 一致
74
+ // Type conversion must match the table DDL.
75
75
  String orderId = values[0];
76
76
  int orderItemId = Integer.parseInt(values[1]);
77
77
  String productId = values[2];
@@ -81,7 +81,7 @@ public class BulkloadFile {
81
81
  double freightValue = Double.parseDouble(values[6]);
82
82
 
83
83
  Row row = bulkloadStream.createRow();
84
- // ⚠️ BulkloadStream 用列索引(从 0 开始),顺序与建表 DDL 一致
84
+ // BulkloadStream uses column indexes starting at 0. The order must match the table DDL.
85
85
  row.setValue(0, orderId);
86
86
  row.setValue(1, orderItemId);
87
87
  row.setValue(2, productId);
@@ -89,7 +89,7 @@ public class BulkloadFile {
89
89
  row.setValue(4, shippingLimitDate);
90
90
  row.setValue(5, price);
91
91
  row.setValue(6, freightValue);
92
- // ⚠️ 必须调用 apply(),否则数据不发送到服务端
92
+ // apply() is required. Otherwise the row is not sent to the server.
93
93
  bulkloadStream.apply(row);
94
94
  }
95
95
 
@@ -101,7 +101,7 @@ public class BulkloadFile {
101
101
  }
102
102
 
103
103
  private static void initialize() throws Exception {
104
- // 推荐:显式参数方式(2.0.0+ 支持)
104
+ // Recommended: explicit parameters. Supported in 2.0.0+.
105
105
  client = ClickZettaClient.newBuilder()
106
106
  .service("cn-shanghai-alicloud.api.clickzetta.com")
107
107
  .instance("your_instance")
@@ -129,20 +129,20 @@ public class BulkloadFile {
129
129
  }
130
130
  ```
131
131
 
132
- ## 关键 API
132
+ ## Key APIs
133
133
 
134
- | API | 说明 |
134
+ | API | Description |
135
135
  |---|---|
136
- | `bulkloadStream.createRow()` | 创建行对象(无参数) |
137
- | `row.setValue(int index, Object value)` | 按列索引设值(从 0 开始) |
138
- | `bulkloadStream.apply(row)` | 发送行到服务端(必须调用) |
139
- | `bulkloadStream.close()` | 关闭并触发提交 |
140
- | `bulkloadStream.getState()` | 获取状态:RUNNING / SUCCEEDED / FAILED |
141
- | `bulkloadStream.getErrorMessage()` | 获取失败原因 |
136
+ | `bulkloadStream.createRow()` | Create a row object without arguments. |
137
+ | `row.setValue(int index, Object value)` | Set a value by column index, starting at 0. |
138
+ | `bulkloadStream.apply(row)` | Send the row to the server. This call is required. |
139
+ | `bulkloadStream.close()` | Close the stream and trigger the commit. |
140
+ | `bulkloadStream.getState()` | Get the state: RUNNING, SUCCEEDED, or FAILED. |
141
+ | `bulkloadStream.getErrorMessage()` | Get the failure reason. |
142
142
 
143
- ## 类型映射
143
+ ## Type Mapping
144
144
 
145
- | Java 类型 | Lakehouse 类型 |
145
+ | Java type | Lakehouse type |
146
146
  |---|---|
147
147
  | `Long` / `long` | BIGINT |
148
148
  | `Integer` / `int` | INT |
@@ -153,11 +153,11 @@ public class BulkloadFile {
153
153
  | `java.sql.Date` | DATE |
154
154
  | `BigDecimal` | DECIMAL |
155
155
 
156
- ## 常见问题
156
+ ## FAQ
157
157
 
158
- | 问题 | 原因 | 解决方案 |
158
+ | Issue | Cause | Solution |
159
159
  |---|---|---|
160
- | 数据写入后查不到 | 未调用 `apply()` 或未等待 RUNNING 结束 | 确认每行都调用 `apply()`,等待状态变为 SUCCEEDED |
161
- | 主键表写入报错 | BulkloadStream 不支持主键表 | 改用 JDBC + MERGE 或 Flink igs-dynamic-table |
162
- | 列值类型不匹配 | Java 类型与建表 DDL 不一致 | 写入前做类型转换(parseInt、parseDouble 等) |
163
- | 连接失败 | URL 参数名错误 | BulkloadStream 用 `virtualcluster=`,不是 `vcluster=` |
160
+ | Data cannot be queried after writing | `apply()` was not called or the RUNNING state has not finished | Call `apply()` for every row and wait until the state becomes SUCCEEDED. |
161
+ | Primary-key table write fails | BulkloadStream does not support primary-key tables | Use JDBC with MERGE or Flink `igs-dynamic-table` instead. |
162
+ | Column value type mismatch | Java types do not match the table DDL | Convert values before writing, for example with `parseInt` or `parseDouble`. |
163
+ | Connection fails | Wrong URL parameter name | BulkloadStream uses `virtualcluster=`, not `vcluster=`. |