@clickzetta/cz-cli-darwin-x64 0.3.39 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +96 -11
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +38 -20
- package/bin/skills/clickzetta-metadata/SKILL.md +51 -32
- package/bin/skills/clickzetta-monitoring/SKILL.md +18 -2
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +2 -2
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +63 -38
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +63 -16
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/bin/skills/cz-cli-inner/SKILL.md +5 -4
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-semantic-view
|
|
3
|
+
description: |
|
|
4
|
+
创建和查询 ClickZetta Lakehouse 语义视图(Semantic View)。语义视图是架构级逻辑
|
|
5
|
+
数据模型对象,通过声明逻辑表、维度、指标、过滤器,将复杂的多表 JOIN 和聚合逻辑封装
|
|
6
|
+
为业务友好的语义层,使用 semantic_view() 函数查询,无需手写 JOIN。
|
|
7
|
+
当前为邀测功能(1.3 版本起)。
|
|
8
|
+
当用户说"创建语义视图"、"semantic view"、"语义层"、"定义指标"、"定义维度"、
|
|
9
|
+
"semantic_view() 怎么用"、"统一指标口径"、"业务语义模型"、"逻辑表"、
|
|
10
|
+
"DIMENSIONS"、"METRICS"、"FILTERS"、"DROP SEMANTIC VIEW"、
|
|
11
|
+
"SHOW SEMANTIC VIEWS"时触发。
|
|
12
|
+
Keywords: semantic view, dimension, metric, logical model, unified metrics, semantic layer
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# ClickZetta 语义视图(Semantic View)
|
|
16
|
+
|
|
17
|
+
阅读 [references/semantic-view-reference.md](references/semantic-view-reference.md) 了解完整语法。
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 概述
|
|
22
|
+
|
|
23
|
+
语义视图是 ClickZetta Lakehouse 的**架构级逻辑数据模型对象**,解决两类核心问题:
|
|
24
|
+
|
|
25
|
+
- **数据分析**:统一维度和指标定义,业务用户无需编写复杂 JOIN 即可查询跨表数据
|
|
26
|
+
- **数据治理**:集中管理表关系、维度、指标定义,确保全组织使用相同数据口径
|
|
27
|
+
|
|
28
|
+
> ⚠️ 当前为**邀测功能**(1.3 版本起),需联系技术支持开通。
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 四大组件
|
|
33
|
+
|
|
34
|
+
| 组件 | 关键字 | 说明 |
|
|
35
|
+
|---|---|---|
|
|
36
|
+
| 逻辑表 | `TABLES` | 映射物理表,声明主键和外键关系,引擎自动处理 JOIN |
|
|
37
|
+
| 维度 | `DIMENSIONS` | 分类属性(谁/什么/哪里/何时),支持计算维度 |
|
|
38
|
+
| 指标 | `METRICS` | 聚合度量(SUM/AVG/COUNT/MIN/MAX),业务 KPI |
|
|
39
|
+
| 过滤器 | `FILTERS` | 预定义可重用过滤条件(语义注解,不可直接传入查询) |
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## 创建语义视图
|
|
44
|
+
|
|
45
|
+
```sql
|
|
46
|
+
CREATE SEMANTIC VIEW <视图名>
|
|
47
|
+
TABLES (
|
|
48
|
+
<表别名> AS <schema>.<物理表>
|
|
49
|
+
PRIMARY KEY (<列名> [ , ... ])
|
|
50
|
+
[ FOREIGN KEY (<列名>) REFERENCES <其他表别名> ]
|
|
51
|
+
[ WITH SYNONYMS ('<同义词>' [ , ... ]) ]
|
|
52
|
+
[ COMMENT = '<说明>' ]
|
|
53
|
+
[ , ... ]
|
|
54
|
+
)
|
|
55
|
+
[ FILTERS (
|
|
56
|
+
<表别名>.<过滤器名> AS <布尔表达式>
|
|
57
|
+
[ , ... ]
|
|
58
|
+
) ]
|
|
59
|
+
DIMENSIONS (
|
|
60
|
+
{ <表别名>.<维度名> | <维度名> } AS <表达式>
|
|
61
|
+
[ WITH SYNONYMS = ('<同义词>' [ , ... ]) ]
|
|
62
|
+
[ is_unique = { true | false } ]
|
|
63
|
+
[ is_time = { true | false } ]
|
|
64
|
+
[ enum_values = [ <值1>, <值2>, ... ] ]
|
|
65
|
+
[ COMMENT = '<说明>' ]
|
|
66
|
+
[ , ... ]
|
|
67
|
+
)
|
|
68
|
+
METRICS (
|
|
69
|
+
<表别名>.<指标名> AS <聚合表达式>
|
|
70
|
+
[ COMMENT = '<说明>' ]
|
|
71
|
+
[ , ... ]
|
|
72
|
+
)
|
|
73
|
+
[ COMMENT = '<视图说明>' ];
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 完整示例(TPC-H 收入分析)
|
|
77
|
+
|
|
78
|
+
```sql
|
|
79
|
+
DROP SEMANTIC VIEW IF EXISTS tpch_rev_analysis;
|
|
80
|
+
CREATE SEMANTIC VIEW tpch_rev_analysis
|
|
81
|
+
TABLES (
|
|
82
|
+
customers AS tpch.customer
|
|
83
|
+
PRIMARY KEY (c_custkey)
|
|
84
|
+
COMMENT = '客户主表',
|
|
85
|
+
orders AS tpch.orders
|
|
86
|
+
PRIMARY KEY (o_orderkey)
|
|
87
|
+
FOREIGN KEY (o_custkey) REFERENCES customers
|
|
88
|
+
WITH SYNONYMS ('销售订单')
|
|
89
|
+
COMMENT = '订单表',
|
|
90
|
+
line_items AS tpch.lineitem
|
|
91
|
+
PRIMARY KEY (l_orderkey, l_linenumber)
|
|
92
|
+
FOREIGN KEY (l_orderkey) REFERENCES orders
|
|
93
|
+
COMMENT = '订单明细'
|
|
94
|
+
)
|
|
95
|
+
FILTERS (
|
|
96
|
+
customers.is_building AS customers.c_mktsegment = 'BUILDING'
|
|
97
|
+
)
|
|
98
|
+
DIMENSIONS (
|
|
99
|
+
customers.customer_name AS c_name
|
|
100
|
+
WITH SYNONYMS = ('客户名称', 'customer name')
|
|
101
|
+
is_unique = true
|
|
102
|
+
COMMENT = '客户名称',
|
|
103
|
+
orders.order_date AS o_orderdate
|
|
104
|
+
is_time = true
|
|
105
|
+
COMMENT = '下单日期',
|
|
106
|
+
orders.order_year AS YEAR(o_orderdate)
|
|
107
|
+
COMMENT = '下单年份',
|
|
108
|
+
orders.order_status AS o_orderstatus
|
|
109
|
+
enum_values = ['O', 'F', 'P']
|
|
110
|
+
COMMENT = '订单状态'
|
|
111
|
+
)
|
|
112
|
+
METRICS (
|
|
113
|
+
customers.customer_count AS COUNT(c_custkey)
|
|
114
|
+
COMMENT = '客户总数',
|
|
115
|
+
orders.avg_order_value AS AVG(o_totalprice)
|
|
116
|
+
COMMENT = '平均订单金额',
|
|
117
|
+
orders.total_revenue AS SUM(o_totalprice)
|
|
118
|
+
COMMENT = '总收入'
|
|
119
|
+
)
|
|
120
|
+
COMMENT = '收入分析语义视图';
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 查询语义视图
|
|
126
|
+
|
|
127
|
+
使用 `semantic_view()` 表函数,**无需手写 JOIN 和 GROUP BY**:
|
|
128
|
+
|
|
129
|
+
```sql
|
|
130
|
+
-- 基础查询:按订单日期统计平均订单金额
|
|
131
|
+
SELECT * FROM semantic_view(
|
|
132
|
+
tpch_rev_analysis,
|
|
133
|
+
DIMENSIONS orders.order_date,
|
|
134
|
+
METRICS orders.avg_order_value
|
|
135
|
+
);
|
|
136
|
+
|
|
137
|
+
-- 多维度查询:按日期和客户名称
|
|
138
|
+
SELECT * FROM semantic_view(
|
|
139
|
+
tpch_rev_analysis,
|
|
140
|
+
DIMENSIONS orders.order_date,
|
|
141
|
+
DIMENSIONS customers.customer_name,
|
|
142
|
+
METRICS orders.avg_order_value
|
|
143
|
+
);
|
|
144
|
+
|
|
145
|
+
-- 使用短名称(名称唯一时可省略表别名前缀)
|
|
146
|
+
SELECT * FROM semantic_view(
|
|
147
|
+
tpch_rev_analysis,
|
|
148
|
+
DIMENSIONS order_date,
|
|
149
|
+
DIMENSIONS customer_name,
|
|
150
|
+
METRICS avg_order_value
|
|
151
|
+
);
|
|
152
|
+
|
|
153
|
+
-- 加 WHERE 过滤(需将过滤列定义为 DIMENSION)
|
|
154
|
+
SELECT * FROM semantic_view(
|
|
155
|
+
tpch_rev_analysis,
|
|
156
|
+
DIMENSIONS customers.customer_name,
|
|
157
|
+
DIMENSIONS orders.order_status,
|
|
158
|
+
METRICS orders.total_revenue
|
|
159
|
+
) WHERE order_status = 'O';
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### 与传统 SQL 对比
|
|
163
|
+
|
|
164
|
+
```sql
|
|
165
|
+
-- 传统 SQL(需手写 JOIN + GROUP BY)
|
|
166
|
+
SELECT o.o_orderdate, c.c_name, AVG(o.o_totalprice)
|
|
167
|
+
FROM tpch.orders o
|
|
168
|
+
JOIN tpch.customer c ON o.o_custkey = c.c_custkey
|
|
169
|
+
GROUP BY o.o_orderdate, c.c_name;
|
|
170
|
+
|
|
171
|
+
-- 语义视图(自动处理 JOIN 和聚合)
|
|
172
|
+
SELECT * FROM semantic_view(
|
|
173
|
+
tpch_rev_analysis,
|
|
174
|
+
DIMENSIONS order_date,
|
|
175
|
+
DIMENSIONS customer_name,
|
|
176
|
+
METRICS avg_order_value
|
|
177
|
+
);
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## 管理命令
|
|
183
|
+
|
|
184
|
+
```sql
|
|
185
|
+
-- 删除(推荐先删再建,确保幂等)
|
|
186
|
+
DROP SEMANTIC VIEW IF EXISTS tpch_rev_analysis;
|
|
187
|
+
|
|
188
|
+
-- 列出当前 Schema 的所有语义视图
|
|
189
|
+
SHOW SEMANTIC VIEWS;
|
|
190
|
+
SHOW SEMANTIC VIEWS IN my_schema;
|
|
191
|
+
|
|
192
|
+
-- 查看详细定义(逻辑表、维度、指标、外键)
|
|
193
|
+
DESC EXTENDED tpch_rev_analysis;
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## 注意事项
|
|
199
|
+
|
|
200
|
+
1. **TABLES 定义顺序**:被外键引用的表必须先定义(如 `customers` 必须在 `orders` 之前)
|
|
201
|
+
2. **FILTERS 是语义注解**:`FILTERS` 中的命名过滤器不能作为 `semantic_view()` 的参数,WHERE 子句只能引用 `DIMENSIONS` 中定义的列名(短名),不能用物理列名
|
|
202
|
+
3. **WHERE 只能用 DIMENSION 短名**:`WHERE customer_name = 'Alice'` ✅,`WHERE c_name = 'Alice'` ❌
|
|
203
|
+
4. **短名称 vs 限定名称**:名称在视图内唯一时可用短名称,有冲突时必须用 `表别名.名称`
|
|
204
|
+
5. **幂等创建**:始终先 `DROP SEMANTIC VIEW IF EXISTS` 再创建,避免重复执行报错
|
|
205
|
+
6. **计算维度**:DIMENSIONS 支持表达式,如 `YEAR(o_orderdate)` 从下单日期提取年份(若物理列为字符串类型,可先 `CAST(<列名> AS DATE)` 再提取)
|
|
206
|
+
7. **指标聚合函数**:仅支持 `COUNT`、`AVG`、`SUM`、`MIN`、`MAX`
|
|
207
|
+
8. **DIMENSIONS 和 METRICS 可单独使用**:可以只查 METRICS(全局聚合),也可以只查 DIMENSIONS(去重列表)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么创建语义视图","expected_skill":"clickzetta-semantic-view","expected_output_contains":["CREATE SEMANTIC VIEW"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"semantic_view() 函数怎么查询","expected_skill":"clickzetta-semantic-view","expected_output_contains":["semantic_view"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"语义视图里怎么定义维度和指标","expected_skill":"clickzetta-semantic-view","expected_output_contains":["DIMENSIONS","METRICS"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"怎么用语义层统一指标口径","expected_skill":"clickzetta-semantic-view","expected_output_contains":["语义"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"SHOW SEMANTIC VIEWS 怎么用","expected_skill":"clickzetta-semantic-view","expected_output_contains":["SHOW SEMANTIC VIEWS"]}
|
|
6
|
+
{"case_id":"006","type":"should_call","user_input":"语义视图的 FILTERS 怎么定义","expected_skill":"clickzetta-semantic-view","expected_output_contains":["FILTERS"]}
|
|
7
|
+
{"case_id":"007","type":"should_call","user_input":"DROP SEMANTIC VIEW 怎么删除","expected_skill":"clickzetta-semantic-view","expected_output_contains":["DROP SEMANTIC VIEW"]}
|
|
8
|
+
{"case_id":"008","type":"should_not_call","user_input":"怎么创建普通视图","forbidden_skill":"clickzetta-semantic-view"}
|
|
9
|
+
{"case_id":"009","type":"should_not_call","user_input":"帮我写一个数据看板","forbidden_skill":"clickzetta-semantic-view"}
|
|
10
|
+
{"case_id":"010","type":"should_not_call","user_input":"怎么做数据建模","forbidden_skill":"clickzetta-semantic-view"}
|
|
11
|
+
{"case_id":"011","type":"should_not_call","user_input":"Superset 怎么连接","forbidden_skill":"clickzetta-semantic-view"}
|
|
12
|
+
{"case_id":"012","type":"should_not_call","user_input":"怎么创建物化视图","forbidden_skill":"clickzetta-semantic-view"}
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# 语义视图完整语法参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/semantic_view
|
|
4
|
+
> 功能状态:邀测(1.3 版本起)
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## CREATE SEMANTIC VIEW 完整语法
|
|
9
|
+
|
|
10
|
+
```sql
|
|
11
|
+
CREATE SEMANTIC VIEW <视图名称>
|
|
12
|
+
TABLES (
|
|
13
|
+
<逻辑表定义> [ , ... ]
|
|
14
|
+
)
|
|
15
|
+
[ FILTERS (
|
|
16
|
+
<过滤器定义> [ , ... ]
|
|
17
|
+
) ]
|
|
18
|
+
DIMENSIONS (
|
|
19
|
+
<维度定义> [ , ... ]
|
|
20
|
+
)
|
|
21
|
+
METRICS (
|
|
22
|
+
<指标定义> [ , ... ]
|
|
23
|
+
)
|
|
24
|
+
[ COMMENT = '<视图说明>' ];
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**约束**:`DIMENSIONS` 和 `METRICS` 至少包含其中一个,即两个子句不能同时省略。
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 逻辑表定义语法
|
|
32
|
+
|
|
33
|
+
```sql
|
|
34
|
+
<表别名> AS <schema>.<物理表名>
|
|
35
|
+
PRIMARY KEY ( <列名> [ , ... ] )
|
|
36
|
+
[ FOREIGN KEY ( <列名> ) REFERENCES <其他逻辑表别名> ]
|
|
37
|
+
[ WITH SYNONYMS ( '<同义词>' [ , ... ] ) ]
|
|
38
|
+
[ COMMENT = '<说明>' ]
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
| 参数 | 说明 |
|
|
42
|
+
|---|---|
|
|
43
|
+
| `<表别名> AS <schema>.<物理表>` | 为物理表指定逻辑别名,后续维度/指标/外键均用此别名引用 |
|
|
44
|
+
| `PRIMARY KEY` | 主键列,用于确定表间关系类型(一对多/一对一) |
|
|
45
|
+
| `FOREIGN KEY ... REFERENCES` | 外键关系,引擎据此自动处理 JOIN;引用目标必须是逻辑表别名 |
|
|
46
|
+
| `WITH SYNONYMS` | 逻辑表同义词,增强可发现性 |
|
|
47
|
+
|
|
48
|
+
**注意**:被外键引用的表必须在 TABLES 子句中先定义。
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## 过滤器定义语法
|
|
53
|
+
|
|
54
|
+
```sql
|
|
55
|
+
<逻辑表别名>.<过滤器名> AS <布尔表达式>
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
示例:
|
|
59
|
+
```sql
|
|
60
|
+
FILTERS (
|
|
61
|
+
customers.is_building AS customers.c_mktsegment = 'BUILDING',
|
|
62
|
+
orders.is_open AS orders.o_orderstatus = 'O'
|
|
63
|
+
)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**重要**:FILTERS 是面向 AI/元数据层的语义注解,**不能**作为 `semantic_view()` 函数参数直接传入。若要在查询中过滤,需将对应列定义为 DIMENSION,再用外层 WHERE 子句。
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 维度定义语法
|
|
71
|
+
|
|
72
|
+
```sql
|
|
73
|
+
{ <逻辑表别名>.<维度名> | <维度名> } AS <表达式>
|
|
74
|
+
[ WITH SYNONYMS = ( '<同义词>' [ , ... ] ) ]
|
|
75
|
+
[ is_unique = { true | false } ]
|
|
76
|
+
[ is_time = { true | false } ]
|
|
77
|
+
[ enum_values = [ <值1>, <值2>, ... ] ]
|
|
78
|
+
[ COMMENT = '<说明>' ]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
| 参数 | 说明 |
|
|
82
|
+
|---|---|
|
|
83
|
+
| `AS <表达式>` | 可以是列名,也可以是计算表达式(如 `YEAR(o_orderdate)`) |
|
|
84
|
+
| `WITH SYNONYMS` | 维度同义词,用户可用不同业务术语引用同一维度 |
|
|
85
|
+
| `is_unique = true` | 标识该维度值唯一(如客户名称),帮助引擎优化 |
|
|
86
|
+
| `is_time = true` | 标识为时间类型维度(如订单日期) |
|
|
87
|
+
| `enum_values` | 限定允许的枚举值,提升查询准确性 |
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 指标定义语法
|
|
92
|
+
|
|
93
|
+
```sql
|
|
94
|
+
<逻辑表别名>.<指标名> AS <聚合表达式>
|
|
95
|
+
[ COMMENT = '<说明>' ]
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
支持的聚合函数:`COUNT`、`AVG`、`SUM`、`MIN`、`MAX`
|
|
99
|
+
|
|
100
|
+
示例:
|
|
101
|
+
```sql
|
|
102
|
+
METRICS (
|
|
103
|
+
orders.total_revenue AS SUM(o_totalprice)
|
|
104
|
+
COMMENT = '总收入',
|
|
105
|
+
orders.avg_order_value AS AVG(o_totalprice)
|
|
106
|
+
COMMENT = '平均订单金额',
|
|
107
|
+
customers.customer_count AS COUNT(c_custkey)
|
|
108
|
+
COMMENT = '客户总数'
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## semantic_view() 查询函数语法
|
|
115
|
+
|
|
116
|
+
```sql
|
|
117
|
+
SELECT *
|
|
118
|
+
FROM semantic_view(
|
|
119
|
+
<视图名称>,
|
|
120
|
+
DIMENSIONS <维度名> [ , DIMENSIONS <维度名> ... ],
|
|
121
|
+
METRICS <指标名> [ , METRICS <指标名> ... ]
|
|
122
|
+
)
|
|
123
|
+
[ WHERE <过滤条件> ];
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
- 维度名可用限定名(`表别名.维度名`)或短名(名称唯一时)
|
|
127
|
+
- 结果自动按指定维度分组,无需写 GROUP BY;`DIMENSIONS` 与 `METRICS` 也可只指定其一(只查 METRICS 为全局聚合,只查 DIMENSIONS 为去重列表)
|
|
128
|
+
- WHERE 子句中的列名使用短名(不含表别名前缀)
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 管理命令
|
|
133
|
+
|
|
134
|
+
| 命令 | 说明 |
|
|
135
|
+
|---|---|
|
|
136
|
+
| `CREATE SEMANTIC VIEW` | 创建语义视图 |
|
|
137
|
+
| `DROP SEMANTIC VIEW IF EXISTS <名称>` | 删除语义视图 |
|
|
138
|
+
| `SHOW SEMANTIC VIEWS` | 列出当前 Schema 所有语义视图 |
|
|
139
|
+
| `SHOW SEMANTIC VIEWS IN <schema>` | 列出指定 Schema 的语义视图 |
|
|
140
|
+
| `DESC EXTENDED <名称>` | 查看详细定义(逻辑表/维度/指标/外键/索引) |
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## 最佳实践
|
|
145
|
+
|
|
146
|
+
```sql
|
|
147
|
+
-- 1. 幂等创建(始终先删再建)
|
|
148
|
+
DROP SEMANTIC VIEW IF EXISTS my_view;
|
|
149
|
+
CREATE SEMANTIC VIEW my_view ...;
|
|
150
|
+
|
|
151
|
+
-- 2. 使用有意义的业务术语命名
|
|
152
|
+
-- 好:customer_name, total_revenue, order_date
|
|
153
|
+
-- 差:c_name, sum_totalprice, o_orderdate
|
|
154
|
+
|
|
155
|
+
-- 3. 合理设置维度元数据
|
|
156
|
+
-- is_time=true 用于日期/时间维度
|
|
157
|
+
-- is_unique=true 用于主键类维度(如客户ID、订单号)
|
|
158
|
+
-- enum_values 用于状态类维度(如订单状态)
|
|
159
|
+
|
|
160
|
+
-- 4. 计算维度示例
|
|
161
|
+
DIMENSIONS (
|
|
162
|
+
orders.order_year AS YEAR(o_orderdate) -- 从日期提取年份
|
|
163
|
+
COMMENT = '下单年份',
|
|
164
|
+
orders.order_month AS MONTH(o_orderdate) -- 从日期提取月份
|
|
165
|
+
COMMENT = '下单月份'
|
|
166
|
+
)
|
|
167
|
+
```
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-spark-flink-connector
|
|
3
|
+
description: |
|
|
4
|
+
使用 Spark Connector 或 Flink Write Connector 将数据写入 ClickZetta Lakehouse。
|
|
5
|
+
覆盖 Spark DataFrame 读写配置(Maven 依赖、连接参数、read/write 代码)、
|
|
6
|
+
Flink Table API 写入(CDC 模式 igs-dynamic-table、仅追加模式 igs-dynamic-table-append-only)、
|
|
7
|
+
checkpoint 配置、buffer/flush 调优,以及主键表限制等关键约束。
|
|
8
|
+
当用户说"Spark Connector"、"Flink Connector"、"Spark 写入 Lakehouse"、
|
|
9
|
+
"Flink 写入 Lakehouse"、"spark-clickzetta"、"igs-flink-connector"、
|
|
10
|
+
"Spark DataFrame 写入"、"Flink CDC 写入"、"Flink sink"、
|
|
11
|
+
"spark.read.format clickzetta"时触发。
|
|
12
|
+
Keywords: Spark, Flink, DataFrame, connector, read, write, CDC, igs-dynamic-table
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
# ClickZetta Spark & Flink Connector
|
|
16
|
+
|
|
17
|
+
阅读 [references/spark.md](references/spark.md) 了解 Spark Connector,[references/flink.md](references/flink.md) 了解 Flink Write Connector。
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 关键约束(必读)
|
|
22
|
+
|
|
23
|
+
| 约束 | Spark Connector | Flink Connector |
|
|
24
|
+
|---|---|---|
|
|
25
|
+
| 主键表写入 | ❌ 不支持 | ✅ 支持(igs-dynamic-table 模式) |
|
|
26
|
+
| 部分字段写入 | ❌ 必须写全部字段 | ✅ 支持 |
|
|
27
|
+
| CDC(UPDATE/DELETE) | ❌ 仅 append | ✅ igs-dynamic-table 模式支持 |
|
|
28
|
+
| Spark 版本 | 3.4.0+ | — |
|
|
29
|
+
| Flink 版本 | — | 1.15.2+ |
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Spark Connector 快速示例
|
|
34
|
+
|
|
35
|
+
```scala
|
|
36
|
+
// 写入
|
|
37
|
+
df.write.format("clickzetta")
|
|
38
|
+
.option("endpoint", "your_instance.cn-shanghai-alicloud.api.clickzetta.com")
|
|
39
|
+
.option("username", sys.env("CZ_USERNAME"))
|
|
40
|
+
.option("password", sys.env("CZ_PASSWORD"))
|
|
41
|
+
.option("workspace", "your_workspace")
|
|
42
|
+
.option("virtualCluster", "default_ap")
|
|
43
|
+
.option("schema", "public")
|
|
44
|
+
.option("table", "orders")
|
|
45
|
+
.mode("append")
|
|
46
|
+
.save()
|
|
47
|
+
|
|
48
|
+
// 读取
|
|
49
|
+
val df = spark.read.format("clickzetta")
|
|
50
|
+
.option("endpoint", "your_instance.cn-shanghai-alicloud.api.clickzetta.com")
|
|
51
|
+
.option("username", sys.env("CZ_USERNAME"))
|
|
52
|
+
.option("password", sys.env("CZ_PASSWORD"))
|
|
53
|
+
.option("workspace", "your_workspace")
|
|
54
|
+
.option("virtualCluster", "default_ap")
|
|
55
|
+
.option("schema", "public")
|
|
56
|
+
.option("table", "orders")
|
|
57
|
+
.load()
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Flink Connector 快速示例
|
|
63
|
+
|
|
64
|
+
```sql
|
|
65
|
+
-- CDC 模式(支持 INSERT/UPDATE/DELETE,目标表需有主键)
|
|
66
|
+
CREATE TABLE lakehouse_sink (
|
|
67
|
+
order_id INT,
|
|
68
|
+
status STRING,
|
|
69
|
+
amount DOUBLE,
|
|
70
|
+
PRIMARY KEY (order_id) NOT ENFORCED
|
|
71
|
+
) WITH (
|
|
72
|
+
'connector' = 'igs-dynamic-table',
|
|
73
|
+
'curl' = 'jdbc:clickzetta://your_instance.cn-shanghai-alicloud.api.clickzetta.com/default?username=user&password=***&schema=public',
|
|
74
|
+
'schema-name' = 'public',
|
|
75
|
+
'table-name' = 'orders',
|
|
76
|
+
'sink.parallelism' = '1',  -- 主键表必须为 1
|
|
77
|
+
'properties' = 'authentication:true'
|
|
78
|
+
);
|
|
79
|
+
|
|
80
|
+
INSERT INTO lakehouse_sink SELECT order_id, status, amount FROM source_table;
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 选择指南
|
|
86
|
+
|
|
87
|
+
| 场景 | 推荐方案 |
|
|
88
|
+
|---|---|
|
|
89
|
+
| Spark ETL 批量写入(无主键表) | Spark Connector |
|
|
90
|
+
| Flink 实时流写入(无主键表) | Flink igs-dynamic-table-append-only |
|
|
91
|
+
| Flink CDC 同步(有主键表,含 UPDATE/DELETE) | Flink igs-dynamic-table |
|
|
92
|
+
| 高频实时写入(Java 应用) | Java SDK RealtimeStream |
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么用 Spark DataFrame 写入 ClickZetta Lakehouse?","expected_skill":"clickzetta-spark-flink-connector","expected_output_contains":["format(\"clickzetta\")","write"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"Flink CDC 写入 ClickZetta 支持 UPDATE 和 DELETE 吗?","expected_skill":"clickzetta-spark-flink-connector","expected_output_contains":["igs-dynamic-table","CDC"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"Spark Connector 支持写入主键表吗?","expected_skill":"clickzetta-spark-flink-connector","expected_output_contains":["不支持","Flink"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"Flink 写入 ClickZetta 需要什么版本?","expected_skill":"clickzetta-spark-flink-connector","expected_output_contains":["1.15"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"Flink 写入主键表的 sink.parallelism 有什么限制?","expected_skill":"clickzetta-spark-flink-connector","expected_output_contains":["1"]}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Flink Write Connector 详细参考
|
|
2
|
+
|
|
3
|
+
## Maven 依赖
|
|
4
|
+
|
|
5
|
+
```xml
|
|
6
|
+
<dependency>
|
|
7
|
+
<groupId>com.clickzetta</groupId>
|
|
8
|
+
<artifactId>igs-flink-connector-1.15</artifactId> <!-- 按 Flink 版本替换 -->
|
|
9
|
+
<version>联系 ClickZetta 支持获取版本号</version>
|
|
10
|
+
</dependency>
|
|
11
|
+
<!-- Flink 核心(provided) -->
|
|
12
|
+
<dependency>
|
|
13
|
+
<groupId>org.apache.flink</groupId>
|
|
14
|
+
<artifactId>flink-streaming-java</artifactId>
|
|
15
|
+
<version>1.15.2</version>
|
|
16
|
+
<scope>provided</scope>
|
|
17
|
+
</dependency>
|
|
18
|
+
<dependency>
|
|
19
|
+
<groupId>org.apache.flink</groupId>
|
|
20
|
+
<artifactId>flink-table-api-java-bridge</artifactId>
|
|
21
|
+
<version>1.15.2</version>
|
|
22
|
+
<scope>provided</scope>
|
|
23
|
+
</dependency>
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## 两种写入模式
|
|
27
|
+
|
|
28
|
+
### 模式 1:igs-dynamic-table(CDC,支持主键表)
|
|
29
|
+
|
|
30
|
+
```sql
|
|
31
|
+
-- 目标表必须有主键
|
|
32
|
+
CREATE TABLE lakehouse_orders_sink (
|
|
33
|
+
order_id INT,
|
|
34
|
+
customer STRING,
|
|
35
|
+
amount DOUBLE,
|
|
36
|
+
status STRING,
|
|
37
|
+
updated_at TIMESTAMP(3),
|
|
38
|
+
PRIMARY KEY (order_id) NOT ENFORCED
|
|
39
|
+
) WITH (
|
|
40
|
+
'connector' = 'igs-dynamic-table',
|
|
41
|
+
'curl' = 'jdbc:clickzetta://your_instance.cn-shanghai-alicloud.api.clickzetta.com/default?username=user&password=***&schema=public&virtualcluster=default_ap',
|
|
42
|
+
'schema-name' = 'public',
|
|
43
|
+
'table-name' = 'orders',
|
|
44
|
+
'sink.parallelism' = '1', -- 主键表必须为 1
|
|
45
|
+
'properties' = 'authentication:true'
|
|
46
|
+
);
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### 模式 2:igs-dynamic-table-append-only(仅追加,无主键表)
|
|
50
|
+
|
|
51
|
+
```sql
|
|
52
|
+
CREATE TABLE lakehouse_events_sink (
|
|
53
|
+
event_id BIGINT,
|
|
54
|
+
user_id BIGINT,
|
|
55
|
+
event_type STRING,
|
|
56
|
+
event_time TIMESTAMP(3)
|
|
57
|
+
) WITH (
|
|
58
|
+
'connector' = 'igs-dynamic-table-append-only',
|
|
59
|
+
'curl' = 'jdbc:clickzetta://your_instance.cn-shanghai-alicloud.api.clickzetta.com/default?username=user&password=***&schema=public&virtualcluster=default_ap',
|
|
60
|
+
'schema-name' = 'public',
|
|
61
|
+
'table-name' = 'events',
|
|
62
|
+
'sink.parallelism' = '4', -- 无主键表可提高并行度
|
|
63
|
+
'properties' = 'authentication:true'
|
|
64
|
+
);
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## 完整 CDC 同步示例(MySQL → Lakehouse)
|
|
68
|
+
|
|
69
|
+
```sql
|
|
70
|
+
-- 1. MySQL CDC 源表
|
|
71
|
+
CREATE TABLE mysql_orders_source (
|
|
72
|
+
order_id INT,
|
|
73
|
+
customer STRING,
|
|
74
|
+
amount DOUBLE,
|
|
75
|
+
status STRING,
|
|
76
|
+
updated_at TIMESTAMP(3),
|
|
77
|
+
PRIMARY KEY (order_id) NOT ENFORCED
|
|
78
|
+
) WITH (
|
|
79
|
+
'connector' = 'mysql-cdc',
|
|
80
|
+
'hostname' = 'mysql-host',
|
|
81
|
+
'port' = '3306',
|
|
82
|
+
'username' = 'cdc_user',
|
|
83
|
+
'password' = 'cdc_password',
|
|
84
|
+
'database-name' = 'orders_db',
|
|
85
|
+
'table-name' = 'orders'
|
|
86
|
+
);
|
|
87
|
+
|
|
88
|
+
-- 2. Lakehouse Sink(CDC 模式)
|
|
89
|
+
CREATE TABLE lakehouse_orders_sink (
|
|
90
|
+
order_id INT,
|
|
91
|
+
customer STRING,
|
|
92
|
+
amount DOUBLE,
|
|
93
|
+
status STRING,
|
|
94
|
+
updated_at TIMESTAMP(3),
|
|
95
|
+
PRIMARY KEY (order_id) NOT ENFORCED
|
|
96
|
+
) WITH (
|
|
97
|
+
'connector' = 'igs-dynamic-table',
|
|
98
|
+
'curl' = 'jdbc:clickzetta://...',
|
|
99
|
+
'schema-name' = 'public',
|
|
100
|
+
'table-name' = 'orders',
|
|
101
|
+
'sink.parallelism' = '1',
|
|
102
|
+
'properties' = 'authentication:true'
|
|
103
|
+
);
|
|
104
|
+
|
|
105
|
+
-- 3. 同步
|
|
106
|
+
INSERT INTO lakehouse_orders_sink SELECT * FROM mysql_orders_source;
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Buffer 与 Flush 调优
|
|
110
|
+
|
|
111
|
+
```sql
|
|
112
|
+
-- 在 WITH 子句中添加调优参数(以下为参数片段,加入 WITH 子句时各参数之间需用逗号分隔)
|
|
113
|
+
'mutation.buffer.lines.num' = '500' -- 每批缓冲行数(默认 100)
|
|
114
|
+
'mutation.buffer.space' = '10MB' -- 缓冲区大小(默认 5MB)
|
|
115
|
+
'mutation.buffer.max.num' = '8' -- 并发缓冲区数(默认 5)
|
|
116
|
+
'mutation.flush.interval' = '5000' -- flush 间隔毫秒(默认 10000)
|
|
117
|
+
'flush.mode' = 'AUTO_FLUSH_BACKGROUND' -- 异步 flush(默认)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Checkpoint 配置(Java)
|
|
121
|
+
|
|
122
|
+
```java
|
|
123
|
+
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
|
124
|
+
|
|
125
|
+
// 生产环境必须开启 checkpoint
|
|
126
|
+
env.enableCheckpointing(60000); // 每 60 秒一次
|
|
127
|
+
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
|
|
128
|
+
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
|
|
129
|
+
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
|
|
130
|
+
env.getCheckpointConfig().setCheckpointTimeout(120000);
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## 私有网络访问
|
|
134
|
+
|
|
135
|
+
```sql
|
|
136
|
+
-- 内网访问(VPC 内部)
|
|
137
|
+
'properties' = 'authentication:true,isInternal:true,isDirect:false'
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## 常见问题
|
|
141
|
+
|
|
142
|
+
| 问题 | 原因 | 解决方案 |
|
|
143
|
+
|---|---|---|
|
|
144
|
+
| 写入主键表数据不更新 | 使用了 append-only 模式 | 改用 `igs-dynamic-table` 模式 |
|
|
145
|
+
| 并行度 > 1 时数据乱序 | 主键表要求顺序写入 | 主键表 `sink.parallelism` 必须设为 `1` |
|
|
146
|
+
| checkpoint 失败 | 未配置 checkpoint 或超时 | 确认已调用 `enableCheckpointing` 开启 checkpoint;超时则增大 `setCheckpointTimeout`,并检查网络 |
|
|
147
|
+
| 连接超时 | 网络不通或认证失败 | 检查 `curl` 中的 username/password,确认 VPC 配置 |
|