@clickzetta/cz-cli-darwin-x64 0.3.40 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +31 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +28 -30
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +39 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-zettapark
|
|
3
|
+
description: |
|
|
4
|
+
使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
|
|
5
|
+
DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
|
|
6
|
+
覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
|
|
7
|
+
结果收集(collect/to_pandas/show)、写入表(save_as_table)、
|
|
8
|
+
文件操作(PUT/GET)、执行 SQL 等完整工作流。
|
|
9
|
+
当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
|
|
10
|
+
"save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
|
|
11
|
+
"Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
|
|
12
|
+
"clickzetta_zettapark_python"时触发。
|
|
13
|
+
Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
# ClickZetta ZettaPark
|
|
17
|
+
|
|
18
|
+
ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
|
|
19
|
+
|
|
20
|
+
阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
|
|
21
|
+
|
|
22
|
+
## 安装
|
|
23
|
+
|
|
24
|
+
> ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# 方式 1:venv(Python 内置,推荐)
|
|
28
|
+
python3.12 -m venv .venv
|
|
29
|
+
source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
|
|
30
|
+
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
31
|
+
|
|
32
|
+
# 方式 2:pyenv(需要切换 Python 版本时)
|
|
33
|
+
pyenv install 3.12.9 && pyenv local 3.12.9
|
|
34
|
+
python -m venv .venv && source .venv/bin/activate
|
|
35
|
+
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
36
|
+
|
|
37
|
+
# 方式 3:conda(数据科学环境)
|
|
38
|
+
conda create -n lakehouse python=3.12 -y && conda activate lakehouse
|
|
39
|
+
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 创建会话
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from clickzetta.zettapark.session import Session
|
|
48
|
+
|
|
49
|
+
connection_parameters = {
|
|
50
|
+
"username": "your_username",
|
|
51
|
+
"password": "your_password",
|
|
52
|
+
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
53
|
+
"instance": "your_instance_id",
|
|
54
|
+
"workspace": "your_workspace",
|
|
55
|
+
"schema": "public",
|
|
56
|
+
"vcluster": "default_ap",
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
session = Session.builder.configs(connection_parameters).create()
|
|
60
|
+
|
|
61
|
+
# 验证连接
|
|
62
|
+
session.sql("SELECT current_user(), current_workspace()").show()
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 核心工作流
|
|
68
|
+
|
|
69
|
+
### 读取数据
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from clickzetta.zettapark import functions as F
|
|
73
|
+
|
|
74
|
+
# 从表读取
|
|
75
|
+
df = session.table("orders")
|
|
76
|
+
df = session.table("my_schema.orders")
|
|
77
|
+
|
|
78
|
+
# 从 SQL 读取
|
|
79
|
+
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
80
|
+
|
|
81
|
+
# 从 Python 数据创建
|
|
82
|
+
df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
|
|
83
|
+
schema=["id", "name", "amount"])
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 转换数据
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
# 过滤、选择、新增列
|
|
90
|
+
result = (
|
|
91
|
+
session.table("orders")
|
|
92
|
+
.filter(F.col("status") == "completed")
|
|
93
|
+
.select("order_id", "customer_id", "amount")
|
|
94
|
+
.with_column("tax", F.col("amount") * 0.1)
|
|
95
|
+
.sort(F.col("amount").desc())
|
|
96
|
+
.limit(100)
|
|
97
|
+
)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 聚合
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
summary = (
|
|
104
|
+
session.table("orders")
|
|
105
|
+
.group_by("category")
|
|
106
|
+
.agg(
|
|
107
|
+
F.sum("amount").as_("total"),
|
|
108
|
+
F.count("*").as_("cnt"),
|
|
109
|
+
F.avg("amount").as_("avg_amount"),
|
|
110
|
+
)
|
|
111
|
+
)
|
|
112
|
+
summary.show()
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### JOIN
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
orders = session.table("orders")
|
|
119
|
+
customers = session.table("customers")
|
|
120
|
+
|
|
121
|
+
result = orders.join(
|
|
122
|
+
customers,
|
|
123
|
+
orders["customer_id"] == customers["id"],
|
|
124
|
+
"left"
|
|
125
|
+
).select(
|
|
126
|
+
orders["order_id"],
|
|
127
|
+
customers["name"],
|
|
128
|
+
orders["amount"]
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 写入数据
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# 追加到已有表
|
|
136
|
+
df.write.save_as_table("result_table", mode="append")
|
|
137
|
+
|
|
138
|
+
# 覆盖写入(自动建表)
|
|
139
|
+
df.write.save_as_table("result_table", mode="overwrite")
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 获取结果
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
# 打印预览
|
|
146
|
+
df.show(20)
|
|
147
|
+
|
|
148
|
+
# 收集为 Row 列表
|
|
149
|
+
rows = df.collect()
|
|
150
|
+
for row in rows:
|
|
151
|
+
print(row["id"], row["name"])
|
|
152
|
+
|
|
153
|
+
# 转为 Pandas DataFrame(小数据量)
|
|
154
|
+
pandas_df = df.to_pandas()
|
|
155
|
+
|
|
156
|
+
# 获取行数
|
|
157
|
+
print(df.count())
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## 典型场景
|
|
163
|
+
|
|
164
|
+
### 场景 1:ETL 数据处理
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from clickzetta.zettapark.session import Session
|
|
168
|
+
from clickzetta.zettapark import functions as F
|
|
169
|
+
|
|
170
|
+
session = Session.builder.configs(config).create()  # config:连接参数字典,结构同上文「创建会话」中的 connection_parameters
|
|
171
|
+
|
|
172
|
+
# 读取原始数据
|
|
173
|
+
raw = session.table("bronze.raw_orders")
|
|
174
|
+
|
|
175
|
+
# 清洗转换
|
|
176
|
+
cleaned = (
|
|
177
|
+
raw
|
|
178
|
+
.filter(F.isnotnull(F.col("order_id")))
|
|
179
|
+
.filter(F.col("amount") > 0)
|
|
180
|
+
.with_column("order_date", F.col("created_at").cast("DATE"))
|
|
181
|
+
.with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
|
|
182
|
+
.select("order_id", "customer_id", "amount", "order_date", "year_month")
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
# 写入 Silver 层
|
|
186
|
+
cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
|
|
187
|
+
|
|
188
|
+
session.close()
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### 场景 2:特征工程(机器学习)
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from clickzetta.zettapark import functions as F
|
|
195
|
+
|
|
196
|
+
customer = session.table("clickzetta_sample_data.tpch_100g.customer")
|
|
197
|
+
orders = session.table("clickzetta_sample_data.tpch_100g.orders")
|
|
198
|
+
|
|
199
|
+
# 构建客户消费特征
|
|
200
|
+
customer_features = (
|
|
201
|
+
orders
|
|
202
|
+
.group_by("o_custkey")
|
|
203
|
+
.agg(
|
|
204
|
+
F.sum("o_totalprice").as_("total_spend"),
|
|
205
|
+
F.count("*").as_("order_count"),
|
|
206
|
+
F.avg("o_totalprice").as_("avg_order_value"),
|
|
207
|
+
F.max("o_orderdate").as_("last_order_date"),
|
|
208
|
+
)
|
|
209
|
+
.join(customer, orders["o_custkey"] == customer["c_custkey"])
|
|
210
|
+
.select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### 场景 3:从本地文件导入
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
import json
|
|
220
|
+
import gzip
|
|
221
|
+
from clickzetta.zettapark.session import Session
|
|
222
|
+
|
|
223
|
+
session = Session.builder.configs(config).create()  # config:连接参数字典,结构同上文「创建会话」中的 connection_parameters
|
|
224
|
+
|
|
225
|
+
# 读取本地 JSON 数据
|
|
226
|
+
data = []
|
|
227
|
+
with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
|
|
228
|
+
for line in f:
|
|
229
|
+
if line.strip():
|
|
230
|
+
data.append(json.loads(line))
|
|
231
|
+
|
|
232
|
+
# 创建 DataFrame 并写入
|
|
233
|
+
df = session.create_dataframe(data)
|
|
234
|
+
df.write.save_as_table("my_table", mode="overwrite")
|
|
235
|
+
|
|
236
|
+
session.close()
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## 常见问题
|
|
242
|
+
|
|
243
|
+
| 问题 | 原因 | 解决方案 |
|
|
244
|
+
|---|---|---|
|
|
245
|
+
| `collect()` 超时 | 数据量过大或集群规格不足 | 增大 `sdk.job.timeout`,或先 `limit()` 测试 |
|
|
246
|
+
| `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
|
|
247
|
+
| 列名冲突(JOIN 后) | 两表有同名列 | 用 `df_left["col"]` 明确指定来源 |
|
|
248
|
+
| `save_as_table` 报错 | 表已存在且 mode 不对 | 使用 `mode="overwrite"` 或 `mode="append"` |
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"用 ZettaPark 读取 orders 表并过滤 amount > 100","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.table","filter"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"ZettaPark 怎么安装?需要什么 Python 版本?","expected_skill":"clickzetta-zettapark","expected_output_contains":["pip install","3.12"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"怎么用 DataFrame API 做 group by 聚合","expected_skill":"clickzetta-zettapark","expected_output_contains":["group_by","agg"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"save_as_table 怎么用?支持哪些写入模式?","expected_skill":"clickzetta-zettapark","expected_output_contains":["save_as_table","overwrite","append"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"ZettaPark 怎么把结果转成 pandas DataFrame","expected_skill":"clickzetta-zettapark","expected_output_contains":["to_pandas"]}
|
|
6
|
+
{"case_id":"006","type":"should_call","user_input":"用 session.sql 执行一段 SQL 查询","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.sql"]}
|
|
7
|
+
{"case_id":"007","type":"should_call","user_input":"ZettaPark 怎么 join 两张表","expected_skill":"clickzetta-zettapark","expected_output_contains":["join"]}
|
|
8
|
+
{"case_id":"008","type":"should_not_call","user_input":"帮我写一个 Flask Web 应用","forbidden_skill":"clickzetta-zettapark"}
|
|
9
|
+
{"case_id":"009","type":"should_not_call","user_input":"pandas 怎么读取 CSV 文件","forbidden_skill":"clickzetta-zettapark"}
|
|
10
|
+
{"case_id":"010","type":"should_not_call","user_input":"怎么用 JDBC 连接 Lakehouse","forbidden_skill":"clickzetta-zettapark"}
|
|
11
|
+
{"case_id":"011","type":"should_not_call","user_input":"帮我创建一个 VCluster","forbidden_skill":"clickzetta-zettapark"}
|
|
12
|
+
{"case_id":"012","type":"should_not_call","user_input":"Spark DataFrame 怎么用","forbidden_skill":"clickzetta-zettapark"}
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# ZettaPark 快速参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
|
|
4
|
+
|
|
5
|
+
## 安装
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install clickzetta_zettapark_python -U -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## 创建会话
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from clickzetta.zettapark.session import Session
|
|
17
|
+
|
|
18
|
+
connection_parameters = {
|
|
19
|
+
"username": "your_username",
|
|
20
|
+
"password": "your_password",
|
|
21
|
+
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
22
|
+
"instance": "your_instance_id",
|
|
23
|
+
"workspace": "your_workspace",
|
|
24
|
+
"schema": "public",
|
|
25
|
+
"vcluster": "default_ap",
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
session = Session.builder.configs(connection_parameters).create()
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
带 hints(超时、query_tag 等):
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
connection_parameters = {
|
|
35
|
+
"username": "your_username",
|
|
36
|
+
"password": "your_password",
|
|
37
|
+
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
38
|
+
"instance": "your_instance_id",
|
|
39
|
+
"workspace": "your_workspace",
|
|
40
|
+
"schema": "public",
|
|
41
|
+
"vcluster": "default_ap",
|
|
42
|
+
"hints": {
|
|
43
|
+
"sdk.job.timeout": 300,
|
|
44
|
+
"query_tag": "my_zettapark_app",
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
session = Session.builder.configs(connection_parameters).create()
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
从 JSON 配置文件读取:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import json
|
|
55
|
+
with open('config.json', 'r') as f:
|
|
56
|
+
config = json.load(f)
|
|
57
|
+
session = Session.builder.configs(config).create()
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
验证连接:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
session.sql("SELECT current_user(), current_workspace(), current_vcluster()").show()
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
关闭会话:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
session.close()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 构建 DataFrame
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# 从表创建
|
|
78
|
+
df = session.table("my_schema.my_table")
|
|
79
|
+
|
|
80
|
+
# 从 SQL 创建
|
|
81
|
+
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
82
|
+
|
|
83
|
+
# 从 Python 数据创建
|
|
84
|
+
df = session.create_dataframe([1, 2, 3, 4]).to_df("id")
|
|
85
|
+
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema=["id", "name"])
|
|
86
|
+
|
|
87
|
+
# 从 Row 对象创建
|
|
88
|
+
from clickzetta.zettapark import Row
|
|
89
|
+
df = session.create_dataframe([Row(id=1, name="Alice"), Row(id=2, name="Bob")])
|
|
90
|
+
|
|
91
|
+
# 带 Schema 创建
|
|
92
|
+
from clickzetta.zettapark.types import IntegerType, StringType, StructType, StructField
|
|
93
|
+
schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())])
|
|
94
|
+
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema)
|
|
95
|
+
|
|
96
|
+
# 范围序列
|
|
97
|
+
df = session.range(1, 10, 2).to_df("n") # 1,3,5,7,9
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## DataFrame 转换操作
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from clickzetta.zettapark import functions as F
|
|
106
|
+
|
|
107
|
+
# 过滤行
|
|
108
|
+
df.filter(F.col("age") > 18)
|
|
109
|
+
df.filter(F.col("status") == "active")
|
|
110
|
+
df.where(F.col("amount") > 1000)
|
|
111
|
+
|
|
112
|
+
# 选择列
|
|
113
|
+
df.select("id", "name", "amount")
|
|
114
|
+
df.select(F.col("id"), F.col("name").as_("user_name"))
|
|
115
|
+
|
|
116
|
+
# 新增/修改列
|
|
117
|
+
df.with_column("total", F.col("price") * F.col("qty"))
|
|
118
|
+
df.with_column("upper_name", F.upper(F.col("name")))
|
|
119
|
+
|
|
120
|
+
# 重命名列
|
|
121
|
+
df.rename(F.col("old_name"), "new_name")
|
|
122
|
+
|
|
123
|
+
# 排序
|
|
124
|
+
df.sort(F.col("amount").desc())
|
|
125
|
+
df.order_by(F.col("created_at").asc())
|
|
126
|
+
|
|
127
|
+
# 去重
|
|
128
|
+
df.distinct()
|
|
129
|
+
df.drop_duplicates(["user_id"])
|
|
130
|
+
|
|
131
|
+
# 限制行数
|
|
132
|
+
df.limit(100)
|
|
133
|
+
|
|
134
|
+
# 删除列
|
|
135
|
+
df.drop("unnecessary_col")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## 聚合操作
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from clickzetta.zettapark import functions as F
|
|
144
|
+
|
|
145
|
+
# 分组聚合
|
|
146
|
+
df.group_by("category").agg(
|
|
147
|
+
F.sum("amount").as_("total_amount"),
|
|
148
|
+
F.count("*").as_("order_count"),
|
|
149
|
+
F.avg("price").as_("avg_price"),
|
|
150
|
+
F.max("amount").as_("max_amount"),
|
|
151
|
+
F.min("amount").as_("min_amount"),
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# 全局聚合
|
|
155
|
+
df.agg(F.count("*"), F.sum("amount"))
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## JOIN 操作
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
# 内连接
|
|
164
|
+
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"])
|
|
165
|
+
|
|
166
|
+
# 左连接
|
|
167
|
+
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"], "left")
|
|
168
|
+
|
|
169
|
+
# 选择连接后的列(避免列名冲突)
|
|
170
|
+
result = df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"]) \
|
|
171
|
+
.select(df_orders["order_id"], df_customers["name"], df_orders["amount"])
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## 执行与结果获取
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
# 打印前 N 行(触发执行)
|
|
180
|
+
df.show()
|
|
181
|
+
df.show(20)
|
|
182
|
+
|
|
183
|
+
# 收集所有结果为 Row 列表
|
|
184
|
+
rows = df.collect()
|
|
185
|
+
for row in rows:
|
|
186
|
+
print(row["id"], row["name"])
|
|
187
|
+
|
|
188
|
+
# 转换为 Pandas DataFrame
|
|
189
|
+
pandas_df = df.to_pandas()
|
|
190
|
+
|
|
191
|
+
# 获取行数
|
|
192
|
+
count = df.count()
|
|
193
|
+
|
|
194
|
+
# 获取列名
|
|
195
|
+
print(df.columns)
|
|
196
|
+
|
|
197
|
+
# 查看 Schema
|
|
198
|
+
df.schema.print_tree()
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## 写入数据
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
# 写入已有表(追加)
|
|
207
|
+
df.write.save_as_table("my_table", mode="append")
|
|
208
|
+
|
|
209
|
+
# 覆盖写入
|
|
210
|
+
df.write.save_as_table("my_table", mode="overwrite")
|
|
211
|
+
|
|
212
|
+
# 自动建表并写入(overwrite 会重建表)
|
|
213
|
+
df.write.save_as_table("new_table", mode="overwrite")
|
|
214
|
+
|
|
215
|
+
# 写入指定 Schema 下的表
|
|
216
|
+
df.write.save_as_table("my_schema.my_table", mode="append")
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 执行 SQL
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
# 执行 DDL/DML
|
|
225
|
+
session.sql("CREATE TABLE IF NOT EXISTS t (id INT, name STRING)").collect()
|
|
226
|
+
session.sql("INSERT INTO t VALUES (1, 'Alice')").collect()
|
|
227
|
+
|
|
228
|
+
# 执行查询并获取 DataFrame
|
|
229
|
+
df = session.sql("SELECT * FROM orders WHERE amount > 1000")
|
|
230
|
+
df.show()
|
|
231
|
+
|
|
232
|
+
# 切换 Schema
|
|
233
|
+
session.use_schema("my_schema")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## 文件操作(Volume)
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
# 上传文件到 User Volume
|
|
242
|
+
session.file.put("/local/path/data.csv", "volume:user://~/data/")
|
|
243
|
+
|
|
244
|
+
# 下载文件
|
|
245
|
+
session.file.get("volume:user://~/data/data.csv", "/local/output/")
|
|
246
|
+
|
|
247
|
+
# 列出 User Volume 文件
|
|
248
|
+
session.sql("LIST USER VOLUME").show()
|
|
249
|
+
session.sql("SHOW USER VOLUME DIRECTORY").show()
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## 常用 functions 速查
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
from clickzetta.zettapark import functions as F
|
|
258
|
+
|
|
259
|
+
# 字符串
|
|
260
|
+
F.upper(col), F.lower(col), F.concat(col1, col2)
|
|
261
|
+
F.substring(col, 1, 3), F.trim(col), F.length(col)
|
|
262
|
+
|
|
263
|
+
# 数值
|
|
264
|
+
F.abs(col), F.round(col, 2), F.floor(col), F.ceil(col)
|
|
265
|
+
F.sqrt(col), F.pow(col, 2)
|
|
266
|
+
|
|
267
|
+
# 日期时间
|
|
268
|
+
F.current_date(), F.current_timestamp()
|
|
269
|
+
F.year(col), F.month(col), F.day(col)
|
|
270
|
+
F.date_add(col, 7), F.datediff(col1, col2)
|
|
271
|
+
|
|
272
|
+
# 条件
|
|
273
|
+
F.when(F.col("status") == "A", "Active").otherwise("Inactive")
|
|
274
|
+
F.coalesce(col1, col2) # 第一个非 null 值
|
|
275
|
+
F.isnull(col), F.isnotnull(col)
|
|
276
|
+
|
|
277
|
+
# 聚合
|
|
278
|
+
F.count("*"), F.sum(col), F.avg(col), F.max(col), F.min(col)
|
|
279
|
+
F.count_distinct(col)
|
|
280
|
+
|
|
281
|
+
# 类型转换
|
|
282
|
+
F.col("amount").cast(IntegerType())  # 需先导入:from clickzetta.zettapark.types import IntegerType
|
|
283
|
+
```
|
package/package.json
CHANGED
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-ai-vector-search
|
|
3
|
-
description: |
|
|
4
|
-
在 ClickZetta Lakehouse 中实现向量存储、向量索引(HNSW)和向量检索,
|
|
5
|
-
构建 RAG、语义搜索、图像检索等 AI 应用。覆盖 VECTOR 数据类型定义、
|
|
6
|
-
向量索引创建(cosine/l2/hamming 距离)、向量数据插入与转换、
|
|
7
|
-
ANN 近似最近邻检索、向量+倒排索引融合检索等完整工作流。
|
|
8
|
-
当用户说"向量检索"、"向量索引"、"语义搜索"、"embedding 存储"、
|
|
9
|
-
"RAG"、"ANN 搜索"、"HNSW"、"cosine_distance"、"l2_distance"、
|
|
10
|
-
"VECTOR 类型"、"向量数据库"、"相似度搜索"、"向量 + 标量融合检索"、
|
|
11
|
-
"文本向量化"时触发。
|
|
12
|
-
Keywords: vector, HNSW, embedding, RAG, semantic search, similarity, VECTOR type
|
|
13
|
-
---
|
|
14
|
-
|
|
15
|
-
# ClickZetta 向量检索
|
|
16
|
-
|
|
17
|
-
Lakehouse 原生支持 VECTOR 数据类型和 HNSW 向量索引,无需独立向量数据库即可在同一张表中实现向量检索、全文检索和标量过滤的融合查询。
|
|
18
|
-
|
|
19
|
-
阅读 [references/vector-search.md](references/vector-search.md) 了解完整语法。
|
|
20
|
-
|
|
21
|
-
---
|
|
22
|
-
|
|
23
|
-
## 快速开始
|
|
24
|
-
|
|
25
|
-
### 1. 建表(含向量索引)
|
|
26
|
-
|
|
27
|
-
```sql
|
|
28
|
-
CREATE TABLE doc_embeddings (
|
|
29
|
-
id INT,
|
|
30
|
-
content STRING,
|
|
31
|
-
vec VECTOR(FLOAT, 1024),
|
|
32
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
33
|
-
"distance.function" = "cosine_distance",
|
|
34
|
-
"scalar.type" = "f32"
|
|
35
|
-
)
|
|
36
|
-
);
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
### 2. 插入向量数据
|
|
40
|
-
|
|
41
|
-
```sql
|
|
42
|
-
-- 直接插入
|
|
43
|
-
INSERT INTO doc_embeddings VALUES
|
|
44
|
-
(1, '云器 Lakehouse 产品介绍', vector(0.12, 0.34, ...));
|
|
45
|
-
|
|
46
|
-
-- 从字符串转换(适合 API 返回的 JSON 格式)
|
|
47
|
-
INSERT INTO doc_embeddings (id, content, vec)
|
|
48
|
-
SELECT id, content, CAST(embedding_str AS VECTOR(1024))
|
|
49
|
-
FROM staging_table;
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
### 3. 向量检索
|
|
53
|
-
|
|
54
|
-
```sql
|
|
55
|
-
-- 设置探索因子(精度 vs 速度)
|
|
56
|
-
SET cz.vector.index.search.ef = 64;
|
|
57
|
-
|
|
58
|
-
-- 余弦距离 Top-10 相似文档
|
|
59
|
-
SELECT id, content, cosine_distance(vec, CAST('[0.12, 0.34, ...]' AS VECTOR(1024))) AS dist
|
|
60
|
-
FROM doc_embeddings
|
|
61
|
-
ORDER BY dist
|
|
62
|
-
LIMIT 10;
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
---
|
|
66
|
-
|
|
67
|
-
## 向量 + 标量融合检索(RAG 场景)
|
|
68
|
-
|
|
69
|
-
```sql
|
|
70
|
-
-- 先用标量过滤缩小范围,再用向量排序
|
|
71
|
-
SELECT id, content, cosine_distance(vec, :query_embedding) AS dist
|
|
72
|
-
FROM doc_embeddings
|
|
73
|
-
WHERE category = 'product'
|
|
74
|
-
AND created_at >= '2024-01-01'
|
|
75
|
-
ORDER BY dist
|
|
76
|
-
LIMIT 5;
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
---
|
|
80
|
-
|
|
81
|
-
## 向量 + 全文检索融合
|
|
82
|
-
|
|
83
|
-
```sql
|
|
84
|
-
-- 建表:同时支持向量索引和倒排索引
|
|
85
|
-
CREATE TABLE hybrid_docs (
|
|
86
|
-
id INT,
|
|
87
|
-
title STRING,
|
|
88
|
-
body STRING,
|
|
89
|
-
vec VECTOR(FLOAT, 1024),
|
|
90
|
-
INDEX body_inv_idx (body) USING INVERTED,
|
|
91
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
92
|
-
"distance.function" = "cosine_distance"
|
|
93
|
-
)
|
|
94
|
-
);
|
|
95
|
-
|
|
96
|
-
-- 融合检索:关键词过滤 + 向量排序
|
|
97
|
-
SELECT id, title, cosine_distance(vec, :query_vec) AS dist
|
|
98
|
-
FROM hybrid_docs
|
|
99
|
-
WHERE body LIKE '%向量检索%'
|
|
100
|
-
ORDER BY dist
|
|
101
|
-
LIMIT 10;
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
---
|
|
105
|
-
|
|
106
|
-
## 外部系统写入向量(ARRAY → VECTOR 转换)
|
|
107
|
-
|
|
108
|
-
外部系统(Python SDK、Kafka 等)不能直接写 VECTOR 类型,需先写 ARRAY 再转换:
|
|
109
|
-
|
|
110
|
-
```sql
|
|
111
|
-
-- 暂存表(ARRAY 类型)
|
|
112
|
-
CREATE TABLE staging (id INT, vec_array ARRAY<FLOAT>);
|
|
113
|
-
|
|
114
|
-
-- 转换写入目标表
|
|
115
|
-
INSERT INTO doc_embeddings (id, vec)
|
|
116
|
-
SELECT id, CAST(vec_array AS VECTOR(FLOAT, 1024))
|
|
117
|
-
FROM staging;
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
---
|
|
121
|
-
|
|
122
|
-
## 距离函数速查
|
|
123
|
-
|
|
124
|
-
| 函数 | 适用场景 |
|
|
125
|
-
|---|---|
|
|
126
|
-
| `cosine_distance(v1, v2)` | 文本语义检索(最常用) |
|
|
127
|
-
| `l2_distance(v1, v2)` | 图像/通用向量检索 |
|
|
128
|
-
| `dot_product(v1, v2)` | 归一化向量的相似度 |
|
|
129
|
-
| `hamming_distance(v1, v2)` | 二值向量(高效压缩) |
|
|
130
|
-
| `binary_quantize(v)` | 将 float 向量压缩为二值向量 |
|
|
131
|
-
|
|
132
|
-
---
|
|
133
|
-
|
|
134
|
-
## 性能调优
|
|
135
|
-
|
|
136
|
-
```sql
|
|
137
|
-
-- 调整探索因子(默认 64,越大精度越高但越慢)
|
|
138
|
-
SET cz.vector.index.search.ef = 128;
|
|
139
|
-
|
|
140
|
-
-- 验证向量索引是否生效
|
|
141
|
-
EXPLAIN SELECT id, cosine_distance(vec, vector(0.1, 0.2)) AS dist
|
|
142
|
-
FROM doc_embeddings ORDER BY dist LIMIT 10;
|
|
143
|
-
-- 查看执行计划中是否有 vector_index_search_type 字样
|
|
144
|
-
```
|
|
145
|
-
|
|
146
|
-
**最佳实践:**
|
|
147
|
-
- 向量检索建议**单独占用 VCluster**,避免与其他查询争抢缓存
|
|
148
|
-
- 大批量写入后执行 `BUILD INDEX vec_idx ON table_name` 为存量数据构建索引
|
|
149
|
-
- 外部系统写入时先写 ARRAY,再批量 CAST 转换,避免频繁小文件
|
|
150
|
-
|
|
151
|
-
---
|
|
152
|
-
|
|
153
|
-
## 常见问题
|
|
154
|
-
|
|
155
|
-
| 问题 | 原因 | 解决方案 |
|
|
156
|
-
|---|---|---|
|
|
157
|
-
| 向量索引未生效 | 存量数据未构建索引 | 执行 `BUILD INDEX idx ON table` |
|
|
158
|
-
| 检索精度低 | ef 值太小 | 增大 `cz.vector.index.search.ef` |
|
|
159
|
-
| 外部写入报错 | 不支持直接写 VECTOR | 先写 ARRAY,再 CAST 转换 |
|
|
160
|
-
| 向量检索慢 | 与其他查询共用 VCluster | 为向量检索单独分配 VCluster |
|