@clickzetta/cz-cli-darwin-arm64 0.3.40 → 0.3.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
  3. package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
  4. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
  5. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
  6. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
  7. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
  8. package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
  9. package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
  10. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
  11. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
  12. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
  13. package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
  14. package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
  15. package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
  16. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
  17. package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
  18. package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
  19. package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
  20. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
  21. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
  22. package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
  23. package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
  24. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
  25. package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
  26. package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
  27. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
  28. package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
  29. package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
  30. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
  31. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
  32. package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
  33. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
  34. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
  35. package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
  36. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +31 -0
  37. package/bin/skills/clickzetta-metadata/SKILL.md +28 -30
  38. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +39 -0
  39. package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
  40. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
  41. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
  42. package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
  43. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
  44. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
  45. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
  46. package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
  47. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
  48. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
  49. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
  50. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
  51. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
  52. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
  53. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
  54. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
  55. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
  56. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
  57. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
  58. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
  59. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
  60. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
  61. package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
  62. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
  63. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
  64. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
  65. package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
  66. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
  67. package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
  68. package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
  69. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
  70. package/package.json +1 -1
  71. package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
  72. package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
  73. package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
@@ -0,0 +1,248 @@
1
+ ---
2
+ name: clickzetta-zettapark
3
+ description: |
4
+ 使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
5
+ DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
6
+ 覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
7
+ 结果收集(collect/to_pandas/show)、写入表(save_as_table)、
8
+ 文件操作(PUT/GET)、执行 SQL 等完整工作流。
9
+ 当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
10
+ "save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
11
+ "Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
12
+ "clickzetta_zettapark_python"时触发。
13
+ Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
14
+ ---
15
+
16
+ # ClickZetta ZettaPark
17
+
18
+ ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
19
+
20
+ 阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
21
+
22
+ ## 安装
23
+
24
+ > ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
25
+
26
+ ```bash
27
+ # 方式 1:venv(Python 内置,推荐)
28
+ python3.12 -m venv .venv
29
+ source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
30
+ pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
31
+
32
+ # 方式 2:pyenv(需要切换 Python 版本时)
33
+ pyenv install 3.12.9 && pyenv local 3.12.9
34
+ python -m venv .venv && source .venv/bin/activate
35
+ pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
36
+
37
+ # 方式 3:conda(数据科学环境)
38
+ conda create -n lakehouse python=3.12 -y && conda activate lakehouse
39
+ pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
40
+ ```
41
+
42
+ ---
43
+
44
+ ## 创建会话
45
+
46
+ ```python
47
+ from clickzetta.zettapark.session import Session
48
+
49
+ connection_parameters = {
50
+ "username": "your_username",
51
+ "password": "your_password",
52
+ "service": "cn-shanghai-alicloud.api.clickzetta.com",
53
+ "instance": "your_instance_id",
54
+ "workspace": "your_workspace",
55
+ "schema": "public",
56
+ "vcluster": "default_ap",
57
+ }
58
+
59
+ session = Session.builder.configs(connection_parameters).create()
60
+
61
+ # 验证连接
62
+ session.sql("SELECT current_user(), current_workspace()").show()
63
+ ```
64
+
65
+ ---
66
+
67
+ ## 核心工作流
68
+
69
+ ### 读取数据
70
+
71
+ ```python
72
+ from clickzetta.zettapark import functions as F
73
+
74
+ # 从表读取
75
+ df = session.table("orders")
76
+ df = session.table("my_schema.orders")
77
+
78
+ # 从 SQL 读取
79
+ df = session.sql("SELECT * FROM orders WHERE year = 2024")
80
+
81
+ # 从 Python 数据创建
82
+ df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
83
+ schema=["id", "name", "amount"])
84
+ ```
85
+
86
+ ### 转换数据
87
+
88
+ ```python
89
+ # 过滤、选择、新增列
90
+ result = (
91
+ session.table("orders")
92
+ .filter(F.col("status") == "completed")
93
+ .select("order_id", "customer_id", "amount")
94
+ .with_column("tax", F.col("amount") * 0.1)
95
+ .sort(F.col("amount").desc())
96
+ .limit(100)
97
+ )
98
+ ```
99
+
100
+ ### 聚合
101
+
102
+ ```python
103
+ summary = (
104
+ session.table("orders")
105
+ .group_by("category")
106
+ .agg(
107
+ F.sum("amount").as_("total"),
108
+ F.count("*").as_("cnt"),
109
+ F.avg("amount").as_("avg_amount"),
110
+ )
111
+ )
112
+ summary.show()
113
+ ```
114
+
115
+ ### JOIN
116
+
117
+ ```python
118
+ orders = session.table("orders")
119
+ customers = session.table("customers")
120
+
121
+ result = orders.join(
122
+ customers,
123
+ orders["customer_id"] == customers["id"],
124
+ "left"
125
+ ).select(
126
+ orders["order_id"],
127
+ customers["name"],
128
+ orders["amount"]
129
+ )
130
+ ```
131
+
132
+ ### 写入数据
133
+
134
+ ```python
135
+ # 追加到已有表
136
+ df.write.save_as_table("result_table", mode="append")
137
+
138
+ # 覆盖写入(自动建表)
139
+ df.write.save_as_table("result_table", mode="overwrite")
140
+ ```
141
+
142
+ ### 获取结果
143
+
144
+ ```python
145
+ # 打印预览
146
+ df.show(20)
147
+
148
+ # 收集为 Row 列表
149
+ rows = df.collect()
150
+ for row in rows:
151
+ print(row["id"], row["name"])
152
+
153
+ # 转为 Pandas DataFrame(小数据量)
154
+ pandas_df = df.to_pandas()
155
+
156
+ # 获取行数
157
+ print(df.count())
158
+ ```
159
+
160
+ ---
161
+
162
+ ## 典型场景
163
+
164
+ ### 场景 1:ETL 数据处理
165
+
166
+ ```python
167
+ from clickzetta.zettapark.session import Session
168
+ from clickzetta.zettapark import functions as F
169
+
170
+ session = Session.builder.configs(config).create() # config 为连接参数字典,定义方式见上文"创建会话"
171
+
172
+ # 读取原始数据
173
+ raw = session.table("bronze.raw_orders")
174
+
175
+ # 清洗转换
176
+ cleaned = (
177
+ raw
178
+ .filter(F.isnotnull(F.col("order_id")))
179
+ .filter(F.col("amount") > 0)
180
+ .with_column("order_date", F.col("created_at").cast("DATE"))
181
+ .with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
182
+ .select("order_id", "customer_id", "amount", "order_date", "year_month")
183
+ )
184
+
185
+ # 写入 Silver 层
186
+ cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
187
+
188
+ session.close()
189
+ ```
190
+
191
+ ### 场景 2:特征工程(机器学习)
192
+
193
+ ```python
194
+ from clickzetta.zettapark import functions as F
195
+
196
+ customer = session.table("clickzetta_sample_data.tpch_100g.customer")
197
+ orders = session.table("clickzetta_sample_data.tpch_100g.orders")
198
+
199
+ # 构建客户消费特征
200
+ customer_features = (
201
+ orders
202
+ .group_by("o_custkey")
203
+ .agg(
204
+ F.sum("o_totalprice").as_("total_spend"),
205
+ F.count("*").as_("order_count"),
206
+ F.avg("o_totalprice").as_("avg_order_value"),
207
+ F.max("o_orderdate").as_("last_order_date"),
208
+ )
209
+ .join(customer, orders["o_custkey"] == customer["c_custkey"])
210
+ .select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
211
+ )
212
+
213
+ customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
214
+ ```
215
+
216
+ ### 场景 3:从本地文件导入
217
+
218
+ ```python
219
+ import json
220
+ import gzip
221
+ from clickzetta.zettapark.session import Session
222
+
223
+ session = Session.builder.configs(config).create() # config 为连接参数字典,定义方式见上文"创建会话"
224
+
225
+ # 读取本地 JSON 数据
226
+ data = []
227
+ with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
228
+ for line in f:
229
+ if line.strip():
230
+ data.append(json.loads(line))
231
+
232
+ # 创建 DataFrame 并写入
233
+ df = session.create_dataframe(data)
234
+ df.write.save_as_table("my_table", mode="overwrite")
235
+
236
+ session.close()
237
+ ```
238
+
239
+ ---
240
+
241
+ ## 常见问题
242
+
243
+ | 问题 | 原因 | 解决方案 |
244
+ |---|---|---|
245
+ | `collect()` 超时 | 数据量过大或集群规格不足 | 在连接参数的 `hints` 中增大 `sdk.job.timeout`,或先用 `limit()` 缩小数据量测试 |
246
+ | `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
247
+ | 列名冲突(JOIN 后) | 两表有同名列 | 用 `df_left["col"]` 明确指定来源 |
248
+ | `save_as_table` 报错 | 表已存在且 mode 不对 | 使用 `mode="overwrite"` 或 `mode="append"` |
@@ -0,0 +1,12 @@
1
+ {"case_id":"001","type":"should_call","user_input":"用 ZettaPark 读取 orders 表并过滤 amount > 100","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.table","filter"]}
2
+ {"case_id":"002","type":"should_call","user_input":"ZettaPark 怎么安装?需要什么 Python 版本?","expected_skill":"clickzetta-zettapark","expected_output_contains":["pip install","3.12"]}
3
+ {"case_id":"003","type":"should_call","user_input":"怎么用 DataFrame API 做 group by 聚合","expected_skill":"clickzetta-zettapark","expected_output_contains":["group_by","agg"]}
4
+ {"case_id":"004","type":"should_call","user_input":"save_as_table 怎么用?支持哪些写入模式?","expected_skill":"clickzetta-zettapark","expected_output_contains":["save_as_table","overwrite","append"]}
5
+ {"case_id":"005","type":"should_call","user_input":"ZettaPark 怎么把结果转成 pandas DataFrame","expected_skill":"clickzetta-zettapark","expected_output_contains":["to_pandas"]}
6
+ {"case_id":"006","type":"should_call","user_input":"用 session.sql 执行一段 SQL 查询","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.sql"]}
7
+ {"case_id":"007","type":"should_call","user_input":"ZettaPark 怎么 join 两张表","expected_skill":"clickzetta-zettapark","expected_output_contains":["join"]}
8
+ {"case_id":"008","type":"should_not_call","user_input":"帮我写一个 Flask Web 应用","forbidden_skill":"clickzetta-zettapark"}
9
+ {"case_id":"009","type":"should_not_call","user_input":"pandas 怎么读取 CSV 文件","forbidden_skill":"clickzetta-zettapark"}
10
+ {"case_id":"010","type":"should_not_call","user_input":"怎么用 JDBC 连接 Lakehouse","forbidden_skill":"clickzetta-zettapark"}
11
+ {"case_id":"011","type":"should_not_call","user_input":"帮我创建一个 VCluster","forbidden_skill":"clickzetta-zettapark"}
12
+ {"case_id":"012","type":"should_not_call","user_input":"Spark DataFrame 怎么用","forbidden_skill":"clickzetta-zettapark"}
@@ -0,0 +1,283 @@
1
+ # ZettaPark 快速参考
2
+
3
+ > 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
4
+
5
+ ## 安装
6
+
7
+ ```bash
8
+ pip install clickzetta_zettapark_python -U -i https://pypi.tuna.tsinghua.edu.cn/simple
9
+ ```
10
+
11
+ ---
12
+
13
+ ## 创建会话
14
+
15
+ ```python
16
+ from clickzetta.zettapark.session import Session
17
+
18
+ connection_parameters = {
19
+ "username": "your_username",
20
+ "password": "your_password",
21
+ "service": "cn-shanghai-alicloud.api.clickzetta.com",
22
+ "instance": "your_instance_id",
23
+ "workspace": "your_workspace",
24
+ "schema": "public",
25
+ "vcluster": "default_ap",
26
+ }
27
+
28
+ session = Session.builder.configs(connection_parameters).create()
29
+ ```
30
+
31
+ 带 hints(超时、query_tag 等):
32
+
33
+ ```python
34
+ connection_parameters = {
35
+ "username": "your_username",
36
+ "password": "your_password",
37
+ "service": "cn-shanghai-alicloud.api.clickzetta.com",
38
+ "instance": "your_instance_id",
39
+ "workspace": "your_workspace",
40
+ "schema": "public",
41
+ "vcluster": "default_ap",
42
+ "hints": {
43
+ "sdk.job.timeout": 300,
44
+ "query_tag": "my_zettapark_app",
45
+ }
46
+ }
47
+
48
+ session = Session.builder.configs(connection_parameters).create()
49
+ ```
50
+
51
+ 从 JSON 配置文件读取:
52
+
53
+ ```python
54
+ import json
55
+ with open('config.json', 'r') as f:
56
+ config = json.load(f)
57
+ session = Session.builder.configs(config).create()
58
+ ```
59
+
60
+ 验证连接:
61
+
62
+ ```python
63
+ session.sql("SELECT current_user(), current_workspace(), current_vcluster()").show()
64
+ ```
65
+
66
+ 关闭会话:
67
+
68
+ ```python
69
+ session.close()
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 构建 DataFrame
75
+
76
+ ```python
77
+ # 从表创建
78
+ df = session.table("my_schema.my_table")
79
+
80
+ # 从 SQL 创建
81
+ df = session.sql("SELECT * FROM orders WHERE year = 2024")
82
+
83
+ # 从 Python 数据创建
84
+ df = session.create_dataframe([1, 2, 3, 4]).to_df("id")
85
+ df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema=["id", "name"])
86
+
87
+ # 从 Row 对象创建
88
+ from clickzetta.zettapark import Row
89
+ df = session.create_dataframe([Row(id=1, name="Alice"), Row(id=2, name="Bob")])
90
+
91
+ # 带 Schema 创建
92
+ from clickzetta.zettapark.types import IntegerType, StringType, StructType, StructField
93
+ schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())])
94
+ df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema)
95
+
96
+ # 范围序列
97
+ df = session.range(1, 10, 2).to_df("n") # 1,3,5,7,9
98
+ ```
99
+
100
+ ---
101
+
102
+ ## DataFrame 转换操作
103
+
104
+ ```python
105
+ from clickzetta.zettapark import functions as F
106
+
107
+ # 过滤行
108
+ df.filter(F.col("age") > 18)
109
+ df.filter(F.col("status") == "active")
110
+ df.where(F.col("amount") > 1000)
111
+
112
+ # 选择列
113
+ df.select("id", "name", "amount")
114
+ df.select(F.col("id"), F.col("name").as_("user_name"))
115
+
116
+ # 新增/修改列
117
+ df.with_column("total", F.col("price") * F.col("qty"))
118
+ df.with_column("upper_name", F.upper(F.col("name")))
119
+
120
+ # 重命名列
121
+ df.rename(F.col("old_name"), "new_name")
122
+
123
+ # 排序
124
+ df.sort(F.col("amount").desc())
125
+ df.order_by(F.col("created_at").asc())
126
+
127
+ # 去重
128
+ df.distinct()
129
+ df.drop_duplicates(["user_id"])
130
+
131
+ # 限制行数
132
+ df.limit(100)
133
+
134
+ # 删除列
135
+ df.drop("unnecessary_col")
136
+ ```
137
+
138
+ ---
139
+
140
+ ## 聚合操作
141
+
142
+ ```python
143
+ from clickzetta.zettapark import functions as F
144
+
145
+ # 分组聚合
146
+ df.group_by("category").agg(
147
+ F.sum("amount").as_("total_amount"),
148
+ F.count("*").as_("order_count"),
149
+ F.avg("price").as_("avg_price"),
150
+ F.max("amount").as_("max_amount"),
151
+ F.min("amount").as_("min_amount"),
152
+ )
153
+
154
+ # 全局聚合
155
+ df.agg(F.count("*"), F.sum("amount"))
156
+ ```
157
+
158
+ ---
159
+
160
+ ## JOIN 操作
161
+
162
+ ```python
163
+ # 内连接
164
+ df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"])
165
+
166
+ # 左连接
167
+ df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"], "left")
168
+
169
+ # 选择连接后的列(避免列名冲突)
170
+ result = df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"]) \
171
+ .select(df_orders["order_id"], df_customers["name"], df_orders["amount"])
172
+ ```
173
+
174
+ ---
175
+
176
+ ## 执行与结果获取
177
+
178
+ ```python
179
+ # 打印前 N 行(触发执行)
180
+ df.show()
181
+ df.show(20)
182
+
183
+ # 收集所有结果为 Row 列表
184
+ rows = df.collect()
185
+ for row in rows:
186
+ print(row["id"], row["name"])
187
+
188
+ # 转换为 Pandas DataFrame
189
+ pandas_df = df.to_pandas()
190
+
191
+ # 获取行数
192
+ count = df.count()
193
+
194
+ # 获取列名
195
+ print(df.columns)
196
+
197
+ # 查看 Schema
198
+ df.schema.print_tree()
199
+ ```
200
+
201
+ ---
202
+
203
+ ## 写入数据
204
+
205
+ ```python
206
+ # 写入已有表(追加)
207
+ df.write.save_as_table("my_table", mode="append")
208
+
209
+ # 覆盖写入
210
+ df.write.save_as_table("my_table", mode="overwrite")
211
+
212
+ # 自动建表并写入(overwrite 会重建表)
213
+ df.write.save_as_table("new_table", mode="overwrite")
214
+
215
+ # 写入指定 Schema 下的表
216
+ df.write.save_as_table("my_schema.my_table", mode="append")
217
+ ```
218
+
219
+ ---
220
+
221
+ ## 执行 SQL
222
+
223
+ ```python
224
+ # 执行 DDL/DML
225
+ session.sql("CREATE TABLE IF NOT EXISTS t (id INT, name STRING)").collect()
226
+ session.sql("INSERT INTO t VALUES (1, 'Alice')").collect()
227
+
228
+ # 执行查询并获取 DataFrame
229
+ df = session.sql("SELECT * FROM orders WHERE amount > 1000")
230
+ df.show()
231
+
232
+ # 切换 Schema
233
+ session.use_schema("my_schema")
234
+ ```
235
+
236
+ ---
237
+
238
+ ## 文件操作(Volume)
239
+
240
+ ```python
241
+ # 上传文件到 User Volume
242
+ session.file.put("/local/path/data.csv", "volume:user://~/data/")
243
+
244
+ # 下载文件
245
+ session.file.get("volume:user://~/data/data.csv", "/local/output/")
246
+
247
+ # 列出 User Volume 文件
248
+ session.sql("LIST USER VOLUME").show()
249
+ session.sql("SHOW USER VOLUME DIRECTORY").show()
250
+ ```
251
+
252
+ ---
253
+
254
+ ## 常用 functions 速查
255
+
256
+ ```python
257
+ from clickzetta.zettapark import functions as F
258
+
259
+ # 字符串
260
+ F.upper(col), F.lower(col), F.concat(col1, col2)
261
+ F.substring(col, 1, 3), F.trim(col), F.length(col)
262
+
263
+ # 数值
264
+ F.abs(col), F.round(col, 2), F.floor(col), F.ceil(col)
265
+ F.sqrt(col), F.pow(col, 2)
266
+
267
+ # 日期时间
268
+ F.current_date(), F.current_timestamp()
269
+ F.year(col), F.month(col), F.day(col)
270
+ F.date_add(col, 7), F.datediff(col1, col2)
271
+
272
+ # 条件
273
+ F.when(F.col("status") == "A", "Active").otherwise("Inactive")
274
+ F.coalesce(col1, col2) # 第一个非 null 值
275
+ F.isnull(col), F.isnotnull(col)
276
+
277
+ # 聚合
278
+ F.count("*"), F.sum(col), F.avg(col), F.max(col), F.min(col)
279
+ F.count_distinct(col)
280
+
281
+ # 类型转换
282
+ F.col("amount").cast(IntegerType()) # IntegerType 需从 clickzetta.zettapark.types 导入
283
+ ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@clickzetta/cz-cli-darwin-arm64",
3
- "version": "0.3.40",
3
+ "version": "0.3.41",
4
4
  "description": "cz-cli binary for macOS ARM64 (Apple Silicon)",
5
5
  "os": ["darwin"],
6
6
  "cpu": ["arm64"],
@@ -1,160 +0,0 @@
1
- ---
2
- name: clickzetta-ai-vector-search
3
- description: |
4
- 在 ClickZetta Lakehouse 中实现向量存储、向量索引(HNSW)和向量检索,
5
- 构建 RAG、语义搜索、图像检索等 AI 应用。覆盖 VECTOR 数据类型定义、
6
- 向量索引创建(cosine/l2/hamming 距离)、向量数据插入与转换、
7
- ANN 近似最近邻检索、向量+倒排索引融合检索等完整工作流。
8
- 当用户说"向量检索"、"向量索引"、"语义搜索"、"embedding 存储"、
9
- "RAG"、"ANN 搜索"、"HNSW"、"cosine_distance"、"l2_distance"、
10
- "VECTOR 类型"、"向量数据库"、"相似度搜索"、"向量 + 标量融合检索"、
11
- "文本向量化"时触发。
12
- Keywords: vector, HNSW, embedding, RAG, semantic search, similarity, VECTOR type
13
- ---
14
-
15
- # ClickZetta 向量检索
16
-
17
- Lakehouse 原生支持 VECTOR 数据类型和 HNSW 向量索引,无需独立向量数据库即可在同一张表中实现向量检索、全文检索和标量过滤的融合查询。
18
-
19
- 阅读 [references/vector-search.md](references/vector-search.md) 了解完整语法。
20
-
21
- ---
22
-
23
- ## 快速开始
24
-
25
- ### 1. 建表(含向量索引)
26
-
27
- ```sql
28
- CREATE TABLE doc_embeddings (
29
- id INT,
30
- content STRING,
31
- vec VECTOR(FLOAT, 1024),
32
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
33
- "distance.function" = "cosine_distance",
34
- "scalar.type" = "f32"
35
- )
36
- );
37
- ```
38
-
39
- ### 2. 插入向量数据
40
-
41
- ```sql
42
- -- 直接插入
43
- INSERT INTO doc_embeddings VALUES
44
- (1, '云器 Lakehouse 产品介绍', vector(0.12, 0.34, ...));
45
-
46
- -- 从字符串转换(适合 API 返回的 JSON 格式)
47
- INSERT INTO doc_embeddings (id, content, vec)
48
- SELECT id, content, CAST(embedding_str AS VECTOR(1024))
49
- FROM staging_table;
50
- ```
51
-
52
- ### 3. 向量检索
53
-
54
- ```sql
55
- -- 设置探索因子(精度 vs 速度)
56
- SET cz.vector.index.search.ef = 64;
57
-
58
- -- 余弦距离 Top-10 相似文档
59
- SELECT id, content, cosine_distance(vec, CAST('[0.12, 0.34, ...]' AS VECTOR(1024))) AS dist
60
- FROM doc_embeddings
61
- ORDER BY dist
62
- LIMIT 10;
63
- ```
64
-
65
- ---
66
-
67
- ## 向量 + 标量融合检索(RAG 场景)
68
-
69
- ```sql
70
- -- 先用标量过滤缩小范围,再用向量排序
71
- SELECT id, content, cosine_distance(vec, :query_embedding) AS dist
72
- FROM doc_embeddings
73
- WHERE category = 'product'
74
- AND created_at >= '2024-01-01'
75
- ORDER BY dist
76
- LIMIT 5;
77
- ```
78
-
79
- ---
80
-
81
- ## 向量 + 全文检索融合
82
-
83
- ```sql
84
- -- 建表:同时支持向量索引和倒排索引
85
- CREATE TABLE hybrid_docs (
86
- id INT,
87
- title STRING,
88
- body STRING,
89
- vec VECTOR(FLOAT, 1024),
90
- INDEX body_inv_idx (body) USING INVERTED,
91
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
92
- "distance.function" = "cosine_distance"
93
- )
94
- );
95
-
96
- -- 融合检索:关键词过滤 + 向量排序
97
- SELECT id, title, cosine_distance(vec, :query_vec) AS dist
98
- FROM hybrid_docs
99
- WHERE body LIKE '%向量检索%'
100
- ORDER BY dist
101
- LIMIT 10;
102
- ```
103
-
104
- ---
105
-
106
- ## 外部系统写入向量(ARRAY → VECTOR 转换)
107
-
108
- 外部系统(Python SDK、Kafka 等)不能直接写 VECTOR 类型,需先写 ARRAY 再转换:
109
-
110
- ```sql
111
- -- 暂存表(ARRAY 类型)
112
- CREATE TABLE staging (id INT, vec_array ARRAY<FLOAT>);
113
-
114
- -- 转换写入目标表
115
- INSERT INTO doc_embeddings (id, vec)
116
- SELECT id, CAST(vec_array AS VECTOR(FLOAT, 1024))
117
- FROM staging;
118
- ```
119
-
120
- ---
121
-
122
- ## 距离函数速查
123
-
124
- | 函数 | 适用场景 |
125
- |---|---|
126
- | `cosine_distance(v1, v2)` | 文本语义检索(最常用) |
127
- | `l2_distance(v1, v2)` | 图像/通用向量检索 |
128
- | `dot_product(v1, v2)` | 归一化向量的相似度 |
129
- | `hamming_distance(v1, v2)` | 二值向量(高效压缩) |
130
- | `binary_quantize(v)` | 将 float 向量压缩为二值向量 |
131
-
132
- ---
133
-
134
- ## 性能调优
135
-
136
- ```sql
137
- -- 调整探索因子(默认 64,越大精度越高但越慢)
138
- SET cz.vector.index.search.ef = 128;
139
-
140
- -- 验证向量索引是否生效
141
- EXPLAIN SELECT id, cosine_distance(vec, vector(0.1, 0.2)) AS dist
142
- FROM doc_embeddings ORDER BY dist LIMIT 10;
143
- -- 查看执行计划中是否有 vector_index_search_type 字样
144
- ```
145
-
146
- **最佳实践:**
147
- - 向量检索建议**单独占用 VCluster**,避免与其他查询争抢缓存
148
- - 大批量写入后执行 `BUILD INDEX vec_idx ON table_name` 为存量数据构建索引
149
- - 外部系统写入时先写 ARRAY,再批量 CAST 转换,避免频繁小文件
150
-
151
- ---
152
-
153
- ## 常见问题
154
-
155
- | 问题 | 原因 | 解决方案 |
156
- |---|---|---|
157
- | 向量索引未生效 | 存量数据未构建索引 | 执行 `BUILD INDEX idx ON table` |
158
- | 检索精度低 | ef 值太小 | 增大 `cz.vector.index.search.ef` |
159
- | 外部写入报错 | 不支持直接写 VECTOR | 先写 ARRAY,再 CAST 转换 |
160
- | 向量检索慢 | 与其他查询共用 VCluster | 为向量检索单独分配 VCluster |