@clickzetta/cz-cli-linux-x64 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -457
- package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
- package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
- package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
- package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
- package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -112
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -156
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
- package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -639
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
- package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -427
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -379
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
- package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-ai-vector-search
|
|
3
|
-
description: |
|
|
4
|
-
在 ClickZetta Lakehouse 中实现向量存储、向量索引(HNSW)和向量检索,
|
|
5
|
-
构建 RAG、语义搜索、图像检索等 AI 应用。覆盖 VECTOR 数据类型定义、
|
|
6
|
-
向量索引创建(cosine/l2/hamming 距离)、向量数据插入与转换、
|
|
7
|
-
ANN 近似最近邻检索、向量+倒排索引融合检索等完整工作流。
|
|
8
|
-
当用户说"向量检索"、"向量索引"、"语义搜索"、"embedding 存储"、
|
|
9
|
-
"RAG"、"ANN 搜索"、"HNSW"、"cosine_distance"、"l2_distance"、
|
|
10
|
-
"VECTOR 类型"、"向量数据库"、"相似度搜索"、"向量 + 标量融合检索"、
|
|
11
|
-
"文本向量化"时触发。
|
|
12
|
-
Keywords: vector, HNSW, embedding, RAG, semantic search, similarity, VECTOR type
|
|
13
|
-
---
|
|
14
|
-
|
|
15
|
-
# ClickZetta 向量检索
|
|
16
|
-
|
|
17
|
-
Lakehouse 原生支持 VECTOR 数据类型和 HNSW 向量索引,无需独立向量数据库即可在同一张表中实现向量检索、全文检索和标量过滤的融合查询。
|
|
18
|
-
|
|
19
|
-
阅读 [references/vector-search.md](references/vector-search.md) 了解完整语法。
|
|
20
|
-
|
|
21
|
-
---
|
|
22
|
-
|
|
23
|
-
## 快速开始
|
|
24
|
-
|
|
25
|
-
### 1. 建表(含向量索引)
|
|
26
|
-
|
|
27
|
-
```sql
|
|
28
|
-
CREATE TABLE doc_embeddings (
|
|
29
|
-
id INT,
|
|
30
|
-
content STRING,
|
|
31
|
-
vec VECTOR(FLOAT, 1024),
|
|
32
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
33
|
-
"distance.function" = "cosine_distance",
|
|
34
|
-
"scalar.type" = "f32"
|
|
35
|
-
)
|
|
36
|
-
);
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
### 2. 插入向量数据
|
|
40
|
-
|
|
41
|
-
```sql
|
|
42
|
-
-- 直接插入
|
|
43
|
-
INSERT INTO doc_embeddings VALUES
|
|
44
|
-
(1, '云器 Lakehouse 产品介绍', vector(0.12, 0.34, ...));
|
|
45
|
-
|
|
46
|
-
-- 从字符串转换(适合 API 返回的 JSON 格式)
|
|
47
|
-
INSERT INTO doc_embeddings (id, content, vec)
|
|
48
|
-
SELECT id, content, CAST(embedding_str AS VECTOR(1024))
|
|
49
|
-
FROM staging_table;
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
### 3. 向量检索
|
|
53
|
-
|
|
54
|
-
```sql
|
|
55
|
-
-- 设置探索因子(精度 vs 速度)
|
|
56
|
-
SET cz.vector.index.search.ef = 64;
|
|
57
|
-
|
|
58
|
-
-- 余弦距离 Top-10 相似文档
|
|
59
|
-
SELECT id, content, cosine_distance(vec, CAST('[0.12, 0.34, ...]' AS VECTOR(1024))) AS dist
|
|
60
|
-
FROM doc_embeddings
|
|
61
|
-
ORDER BY dist
|
|
62
|
-
LIMIT 10;
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
---
|
|
66
|
-
|
|
67
|
-
## 向量 + 标量融合检索(RAG 场景)
|
|
68
|
-
|
|
69
|
-
```sql
|
|
70
|
-
-- 先用标量过滤缩小范围,再用向量排序
|
|
71
|
-
SELECT id, content, cosine_distance(vec, :query_embedding) AS dist
|
|
72
|
-
FROM doc_embeddings
|
|
73
|
-
WHERE category = 'product'
|
|
74
|
-
AND created_at >= '2024-01-01'
|
|
75
|
-
ORDER BY dist
|
|
76
|
-
LIMIT 5;
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
---
|
|
80
|
-
|
|
81
|
-
## 向量 + 全文检索融合
|
|
82
|
-
|
|
83
|
-
```sql
|
|
84
|
-
-- 建表:同时支持向量索引和倒排索引
|
|
85
|
-
CREATE TABLE hybrid_docs (
|
|
86
|
-
id INT,
|
|
87
|
-
title STRING,
|
|
88
|
-
body STRING,
|
|
89
|
-
vec VECTOR(FLOAT, 1024),
|
|
90
|
-
INDEX body_inv_idx (body) USING INVERTED,
|
|
91
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
92
|
-
"distance.function" = "cosine_distance"
|
|
93
|
-
)
|
|
94
|
-
);
|
|
95
|
-
|
|
96
|
-
-- 融合检索:关键词过滤 + 向量排序
|
|
97
|
-
SELECT id, title, cosine_distance(vec, :query_vec) AS dist
|
|
98
|
-
FROM hybrid_docs
|
|
99
|
-
WHERE body LIKE '%向量检索%'
|
|
100
|
-
ORDER BY dist
|
|
101
|
-
LIMIT 10;
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
---
|
|
105
|
-
|
|
106
|
-
## 外部系统写入向量(ARRAY → VECTOR 转换)
|
|
107
|
-
|
|
108
|
-
外部系统(Python SDK、Kafka 等)不能直接写 VECTOR 类型,需先写 ARRAY 再转换:
|
|
109
|
-
|
|
110
|
-
```sql
|
|
111
|
-
-- 暂存表(ARRAY 类型)
|
|
112
|
-
CREATE TABLE staging (id INT, vec_array ARRAY<FLOAT>);
|
|
113
|
-
|
|
114
|
-
-- 转换写入目标表
|
|
115
|
-
INSERT INTO doc_embeddings (id, vec)
|
|
116
|
-
SELECT id, CAST(vec_array AS VECTOR(FLOAT, 1024))
|
|
117
|
-
FROM staging;
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
---
|
|
121
|
-
|
|
122
|
-
## 距离函数速查
|
|
123
|
-
|
|
124
|
-
| 函数 | 适用场景 |
|
|
125
|
-
|---|---|
|
|
126
|
-
| `cosine_distance(v1, v2)` | 文本语义检索(最常用) |
|
|
127
|
-
| `l2_distance(v1, v2)` | 图像/通用向量检索 |
|
|
128
|
-
| `dot_product(v1, v2)` | 归一化向量的相似度(点积,越大越相似) |
|
|
129
|
-
| `hamming_distance(v1, v2)` | 二值向量(高效压缩) |
|
|
130
|
-
| `binary_quantize(v)` | 将 float 向量压缩为二值向量 |
|
|
131
|
-
|
|
132
|
-
---
|
|
133
|
-
|
|
134
|
-
## 性能调优
|
|
135
|
-
|
|
136
|
-
```sql
|
|
137
|
-
-- 调整探索因子(默认 64,越大精度越高但越慢)
|
|
138
|
-
SET cz.vector.index.search.ef = 128;
|
|
139
|
-
|
|
140
|
-
-- 验证向量索引是否生效(示例查询向量已省略,实际需传入与 vec 列同维度的向量)
|
|
141
|
-
EXPLAIN SELECT id, cosine_distance(vec, vector(0.1, 0.2)) AS dist
|
|
142
|
-
FROM doc_embeddings ORDER BY dist LIMIT 10;
|
|
143
|
-
-- 查看执行计划中是否有 vector_index_search_type 字样
|
|
144
|
-
```
|
|
145
|
-
|
|
146
|
-
**最佳实践:**
|
|
147
|
-
- 向量检索建议**单独占用 VCluster**,避免与其他查询争抢缓存
|
|
148
|
-
- 大批量写入后执行 `BUILD INDEX vec_idx ON table_name` 为存量数据构建索引
|
|
149
|
-
- 外部系统写入时先写 ARRAY,再批量 CAST 转换,避免频繁小文件
|
|
150
|
-
|
|
151
|
-
---
|
|
152
|
-
|
|
153
|
-
## 常见问题
|
|
154
|
-
|
|
155
|
-
| 问题 | 原因 | 解决方案 |
|
|
156
|
-
|---|---|---|
|
|
157
|
-
| 向量索引未生效 | 存量数据未构建索引 | 执行 `BUILD INDEX vec_idx ON table_name` |
|
|
158
|
-
| 检索精度低 | ef 值太小 | 增大 `cz.vector.index.search.ef` |
|
|
159
|
-
| 外部写入报错 | 不支持直接写 VECTOR | 先写 ARRAY,再 CAST 转换 |
|
|
160
|
-
| 向量检索慢 | 与其他查询共用 VCluster | 为向量检索单独分配 VCluster |
|
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
# 向量检索参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/vector-search 等
|
|
4
|
-
|
|
5
|
-
## VECTOR 数据类型
|
|
6
|
-
|
|
7
|
-
```sql
|
|
8
|
-
-- 语法
|
|
9
|
-
vector(scalar_type, dimension)
|
|
10
|
-
vector(dimension) -- 默认 float 类型
|
|
11
|
-
|
|
12
|
-
-- 示例
|
|
13
|
-
CREATE TABLE embeddings (
|
|
14
|
-
id INT,
|
|
15
|
-
content STRING,
|
|
16
|
-
vec VECTOR(FLOAT, 1024), -- 1024 维 float 向量
|
|
17
|
-
vec_bin VECTOR(TINYINT, 128) -- 128 维 tinyint 向量(二值化)
|
|
18
|
-
);
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
支持的元素类型:`FLOAT`(f32)、`TINYINT`(i8/b1)
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## 创建向量索引
|
|
26
|
-
|
|
27
|
-
```sql
|
|
28
|
-
-- 建表时内联创建
|
|
29
|
-
CREATE TABLE doc_embeddings (
|
|
30
|
-
id INT,
|
|
31
|
-
content STRING,
|
|
32
|
-
vec VECTOR(FLOAT, 1024),
|
|
33
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
34
|
-
"distance.function" = "cosine_distance",
|
|
35
|
-
"scalar.type" = "f32",
|
|
36
|
-
"m" = "16",
|
|
37
|
-
"ef.construction" = "128"
|
|
38
|
-
)
|
|
39
|
-
);
|
|
40
|
-
|
|
41
|
-
-- 在已有表上添加向量索引
|
|
42
|
-
ALTER TABLE doc_embeddings ADD INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
43
|
-
"distance.function" = "cosine_distance",
|
|
44
|
-
"scalar.type" = "f32"
|
|
45
|
-
);
|
|
46
|
-
|
|
47
|
-
-- 为存量数据构建索引
|
|
48
|
-
BUILD INDEX vec_idx ON doc_embeddings;
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
### 关键参数
|
|
52
|
-
|
|
53
|
-
| 参数 | 可选值 | 默认值 | 说明 |
|
|
54
|
-
|---|---|---|---|
|
|
55
|
-
| distance.function | l2_distance, cosine_distance, jaccard_distance, hamming_distance | cosine_distance | 距离函数 |
|
|
56
|
-
| scalar.type | f32, f16, i8, b1 | f32 | 索引元素类型 |
|
|
57
|
-
| m | 建议 ≤ 1000 | 16 | HNSW 最大邻居数 |
|
|
58
|
-
| ef.construction | 建议 ≤ 5000 | 128 | 构建时候选集大小 |
|
|
59
|
-
| compress.codec | uncompressed/zstd/lz4 | uncompressed | 压缩算法 |
|
|
60
|
-
|
|
61
|
-
---
|
|
62
|
-
|
|
63
|
-
## 插入向量数据
|
|
64
|
-
|
|
65
|
-
```sql
|
|
66
|
-
-- 直接插入
|
|
67
|
-
INSERT INTO doc_embeddings (id, content, vec) VALUES
|
|
68
|
-
(1, 'hello world', vector(0.1, 0.2, 0.3, ...)),
|
|
69
|
-
(2, 'foo bar', vector(0.4, 0.5, 0.6, ...));
|
|
70
|
-
|
|
71
|
-
-- 从字符串转换(示例为 3 维;实际维度需与目标列定义一致,如 VECTOR(1024))
|
|
72
|
-
INSERT INTO doc_embeddings (id, vec)
|
|
73
|
-
SELECT id, CAST('[0.1, 0.2, 0.3]' AS VECTOR(3))
|
|
74
|
-
FROM source_table;
|
|
75
|
-
|
|
76
|
-
-- 从 ARRAY 列转换(外部系统写入场景;INSERT OVERWRITE 会覆盖目标表已有数据,如需追加请改用 INSERT INTO)
|
|
77
|
-
INSERT OVERWRITE doc_embeddings
|
|
78
|
-
SELECT id, content, CAST(vec_array AS VECTOR(FLOAT, 1024))
|
|
79
|
-
FROM staging_table;
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
---
|
|
83
|
-
|
|
84
|
-
## 向量检索
|
|
85
|
-
|
|
86
|
-
```sql
|
|
87
|
-
-- 调整探索因子(精度 vs 速度权衡)
|
|
88
|
-
SET cz.vector.index.search.ef = 64;
|
|
89
|
-
|
|
90
|
-
-- L2 距离检索(欧几里得距离,越小越相似)
|
|
91
|
-
SELECT id, content, l2_distance(vec, vector(0.1, 0.2, 0.3, ...)) AS dist
|
|
92
|
-
FROM doc_embeddings
|
|
93
|
-
ORDER BY dist
|
|
94
|
-
LIMIT 10;
|
|
95
|
-
|
|
96
|
-
-- 余弦距离检索(越小越相似;示例查询向量为 3 维,实际维度需与 vec 列一致)
|
|
97
|
-
SELECT id, content, cosine_distance(vec, CAST('[0.1,0.2,0.3]' AS VECTOR(3))) AS dist
|
|
98
|
-
FROM doc_embeddings
|
|
99
|
-
ORDER BY dist
|
|
100
|
-
LIMIT 10;
|
|
101
|
-
|
|
102
|
-
-- 带过滤条件的向量检索(向量 + 标量融合)
|
|
103
|
-
SELECT id, content, cosine_distance(vec, :query_vec) AS dist
|
|
104
|
-
FROM doc_embeddings
|
|
105
|
-
WHERE category = 'tech'
|
|
106
|
-
AND cosine_distance(vec, :query_vec) < 0.3
|
|
107
|
-
ORDER BY dist
|
|
108
|
-
LIMIT 10;
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
---
|
|
112
|
-
|
|
113
|
-
## 距离函数速查
|
|
114
|
-
|
|
115
|
-
| 函数 | 适用场景 | 说明 |
|
|
116
|
-
|---|---|---|
|
|
117
|
-
| `l2_distance(v1, v2)` | 通用语义检索 | 欧几里得距离,越小越相似 |
|
|
118
|
-
| `cosine_distance(v1, v2)` | 文本语义检索 | 余弦距离,越小越相似 |
|
|
119
|
-
| `dot_product(v1, v2)` | 归一化向量 | 点积,越大越相似 |
|
|
120
|
-
| `hamming_distance(v1, v2)` | 二值向量 | 汉明距离,越小越相似 |
|
|
121
|
-
| `jaccard_distance(v1, v2)` | 集合相似度 | 雅卡德距离,越小越相似 |
|
|
122
|
-
| `binary_quantize(v)` | 向量压缩 | 将 float 向量二值化 |
|
|
123
|
-
|
|
124
|
-
---
|
|
125
|
-
|
|
126
|
-
## 向量 + 倒排索引融合检索
|
|
127
|
-
|
|
128
|
-
```sql
|
|
129
|
-
-- 建表:同时支持向量索引和倒排索引
|
|
130
|
-
CREATE TABLE hybrid_search (
|
|
131
|
-
id INT,
|
|
132
|
-
content STRING,
|
|
133
|
-
vec VECTOR(FLOAT, 1024),
|
|
134
|
-
INDEX content_inv_idx (content) USING INVERTED,
|
|
135
|
-
INDEX vec_idx (vec) USING VECTOR PROPERTIES (
|
|
136
|
-
"distance.function" = "cosine_distance"
|
|
137
|
-
)
|
|
138
|
-
);
|
|
139
|
-
|
|
140
|
-
-- 融合检索:先用倒排过滤,再用向量排序
|
|
141
|
-
SELECT id, content, cosine_distance(vec, :query_vec) AS dist
|
|
142
|
-
FROM hybrid_search
|
|
143
|
-
WHERE content LIKE '%关键词%'
|
|
144
|
-
ORDER BY dist
|
|
145
|
-
LIMIT 10;
|
|
146
|
-
```
|
|
147
|
-
|
|
148
|
-
---
|
|
149
|
-
|
|
150
|
-
## 注意事项
|
|
151
|
-
|
|
152
|
-
- 向量类型不支持 `ORDER BY` 或 `GROUP BY`(只能对距离函数结果排序)
|
|
153
|
-
- 向量索引性能与内存/磁盘缓存直接相关,建议**单独占用 VCluster**
|
|
154
|
-
- 外部系统写入时不能直接写 VECTOR 类型,需先写 ARRAY 再 CAST 转换
|
|
155
|
-
- `cz.vector.index.search.ef`(探索因子 ef)值越大,检索精度越高但延迟越大;建议从 64 开始调优
|
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-app-python-sdk
|
|
3
|
-
description: |
|
|
4
|
-
在 Python 应用程序中集成 ClickZetta Lakehouse 的官方 SDK 用法。
|
|
5
|
-
覆盖 clickzetta-connector-python(SQL 查询、参数绑定、批量插入、异步执行)、
|
|
6
|
-
clickzetta-ingestion-python(BulkLoad 批量上传,单线程与分布式模式)、
|
|
7
|
-
clickzetta-ingestion-python-v2(IGS 实时写入,秒级可查,支持主键表 CDC)、
|
|
8
|
-
SQLAlchemy dialect 集成,以及连接参数说明。
|
|
9
|
-
当用户说"Python SDK"、"clickzetta-connector-python"、"clickzetta-ingestion-python"、
|
|
10
|
-
"Python 查询 Lakehouse"、"Python 写入 Lakehouse"、"Python 批量上传"、
|
|
11
|
-
"BulkLoad Python"、"SQLAlchemy Lakehouse"、"Python 连接 Lakehouse"、
|
|
12
|
-
"executemany"、"execute_async"、"参数绑定 Python"、
|
|
13
|
-
"IGS 实时写入"、"实时写入 Python"、"ingestion-python-v2"、
|
|
14
|
-
"主键表写入 Python"、"CDC 写入"、"UPSERT Python"时触发。
|
|
15
|
-
Keywords: Python SDK, clickzetta-connector-python, clickzetta-ingestion-python, bulk insert, async query, SQLAlchemy, IGS
|
|
16
|
-
---
|
|
17
|
-
|
|
18
|
-
# ClickZetta Lakehouse — Python SDK
|
|
19
|
-
|
|
20
|
-
官方提供三个 Python 包:
|
|
21
|
-
- **`clickzetta-connector-python`** — SQL 查询接口(PEP-249 规范),支持参数绑定、批量插入、异步执行、SQLAlchemy dialect
|
|
22
|
-
- **`clickzetta-ingestion-python`** — 高吞吐批量上传(BulkLoad),数据直传对象存储,不消耗计算资源
|
|
23
|
-
- **`clickzetta-ingestion-python-v2`** — IGS 实时写入,秒级可查,支持主键表 CDC(UPSERT/DELETE)
|
|
24
|
-
|
|
25
|
-
阅读 [references/connector.md](references/connector.md) 了解 SQL 查询接口,[references/bulkload.md](references/bulkload.md) 了解批量上传,[references/realtime.md](references/realtime.md) 了解 IGS 实时写入。
|
|
26
|
-
|
|
27
|
-
---
|
|
28
|
-
|
|
29
|
-
## 安装
|
|
30
|
-
|
|
31
|
-
```bash
|
|
32
|
-
# SQL 查询接口
|
|
33
|
-
pip install clickzetta-connector-python -U
|
|
34
|
-
|
|
35
|
-
# 批量上传(按云环境选择)
|
|
36
|
-
pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
|
|
37
|
-
pip install "clickzetta-ingestion-python[s3]" -U # AWS
|
|
38
|
-
pip install "clickzetta-ingestion-python[all]" -U # 全部(安装较慢)
|
|
39
|
-
|
|
40
|
-
# IGS 实时写入
|
|
41
|
-
pip install clickzetta-ingestion-python-v2
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
> 注意:旧版 `clickzetta-connector` 已停止维护,请迁移到 `clickzetta-connector-python`。
|
|
45
|
-
|
|
46
|
-
---
|
|
47
|
-
|
|
48
|
-
## 连接参数
|
|
49
|
-
|
|
50
|
-
```python
|
|
51
|
-
from clickzetta import connect
|
|
52
|
-
|
|
53
|
-
conn = connect(
|
|
54
|
-
username='your_username',
|
|
55
|
-
password='your_password',
|
|
56
|
-
service='api.clickzetta.com', # 连接地址,格式:region.api.clickzetta.com
|
|
57
|
-
instance='your_instance',
|
|
58
|
-
workspace='your_workspace',
|
|
59
|
-
schema='public',
|
|
60
|
-
vcluster='default'
|
|
61
|
-
)
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
| 参数 | 必填 | 说明 |
|
|
65
|
-
|---|---|---|
|
|
66
|
-
| `username` | ✅ | 用户名 |
|
|
67
|
-
| `password` | ✅ | 密码 |
|
|
68
|
-
| `service` | ✅ | 连接地址,格式 `region.api.clickzetta.com` |
|
|
69
|
-
| `instance` | ✅ | 实例名,在 Studio 工作空间 JDBC 连接串中查看 |
|
|
70
|
-
| `workspace` | ✅ | 工作空间名 |
|
|
71
|
-
| `vcluster` | ✅ | 虚拟集群名 |
|
|
72
|
-
| `schema` | ✅ | 默认 schema |
|
|
73
|
-
|
|
74
|
-
---
|
|
75
|
-
|
|
76
|
-
## 快速示例
|
|
77
|
-
|
|
78
|
-
```python
|
|
79
|
-
# 查询
|
|
80
|
-
cursor = conn.cursor()
|
|
81
|
-
cursor.execute('SELECT * FROM orders LIMIT 10')
|
|
82
|
-
results = cursor.fetchall()
|
|
83
|
-
cursor.close()
|
|
84
|
-
conn.close()
|
|
85
|
-
|
|
86
|
-
# 参数绑定(防 SQL 注入;需在 cursor 和 conn 关闭之前执行)
|
|
87
|
-
cursor.execute('INSERT INTO test (id, name) VALUES (?, ?)', binding_params=[1, 'test'])
|
|
88
|
-
|
|
89
|
-
# 批量插入
|
|
90
|
-
data = [(1, 'a'), (2, 'b'), (3, 'c')]
|
|
91
|
-
cursor.executemany('INSERT INTO test (id, name) VALUES (?, ?)', data)
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## IGS 实时写入快速示例(ingestion-python-v2)
|
|
95
|
-
|
|
96
|
-
普通表(APPEND_ONLY):
|
|
97
|
-
|
|
98
|
-
```python
|
|
99
|
-
from clickzetta.connector.v0.connection import connect
|
|
100
|
-
from clickzetta.connector.v0.enums import RealtimeOperation
|
|
101
|
-
from clickzetta_ingestion.realtime.realtime_options import RealtimeOptionsBuilder, FlushMode
|
|
102
|
-
from clickzetta_ingestion.realtime.arrow_stream import RowOperator
|
|
103
|
-
|
|
104
|
-
with connect(**CONN_ARGS) as conn:
|
|
105
|
-
stream = conn.get_realtime_stream(
|
|
106
|
-
schema='your_schema',
|
|
107
|
-
table='your_table',
|
|
108
|
-
operate=RealtimeOperation.APPEND_ONLY,
|
|
109
|
-
options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_BACKGROUND).build()
|
|
110
|
-
)
|
|
111
|
-
row = stream.create_row(RowOperator.INSERT)
|
|
112
|
-
row.set_value('id', 1)
|
|
113
|
-
row.set_value('name', 'alice')
|
|
114
|
-
stream.apply(row)
|
|
115
|
-
stream.close()
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
主键表 CDC(UPSERT / DELETE):
|
|
119
|
-
|
|
120
|
-
```python
|
|
121
|
-
# 建表:CREATE TABLE users (id STRING NOT NULL PRIMARY KEY, name STRING, age INT);
|
|
122
|
-
|
|
123
|
-
with connect(**CONN_ARGS) as conn:
|
|
124
|
-
stream = conn.get_realtime_stream(
|
|
125
|
-
schema='your_schema',
|
|
126
|
-
table='users',
|
|
127
|
-
operate=RealtimeOperation.CDC, # 主键表必须用 CDC
|
|
128
|
-
options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_SYNC).build()
|
|
129
|
-
)
|
|
130
|
-
# UPSERT
|
|
131
|
-
row = stream.create_row(RowOperator.UPSERT)
|
|
132
|
-
row.set_value('id', 'u1')
|
|
133
|
-
row.set_value('name', 'bob')
|
|
134
|
-
row.set_value('age', 25)
|
|
135
|
-
stream.apply(row)
|
|
136
|
-
# DELETE_IGNORE
|
|
137
|
-
row = stream.create_row(RowOperator.DELETE_IGNORE)
|
|
138
|
-
row.set_value('id', 'u1')
|
|
139
|
-
stream.apply(row)
|
|
140
|
-
stream.close()
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
---
|
|
144
|
-
|
|
145
|
-
## 选择指南
|
|
146
|
-
|
|
147
|
-
| 场景 | 推荐方案 |
|
|
148
|
-
|---|---|
|
|
149
|
-
| 查询 / 小批量写入 | `clickzetta-connector-python` |
|
|
150
|
-
| 大批量数据导入(GB 级,间隔 ≥ 5 分钟) | `clickzetta-ingestion-python` BulkLoad |
|
|
151
|
-
| 高频小批写入(间隔 < 5 分钟,秒级可查) | `clickzetta-ingestion-python-v2` 实时写入 |
|
|
152
|
-
| 主键表写入(UPSERT / DELETE) | `clickzetta-ingestion-python-v2` CDC 模式 |
|
|
153
|
-
| SQLAlchemy / ORM 集成 | `clickzetta-connector-python`(内置 dialect) |
|
|
@@ -1,196 +0,0 @@
|
|
|
1
|
-
# clickzetta-ingestion-python BulkLoad 详细参考
|
|
2
|
-
|
|
3
|
-
## 安装
|
|
4
|
-
|
|
5
|
-
```bash
|
|
6
|
-
# 按云环境选择(推荐按需安装,all 安装较慢且可能冲突)
|
|
7
|
-
pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
|
|
8
|
-
pip install "clickzetta-ingestion-python[s3]" -U # AWS
|
|
9
|
-
pip install "clickzetta-ingestion-python[cos]" -U # 腾讯云
|
|
10
|
-
pip install "clickzetta-ingestion-python[gcp]" -U # Google Cloud
|
|
11
|
-
pip install "clickzetta-ingestion-python[all]" -U # 全部
|
|
12
|
-
```
|
|
13
|
-
|
|
14
|
-
## 工作原理
|
|
15
|
-
|
|
16
|
-
```
|
|
17
|
-
[SDK 写入数据] → [对象存储] → [调用 commit()] → [触发 SQL 导入] → [Lakehouse 表]
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
- 数据上传阶段不消耗计算资源
|
|
21
|
-
- `commit()` 触发从对象存储到 Lakehouse 表的导入,消耗少量计算资源
|
|
22
|
-
- `commit()` 只能调用一次,commit 后数据可见
|
|
23
|
-
|
|
24
|
-
## 使用限制
|
|
25
|
-
|
|
26
|
-
- **不支持主键(pk)表写入**
|
|
27
|
-
- **不适合时间间隔小于 5 分钟的高频写入**
|
|
28
|
-
|
|
29
|
-
## 单线程写入
|
|
30
|
-
|
|
31
|
-
### 建表
|
|
32
|
-
|
|
33
|
-
```sql
|
|
34
|
-
CREATE TABLE public.bulkload_test (
|
|
35
|
-
i BIGINT,
|
|
36
|
-
s STRING,
|
|
37
|
-
d DOUBLE
|
|
38
|
-
);
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
### 完整示例
|
|
42
|
-
|
|
43
|
-
```python
|
|
44
|
-
from clickzetta import connect
|
|
45
|
-
|
|
46
|
-
conn = connect(
|
|
47
|
-
username='your_username',
|
|
48
|
-
password='your_password',
|
|
49
|
-
service='api.clickzetta.com',
|
|
50
|
-
instance='your_instance',
|
|
51
|
-
workspace='your_workspace',
|
|
52
|
-
schema='public',
|
|
53
|
-
vcluster='default'
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
|
|
57
|
-
|
|
58
|
-
writer = bulkload_stream.open_writer(0) # 单线程传 0
|
|
59
|
-
for index in range(1000000):
|
|
60
|
-
row = writer.create_row()
|
|
61
|
-
row.set_value('i', index) # 按列名设值
|
|
62
|
-
row.set_value('s', 'Hello')
|
|
63
|
-
row.set_value('d', 123.456)
|
|
64
|
-
writer.write(row)
|
|
65
|
-
writer.close()
|
|
66
|
-
|
|
67
|
-
bulkload_stream.commit() # 提交,数据可见
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
## 读取 CSV 写入示例
|
|
71
|
-
|
|
72
|
-
```python
|
|
73
|
-
from clickzetta import connect
|
|
74
|
-
import csv
|
|
75
|
-
|
|
76
|
-
conn = connect(
|
|
77
|
-
username='',
|
|
78
|
-
password='',
|
|
79
|
-
service='api.clickzetta.com',
|
|
80
|
-
instance='',
|
|
81
|
-
workspace='',
|
|
82
|
-
schema='public',
|
|
83
|
-
vcluster='default_ap'
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulk_order_payments')
|
|
87
|
-
writer = bulkload_stream.open_writer(0)
|
|
88
|
-
|
|
89
|
-
with open('olist_order_payments_dataset.csv', 'r') as csvfile:
|
|
90
|
-
reader = csv.reader(csvfile)
|
|
91
|
-
next(reader) # 跳过 header
|
|
92
|
-
for record in reader:
|
|
93
|
-
row = writer.create_row()
|
|
94
|
-
row.set_value('order_id', record[0])
|
|
95
|
-
row.set_value('payment_sequence', int(record[1]))
|
|
96
|
-
row.set_value('payment_type', record[2])
|
|
97
|
-
row.set_value('payment_installments', int(record[3]))
|
|
98
|
-
row.set_value('payment_value', float(record[4]))
|
|
99
|
-
writer.write(row) # ⚠️ 必须调用,否则数据不发送到服务端
|
|
100
|
-
|
|
101
|
-
writer.close()
|
|
102
|
-
bulkload_stream.commit()
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
## 写入模式
|
|
106
|
-
|
|
107
|
-
```python
|
|
108
|
-
from clickzetta.bulkload.bulkload_enums import BulkLoadOperation
|
|
109
|
-
|
|
110
|
-
# APPEND 模式(默认):新数据追加,不影响旧数据
|
|
111
|
-
bulkload_stream = conn.create_bulkload_stream(schema='public', table='my_table')
|
|
112
|
-
|
|
113
|
-
# OVERWRITE 模式:清空旧数据,写入新数据
|
|
114
|
-
bulkload_stream = conn.create_bulkload_stream(
|
|
115
|
-
schema='public',
|
|
116
|
-
table='my_table',
|
|
117
|
-
operation=BulkLoadOperation.OVERWRITE
|
|
118
|
-
)
|
|
119
|
-
|
|
120
|
-
# 分区表 OVERWRITE(只覆盖指定分区)
|
|
121
|
-
bulkload_stream = conn.create_bulkload_stream(
|
|
122
|
-
schema='public',
|
|
123
|
-
table='my_partitioned_table',
|
|
124
|
-
partition_spec='pt=2024-01-01',
|
|
125
|
-
operation=BulkLoadOperation.OVERWRITE
|
|
126
|
-
)
|
|
127
|
-
```
|
|
128
|
-
|
|
129
|
-
## 分布式并发写入
|
|
130
|
-
|
|
131
|
-
适合 GB 级以上数据,多进程并发写入同一 stream,最后统一 commit。
|
|
132
|
-
|
|
133
|
-
### 控制进程
|
|
134
|
-
|
|
135
|
-
```python
|
|
136
|
-
import subprocess
|
|
137
|
-
from clickzetta import connect
|
|
138
|
-
|
|
139
|
-
conn = connect(username='username', password='password',
|
|
140
|
-
service='api.clickzetta.com', instance='instance',
|
|
141
|
-
workspace='quickstart_ws', schema='public', vcluster='default')
|
|
142
|
-
|
|
143
|
-
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
|
|
144
|
-
stream_id = bulkload_stream.get_stream_id()
|
|
145
|
-
|
|
146
|
-
# 启动多个写入进程,每个进程用不同的 writer_id
|
|
147
|
-
p1 = subprocess.Popen(['python', 'writer.py', stream_id, '1'])
|
|
148
|
-
p2 = subprocess.Popen(['python', 'writer.py', stream_id, '2'])
|
|
149
|
-
p1.wait()
|
|
150
|
-
p2.wait()
|
|
151
|
-
|
|
152
|
-
bulkload_stream.commit() # 所有 writer 完成后统一 commit
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
### 写入进程
|
|
156
|
-
|
|
157
|
-
```python
|
|
158
|
-
import sys
|
|
159
|
-
from clickzetta import connect
|
|
160
|
-
|
|
161
|
-
conn = connect(username='username', password='password',
|
|
162
|
-
service='api.clickzetta.com', instance='instance',
|
|
163
|
-
workspace='quickstart_ws', schema='public', vcluster='default')
|
|
164
|
-
|
|
165
|
-
stream_id = sys.argv[1]
|
|
166
|
-
writer_id = int(sys.argv[2])
|
|
167
|
-
|
|
168
|
-
# 通过 stream_id 获取已有 stream(不创建新的)
|
|
169
|
-
bulkload_stream = conn.get_bulkload_stream(
|
|
170
|
-
schema='public', table='bulkload_test', stream_id=stream_id
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
writer = bulkload_stream.open_writer(writer_id) # writer_id 必须唯一
|
|
174
|
-
for index in range(1, 1000000):
|
|
175
|
-
row = writer.create_row()
|
|
176
|
-
row.set_value('i', index)
|
|
177
|
-
row.set_value('s', 'Hello')
|
|
178
|
-
row.set_value('d', 123.456)
|
|
179
|
-
writer.write(row)
|
|
180
|
-
writer.close()
|
|
181
|
-
# 写入进程不调用 commit,只有控制进程调用
|
|
182
|
-
```
|
|
183
|
-
|
|
184
|
-
## 关键 API
|
|
185
|
-
|
|
186
|
-
| API | 说明 |
|
|
187
|
-
|---|---|
|
|
188
|
-
| `conn.create_bulkload_stream(schema, table)` | 创建新的 bulkload stream |
|
|
189
|
-
| `conn.get_bulkload_stream(schema, table, stream_id)` | 获取已有 stream(分布式写入用) |
|
|
190
|
-
| `bulkload_stream.get_stream_id()` | 获取 stream id(传给写入进程) |
|
|
191
|
-
| `bulkload_stream.open_writer(writer_id)` | 创建 writer,id 必须唯一 |
|
|
192
|
-
| `writer.create_row()` | 创建行对象 |
|
|
193
|
-
| `row.set_value(column_name, value)` | 按列名设值 |
|
|
194
|
-
| `writer.write(row)` | 写入行(必须调用) |
|
|
195
|
-
| `writer.close()` | 关闭 writer(写完必须调用) |
|
|
196
|
-
| `bulkload_stream.commit()` | 提交,数据可见(只能调用一次) |
|