@clickzetta/cz-cli-darwin-x64 0.3.18 → 0.3.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +160 -0
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +155 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +259 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
- package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
- package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
- package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +483 -0
- package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
- package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
- package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
- package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +199 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-overview/SKILL.md +102 -0
- package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
- package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
- package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/cz-cli/SKILL.md +1 -1
- package/bin/skills/cz-cli-inner/SKILL.md +8 -0
- package/package.json +1 -1
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# External Function DDL 参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/CREATE_EXTERNATL_FUNCTION 等
|
|
4
|
+
|
|
5
|
+
## 概念
|
|
6
|
+
|
|
7
|
+
External Function(外部函数)是用 Python 或 Java 编写、在云函数服务(阿里云 FC / 腾讯云 SCF / AWS Lambda)上执行的用户自定义函数(UDF)。在函数中可以调用:
|
|
8
|
+
- **在线服务**:LLM API、图像识别 API 等
|
|
9
|
+
- **离线模型**:打包上传的 Hugging Face 模型等
|
|
10
|
+
|
|
11
|
+
支持的函数类型:UDF(标量函数,Python / Java 均支持)、UDAF(聚合函数,仅 Java)、UDTF(表函数,仅 Java)。
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## CREATE API CONNECTION(云函数连接)
|
|
16
|
+
|
|
17
|
+
```sql
|
|
18
|
+
CREATE API CONNECTION IF NOT EXISTS my_fc_conn
|
|
19
|
+
TYPE CLOUD_FUNCTION
|
|
20
|
+
PROVIDER = 'aliyun' -- 'aliyun' | 'tencent' | 'aws'
|
|
21
|
+
REGION = 'cn-shanghai'
|
|
22
|
+
ROLE_ARN = 'acs:ram::1234567890:role/CzUDFRole'
|
|
23
|
+
NAMESPACE = 'default' -- 腾讯云必填,其他填 'default'
|
|
24
|
+
CODE_BUCKET = 'my-oss-bucket';
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
| 参数 | 说明 |
|
|
28
|
+
|---|---|
|
|
29
|
+
| PROVIDER | `'aliyun'` / `'tencent'` / `'aws'` |
|
|
30
|
+
| REGION | 阿里云:`cn-shanghai`;腾讯云:`ap-beijing`;AWS:`cn-northwest-1` |
|
|
31
|
+
| ROLE_ARN | 授权给 Lakehouse 的 RAM 角色 ARN |
|
|
32
|
+
| NAMESPACE | 腾讯云命名空间(必填);其他填 `'default'` |
|
|
33
|
+
| CODE_BUCKET | 存放函数代码包的 OSS/COS/S3 bucket 名称 |
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## CREATE EXTERNAL FUNCTION
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
CREATE EXTERNAL FUNCTION IF NOT EXISTS my_schema.my_udf
|
|
41
|
+
AS 'module_name.ClassName'
|
|
42
|
+
USING FILE = 'oss://my-bucket/functions/code.zip'
|
|
43
|
+
CONNECTION = my_fc_conn
|
|
44
|
+
WITH PROPERTIES (
|
|
45
|
+
'remote.udf.api' = 'python3.mc.v0' -- Python: python3.mc.v0 | Java: java8.hive2.v0
|
|
46
|
+
)
|
|
47
|
+
COMMENT '自定义函数说明';
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### 资源文件地址格式
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
-- OSS/COS/S3
|
|
54
|
+
oss://bucket-name/path/to/code.zip
|
|
55
|
+
cos://bucket-name/path/to/code.zip
|
|
56
|
+
s3://bucket-name/path/to/code.zip
|
|
57
|
+
|
|
58
|
+
-- User Volume(无需开通对象存储)
|
|
59
|
+
volume:user://~/code.zip
|
|
60
|
+
|
|
61
|
+
-- External Volume
|
|
62
|
+
volume://workspace.schema.volume_name/code.zip
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### WITH PROPERTIES 参数
|
|
66
|
+
|
|
67
|
+
| 参数 | 值 | 说明 |
|
|
68
|
+
|---|---|---|
|
|
69
|
+
| `remote.udf.api` | `python3.mc.v0` | Python 3.10 运行时 |
|
|
70
|
+
| `remote.udf.api` | `java8.hive2.v0` | Java 8 Hive 风格 UDF |
|
|
71
|
+
| `remote.udf.protocol` | `http.arrow.v0` | 默认,访问云函数的协议 |
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Python UDF 代码结构
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
#!/usr/bin/env python
|
|
79
|
+
try:
|
|
80
|
+
from cz.udf import annotate
|
|
81
|
+
except ImportError:
|
|
82
|
+
annotate = lambda _: lambda _: _
|
|
83
|
+
|
|
84
|
+
@annotate("string->string") # 函数签名:输入类型->返回类型
|
|
85
|
+
class Upper(object):
|
|
86
|
+
def evaluate(self, arg):
|
|
87
|
+
if arg is None:
|
|
88
|
+
return None
|
|
89
|
+
return arg.upper()
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 函数签名格式
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
"input_type1,input_type2->return_type"
|
|
96
|
+
|
|
97
|
+
# 示例
|
|
98
|
+
"string->string" # 字符串转字符串
|
|
99
|
+
"string,int->double" # 两个输入,返回 double
|
|
100
|
+
"string->array<string>" # 返回数组
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
支持类型:`string`、`int`、`bigint`、`double`、`float`、`boolean`、`array<T>`、`map<K,V>`
|
|
104
|
+
|
|
105
|
+
### 打包上传
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# 安装依赖到当前目录
|
|
109
|
+
pip3 install httpx pydantic -t .
|
|
110
|
+
|
|
111
|
+
# 打包(压缩包大小需小于 500MB)
|
|
112
|
+
zip -rq code.zip ./*
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
```sql
|
|
116
|
+
-- 上传到 User Volume(在 ClickZetta Studio 或 CLI 中执行,source_path 使用绝对路径)
|
|
117
|
+
PUT '/path/to/code.zip' TO USER VOLUME;
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## 管理操作
|
|
123
|
+
|
|
124
|
+
```sql
|
|
125
|
+
-- 查看外部函数列表
|
|
126
|
+
SHOW EXTERNAL FUNCTIONS;
|
|
127
|
+
SHOW EXTERNAL FUNCTIONS LIKE 'my_%';
|
|
128
|
+
|
|
129
|
+
-- 删除外部函数
|
|
130
|
+
DROP FUNCTION IF EXISTS my_schema.my_udf;
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 内置 AI 函数(无需部署云函数)
|
|
136
|
+
|
|
137
|
+
### AI_COMPLETE(调用 LLM)
|
|
138
|
+
|
|
139
|
+
```sql
|
|
140
|
+
-- 通过 API Connection 调用(需先创建连接)
|
|
141
|
+
CREATE API CONNECTION conn_bailian
|
|
142
|
+
TYPE ai_function
|
|
143
|
+
PROVIDER = 'bailian'
|
|
144
|
+
BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
|
|
145
|
+
API_KEY = '<key>';
|
|
146
|
+
|
|
147
|
+
-- 调用 LLM 生成文本
|
|
148
|
+
SELECT AI_COMPLETE('connection:conn_bailian', '请用一句话总结:' || content) AS summary
|
|
149
|
+
FROM articles
|
|
150
|
+
LIMIT 10;
|
|
151
|
+
|
|
152
|
+
-- 通过平台 Endpoint 调用(管理员预配置)
|
|
153
|
+
SELECT AI_COMPLETE('endpoint:my_llm_endpoint', prompt_col) AS result
|
|
154
|
+
FROM my_table;
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### AI_EMBEDDING(文本向量化)
|
|
158
|
+
|
|
159
|
+
```sql
|
|
160
|
+
-- 将文本转为向量(用于语义搜索)
|
|
161
|
+
SELECT id, content,
|
|
162
|
+
AI_EMBEDDING('connection:conn_bailian', content) AS embedding
|
|
163
|
+
FROM documents;
|
|
164
|
+
|
|
165
|
+
-- 结合向量索引做语义搜索
|
|
166
|
+
SELECT id, content,
|
|
167
|
+
cosine_distance(embedding, AI_EMBEDDING('connection:conn_bailian', '查询文本')) AS dist
|
|
168
|
+
FROM doc_embeddings
|
|
169
|
+
ORDER BY dist
|
|
170
|
+
LIMIT 10;
|
|
171
|
+
```
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-index-manager
|
|
3
|
+
description: |
|
|
4
|
+
管理 ClickZetta Lakehouse 的三类索引:Bloom Filter 索引(等值查询加速)、
|
|
5
|
+
倒排索引(全文检索)、向量索引(语义相似度搜索)。覆盖创建、构建存量数据、
|
|
6
|
+
删除、查看等完整生命周期,以及索引类型选择指南。
|
|
7
|
+
当用户说"创建索引"、"加索引"、"Bloom Filter"、"布隆过滤器"、"倒排索引"、
|
|
8
|
+
"全文检索"、"向量索引"、"向量搜索"、"相似度搜索"、"BUILD INDEX"、
|
|
9
|
+
"DROP INDEX"、"SHOW INDEX"、"查询加速"、"索引优化"时触发。
|
|
10
|
+
Keywords: index, bloom filter, inverted index, vector index, full-text search
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# ClickZetta 索引管理
|
|
14
|
+
|
|
15
|
+
## 索引类型选择
|
|
16
|
+
|
|
17
|
+
| 需求 | 推荐索引 | 参考文件 |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| 高基数列等值查询(ID、邮箱、手机号) | Bloom Filter | [references/bloomfilter-index.md](references/bloomfilter-index.md) |
|
|
20
|
+
| 文本关键词搜索、全文检索 | 倒排索引 | [references/inverted-index.md](references/inverted-index.md) |
|
|
21
|
+
| 向量相似度搜索、语义检索、RAG | 向量索引 | [references/vector-index.md](references/vector-index.md) |
|
|
22
|
+
| 存量数据补建索引、删除、查看 | — | [references/index-management.md](references/index-management.md) |
|
|
23
|
+
|
|
24
|
+
## ⚠️ 关键注意事项
|
|
25
|
+
|
|
26
|
+
- **所有索引只对新写入数据生效**,旧数据需用 `BUILD INDEX` 补建(Bloom Filter 除外,不支持 BUILD INDEX)
|
|
27
|
+
- Bloom Filter 旧数据生效方法:`INSERT OVERWRITE table SELECT * FROM table`(重写数据)
|
|
28
|
+
- `BUILD INDEX` 是同步任务,大表建议按分区逐批执行
|
|
29
|
+
- **索引必须与表在同一 Schema 中**,跨 Schema 创建索引会报错(`index and table must in the same schema`)
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 步骤 1:选择索引类型并创建
|
|
34
|
+
|
|
35
|
+
### Bloom Filter(等值查询加速)
|
|
36
|
+
|
|
37
|
+
阅读 [references/bloomfilter-index.md](references/bloomfilter-index.md)
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
-- 建表时指定
|
|
41
|
+
CREATE TABLE orders (
|
|
42
|
+
order_id INT,
|
|
43
|
+
INDEX order_id_idx (order_id) BLOOMFILTER
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
-- 已有表添加
|
|
47
|
+
CREATE BLOOMFILTER INDEX idx_name
|
|
48
|
+
ON TABLE my_schema.orders(order_id)
|
|
49
|
+
COMMENT '订单ID布隆过滤器';
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 倒排索引(全文检索)
|
|
53
|
+
|
|
54
|
+
阅读 [references/inverted-index.md](references/inverted-index.md)
|
|
55
|
+
|
|
56
|
+
```sql
|
|
57
|
+
-- 数值/日期列(不需要 PROPERTIES)
|
|
58
|
+
CREATE INVERTED INDEX id_idx ON TABLE t(order_id);
|
|
59
|
+
|
|
60
|
+
-- 字符串列(必须指定分词器,否则报错)
|
|
61
|
+
-- ⚠️ 字符串列不指定 analyzer 会创建失败
|
|
62
|
+
CREATE INVERTED INDEX title_idx
|
|
63
|
+
ON TABLE articles(title)
|
|
64
|
+
PROPERTIES('analyzer'='chinese'); -- 中文内容用 chinese
|
|
65
|
+
|
|
66
|
+
-- 其他分词器选项:
|
|
67
|
+
-- 'keyword' → 不分词,整列作为一个词(适合精确匹配:状态码、标签)
|
|
68
|
+
-- 'english' → 英文分词
|
|
69
|
+
-- 'unicode' → 基于 Unicode 文本分割算法的通用分词(多语言)
|
|
70
|
+
-- 'chinese' → 中英文分词(适合中文及中英文混合内容,默认推荐)
|
|
71
|
+
|
|
72
|
+
-- 查询
|
|
73
|
+
SELECT * FROM articles WHERE match_any(title, '关键词', 'analyzer'='chinese');
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 向量索引(相似度搜索)
|
|
77
|
+
|
|
78
|
+
阅读 [references/vector-index.md](references/vector-index.md)
|
|
79
|
+
|
|
80
|
+
```sql
|
|
81
|
+
CREATE VECTOR INDEX vec_idx
|
|
82
|
+
ON TABLE embeddings(vec)
|
|
83
|
+
PROPERTIES(
|
|
84
|
+
"scalar.type" = "f32",
|
|
85
|
+
"distance.function" = "cosine_distance"
|
|
86
|
+
);
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 步骤 2:为存量数据构建索引
|
|
92
|
+
|
|
93
|
+
阅读 [references/index-management.md](references/index-management.md)
|
|
94
|
+
|
|
95
|
+
```sql
|
|
96
|
+
-- 全表构建(倒排索引和向量索引支持,Bloom Filter 不支持)
|
|
97
|
+
BUILD INDEX index_name ON my_schema.table_name;
|
|
98
|
+
|
|
99
|
+
-- 按分区构建(大表推荐)
|
|
100
|
+
BUILD INDEX index_name ON table_name WHERE dt = '2024-01-01';
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 步骤 3:查看和管理索引
|
|
106
|
+
|
|
107
|
+
```sql
|
|
108
|
+
-- 列出表的所有索引
|
|
109
|
+
SHOW INDEX FROM my_schema.orders;
|
|
110
|
+
|
|
111
|
+
-- 查看索引详情
|
|
112
|
+
DESC INDEX index_name;
|
|
113
|
+
DESC INDEX EXTENDED index_name; -- 含索引大小
|
|
114
|
+
|
|
115
|
+
-- 删除索引
|
|
116
|
+
DROP INDEX IF EXISTS index_name;
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## 常见问题
|
|
122
|
+
|
|
123
|
+
| 问题 | 原因 | 解决方案 |
|
|
124
|
+
|---|---|---|
|
|
125
|
+
| 加了索引但查询没变快 | 旧数据未建索引 | 执行 `BUILD INDEX`(倒排/向量)或重写数据(Bloom Filter) |
|
|
126
|
+
| BUILD INDEX 执行很慢 | 数据量大 | 按分区逐批执行 `BUILD INDEX ... WHERE partition=...` |
|
|
127
|
+
| 倒排索引字符串列报错 | 未指定分词器(字符串列必须指定) | 添加 `PROPERTIES('analyzer'='chinese')` 或其他分词器 |
|
|
128
|
+
| 向量索引查询结果不准 | ef.construction 太小 | 调大 `ef.construction`(默认 128,可调至 200-500) |
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 参考文档
|
|
133
|
+
|
|
134
|
+
- [CREATE BLOOMFILTER INDEX](https://www.yunqi.tech/documents/CREATE-BLOOMFILTER-INDEX)
|
|
135
|
+
- [CREATE INVERTED INDEX](https://www.yunqi.tech/documents/create-inverted-index)
|
|
136
|
+
- [CREATE VECTOR INDEX](https://www.yunqi.tech/documents/create-vector-index)
|
|
137
|
+
- [BUILD INDEX](https://www.yunqi.tech/documents/build-inverted-index)
|
|
138
|
+
- [DROP INDEX](https://www.yunqi.tech/documents/DROP-INDEX)
|
|
139
|
+
- [SHOW INDEX](https://www.yunqi.tech/documents/SHOW-INDEX)
|
|
140
|
+
- [DESC INDEX](https://www.yunqi.tech/documents/DESC-INDEX)
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"我想给 public.dim_studio_user_dmin_f 表的 user_id 列加个索引,加快等值查询速度","expected_skill":"clickzetta-index-manager","expected_output_contains":["BLOOMFILTER","INDEX"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"我想在 login_name 列上创建倒排索引,支持用户名的模糊搜索","expected_skill":"clickzetta-index-manager","expected_output_contains":["INVERTED","INDEX"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"Bloom Filter 索引和倒排索引分别适合什么场景?怎么选?","expected_skill":"clickzetta-index-manager","expected_output_contains":["Bloom","倒排"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"索引创建后存量数据怎么生效?BUILD INDEX 怎么用?","expected_skill":"clickzetta-index-manager","expected_output_contains":["BUILD INDEX"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"怎么查看表上有哪些索引?怎么删除不需要的索引?","expected_skill":"clickzetta-index-manager","expected_output_contains":["SHOW INDEX","DROP INDEX"]}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Bloom Filter 索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/CREATE-BLOOMFILTER-INDEX
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
高基数列(如 ID、邮箱、手机号)的**等值查询**加速。通过跳过不含目标值的数据文件,减少 I/O。
|
|
8
|
+
|
|
9
|
+
不支持的列类型:INTERVAL、STRUCT、MAP、ARRAY。
|
|
10
|
+
|
|
11
|
+
## 建表时创建
|
|
12
|
+
|
|
13
|
+
```sql
|
|
14
|
+
CREATE TABLE orders (
|
|
15
|
+
order_id INT,
|
|
16
|
+
customer_id INT,
|
|
17
|
+
amount DOUBLE,
|
|
18
|
+
INDEX order_id_idx (order_id) BLOOMFILTER COMMENT 'bloom filter on order_id',
|
|
19
|
+
INDEX customer_id_idx (customer_id) BLOOMFILTER
|
|
20
|
+
) USING parquet;
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## 已有表添加
|
|
24
|
+
|
|
25
|
+
```sql
|
|
26
|
+
CREATE BLOOMFILTER INDEX [IF NOT EXISTS] index_name
|
|
27
|
+
ON TABLE [schema.]table_name(column_name)
|
|
28
|
+
[COMMENT 'comment']
|
|
29
|
+
[PROPERTIES ('key' = 'value')];
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### ngram 分词器(用于字符串模糊匹配)
|
|
33
|
+
|
|
34
|
+
```sql
|
|
35
|
+
CREATE BLOOMFILTER INDEX idx_ngram
|
|
36
|
+
ON TABLE demo(col_name)
|
|
37
|
+
PROPERTIES ('analyzer' = 'ngram', 'n' = '3');
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`n` 为 ngram 长度(上例中为 3)。例如 n=4 时,"Lakehouse" 会被切分并索引为 "Lake"、"akeh"、"keho"、"ehou"、"hous"、"ouse"。
|
|
41
|
+
|
|
42
|
+
## 注意事项
|
|
43
|
+
|
|
44
|
+
- **只对新写入数据生效**,旧数据不生效
|
|
45
|
+
- 旧数据需要生效:执行 `INSERT OVERWRITE table SELECT * FROM table` 重写数据
|
|
46
|
+
- 一张表可以创建多个 Bloom Filter 索引
|
|
47
|
+
- 目前只支持**单列索引**
|
|
48
|
+
|
|
49
|
+
## 示例(完整流程)
|
|
50
|
+
|
|
51
|
+
```sql
|
|
52
|
+
-- 建表时指定
|
|
53
|
+
CREATE TABLE t (
|
|
54
|
+
order_id INT,
|
|
55
|
+
customer_id INT,
|
|
56
|
+
INDEX order_id_index (order_id) BLOOMFILTER COMMENT 'BLOOMFILTER'
|
|
57
|
+
);
|
|
58
|
+
|
|
59
|
+
-- 查看索引
|
|
60
|
+
SHOW INDEX FROM t;
|
|
61
|
+
|
|
62
|
+
-- 查看索引详情
|
|
63
|
+
DESC INDEX order_id_index;
|
|
64
|
+
|
|
65
|
+
-- 删除索引
|
|
66
|
+
DROP INDEX order_id_index;
|
|
67
|
+
```
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# 索引管理命令参考
|
|
2
|
+
|
|
3
|
+
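> 来源:[BUILD INDEX](https://www.yunqi.tech/documents/build-inverted-index)、[DROP INDEX](https://www.yunqi.tech/documents/DROP-INDEX)、[SHOW INDEX](https://www.yunqi.tech/documents/SHOW-INDEX)、[DESC INDEX](https://www.yunqi.tech/documents/DESC-INDEX)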
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## BUILD INDEX(为存量数据构建索引)
|
|
8
|
+
|
|
9
|
+
支持向量索引和倒排索引,**不支持 Bloom Filter**。
|
|
10
|
+
|
|
11
|
+
```sql
|
|
12
|
+
-- 全表构建
|
|
13
|
+
BUILD INDEX index_name ON [schema.]table_name;
|
|
14
|
+
|
|
15
|
+
-- 指定分区构建(支持 =, !=, >, >=, <, <=)
|
|
16
|
+
BUILD INDEX index_name ON table_name
|
|
17
|
+
WHERE partition_col1 = '2024-01-01' AND partition_col2 = 'us';
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
说明:
|
|
21
|
+
- `BUILD INDEX` 是**同步任务**,执行过程消耗计算资源
|
|
22
|
+
- 大分区表建议**按分区逐批**构建,避免单次消耗过多资源
|
|
23
|
+
- 进度可通过 Job Profile 查看
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## DROP INDEX(删除索引)
|
|
28
|
+
|
|
29
|
+
```sql
|
|
30
|
+
DROP INDEX [IF EXISTS] index_name;
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
注意:删除索引后,后续新增数据不再构建该索引;已有索引数据占用的存储空间**不会立即释放**。
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## SHOW INDEX(列出表的所有索引)
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
SHOW INDEX [IN|FROM] [schema.]table_name [LIMIT num];
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
示例:
|
|
44
|
+
```sql
|
|
45
|
+
SHOW INDEX FROM orders;
|
|
46
|
+
SHOW INDEX FROM my_schema.orders;
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## DESC INDEX(查看索引详情)
|
|
52
|
+
|
|
53
|
+
```sql
|
|
54
|
+
DESC INDEX [EXTENDED] index_name;
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
- 基础模式:显示名称、创建者、创建时间、类型、所属表、列名
|
|
58
|
+
- `EXTENDED`:额外显示索引大小(倒排索引支持,Bloom Filter 暂不支持)
|
|
59
|
+
|
|
60
|
+
示例输出(`DESC INDEX EXTENDED`,最后一行 `total_index_size` 为索引大小):
|
|
61
|
+
```
|
|
62
|
+
+--------------------------+--------------------------+
|
|
63
|
+
| info_name | info_value |
|
|
64
|
+
+--------------------------+--------------------------+
|
|
65
|
+
| name | order_year_index |
|
|
66
|
+
| creator | my_user |
|
|
67
|
+
| created_time | 2024-12-27 10:51:58.977 |
|
|
68
|
+
| index_type | inverted |
|
|
69
|
+
| table_name | t |
|
|
70
|
+
| table_column | order_year |
|
|
71
|
+
| total_index_size | 296 |
|
|
72
|
+
+--------------------------+--------------------------+
|
|
73
|
+
```
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# 倒排索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/create-inverted-index
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
文本搜索、关键词匹配。支持数值、日期、字符串列。字符串列必须指定分词器。
|
|
8
|
+
|
|
9
|
+
## 分词器选择
|
|
10
|
+
|
|
11
|
+
| 分词器 | 适用场景 | 说明 |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| `keyword` | 精确匹配 | 不分词,整个字符串作为一个词根 |
|
|
14
|
+
| `english` | 英文文本 | 识别连续 ASCII 字母和数字,转小写 |
|
|
15
|
+
| `chinese` | 中英文混合 | 识别中文和英文,过滤标点,英文转小写 |
|
|
16
|
+
| `unicode` | 多语言 | 基于 Unicode 文本分割算法,支持多语言 |
|
|
17
|
+
|
|
18
|
+
数值和日期类型**不需要**指定 PROPERTIES。
|
|
19
|
+
|
|
20
|
+
## 建表时创建
|
|
21
|
+
|
|
22
|
+
```sql
|
|
23
|
+
CREATE TABLE articles (
|
|
24
|
+
id INT,
|
|
25
|
+
title STRING,
|
|
26
|
+
content STRING,
|
|
27
|
+
INDEX id_idx (id) INVERTED,
|
|
28
|
+
INDEX title_idx (title) INVERTED PROPERTIES('analyzer'='chinese'),
|
|
29
|
+
INDEX content_idx (content) INVERTED PROPERTIES('analyzer'='english')
|
|
30
|
+
);
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 已有表添加
|
|
34
|
+
|
|
35
|
+
```sql
|
|
36
|
+
CREATE INVERTED INDEX [IF NOT EXISTS] index_name
|
|
37
|
+
ON TABLE [schema.]table_name(column_name)
|
|
38
|
+
[COMMENT 'comment']
|
|
39
|
+
[PROPERTIES('analyzer'='english|chinese|keyword|unicode')];
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## 注意事项
|
|
43
|
+
|
|
44
|
+
- **只对新写入数据生效**,旧数据需用 `BUILD INDEX` 命令补建
|
|
45
|
+
- 只支持**单列索引**
|
|
46
|
+
|
|
47
|
+
## 查询语法
|
|
48
|
+
|
|
49
|
+
```sql
|
|
50
|
+
-- 匹配任意词(OR)
|
|
51
|
+
SELECT * FROM articles WHERE match_any(content, 'keyword1 keyword2');
|
|
52
|
+
|
|
53
|
+
-- 匹配所有词(AND)
|
|
54
|
+
SELECT * FROM articles WHERE match_all(content, 'keyword1 keyword2');
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 完整示例
|
|
58
|
+
|
|
59
|
+
```sql
|
|
60
|
+
-- 建表
|
|
61
|
+
CREATE TABLE t (
|
|
62
|
+
order_id INT,
|
|
63
|
+
order_year STRING,
|
|
64
|
+
INDEX order_id_index (order_id) INVERTED COMMENT 'INVERTED'
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
-- 给已有列添加索引
|
|
68
|
+
CREATE INVERTED INDEX order_year_index
|
|
69
|
+
ON TABLE public.t(order_year)
|
|
70
|
+
PROPERTIES('analyzer'='chinese');
|
|
71
|
+
|
|
72
|
+
-- 对存量数据构建索引
|
|
73
|
+
BUILD INDEX order_year_index ON public.t;
|
|
74
|
+
|
|
75
|
+
-- 查询
|
|
76
|
+
SELECT * FROM t WHERE match_all(order_year, '2023');
|
|
77
|
+
|
|
78
|
+
-- 查看索引详情
|
|
79
|
+
DESC INDEX EXTENDED order_year_index;
|
|
80
|
+
```
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# 向量索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/create-vector-index
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
语义相似度搜索、RAG 检索、推荐系统。基于 HNSW 算法。
|
|
8
|
+
|
|
9
|
+
## 建表时创建
|
|
10
|
+
|
|
11
|
+
```sql
|
|
12
|
+
CREATE TABLE embeddings (
|
|
13
|
+
id INT,
|
|
14
|
+
vec VECTOR(FLOAT, 512),
|
|
15
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES(
|
|
16
|
+
"scalar.type" = "f32",
|
|
17
|
+
"distance.function" = "l2_distance"
|
|
18
|
+
)
|
|
19
|
+
);
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 已有表添加
|
|
23
|
+
|
|
24
|
+
```sql
|
|
25
|
+
CREATE VECTOR INDEX [IF NOT EXISTS] index_name
|
|
26
|
+
ON TABLE [schema.]table_name(column_name)
|
|
27
|
+
PROPERTIES(
|
|
28
|
+
"property1" = "value1",
|
|
29
|
+
...
|
|
30
|
+
);
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## PROPERTIES 参数说明
|
|
34
|
+
|
|
35
|
+
| 参数 | 可选值 | 默认值 | 说明 |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| `distance.function` | `l2_distance`, `cosine_distance`, `jaccard_distance`, `hamming_distance` | `cosine_distance` | 距离函数 |
|
|
38
|
+
| `scalar.type` | `f32`, `f16`, `i8`, `b1` | `f32` | 向量元素类型 |
|
|
39
|
+
| `m` | 建议不超过 1000 | `16` | HNSW 最大邻居数 |
|
|
40
|
+
| `ef.construction` | 建议不超过 5000 | `128` | HNSW 构建时候选集大小 |
|
|
41
|
+
| `reuse.vector.column` | `true`, `false` | `false` | 复用 vector column 数据节省存储 |
|
|
42
|
+
| `compress.codec` | `uncompressed`, `zstd`, `lz4` | `uncompressed` | 压缩算法(`reuse.vector.column` 为 `true` 时不生效) |
|
|
43
|
+
| `compress.level` | `fastest`, `default`, `best` | `default` | 压缩级别 |
|
|
44
|
+
|
|
45
|
+
## 向量列类型与索引元素类型对应
|
|
46
|
+
|
|
47
|
+
| 索引元素类型(scalar.type) | 支持的向量列类型 |
|
|
48
|
+
|---|---|
|
|
49
|
+
| `f32` | int, float |
|
|
50
|
+
| `f16` | int, float |
|
|
51
|
+
| `i8` | tinyint, int, float |
|
|
52
|
+
| `b1` | tinyint, int, float(按位建索引需设 `conversion.rule=as_bits`) |
|
|
53
|
+
|
|
54
|
+
## 注意事项
|
|
55
|
+
|
|
56
|
+
- **只对新写入数据生效**,旧数据需用 `BUILD INDEX` 命令补建
|
|
57
|
+
|
|
58
|
+
## 完整示例
|
|
59
|
+
|
|
60
|
+
```sql
|
|
61
|
+
-- 建表时创建向量索引
|
|
62
|
+
CREATE TABLE test_vector (
|
|
63
|
+
vec VECTOR(FLOAT, 4),
|
|
64
|
+
id INT,
|
|
65
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES(
|
|
66
|
+
"scalar.type" = "f32",
|
|
67
|
+
"distance.function" = "l2_distance"
|
|
68
|
+
)
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
-- 已有表添加向量索引(与上面"建表时创建"二选一;若同名索引 vec_idx 已存在,需换用其他索引名)
|
|
72
|
+
CREATE VECTOR INDEX vec_idx
|
|
73
|
+
ON TABLE public.test_vector(vec)
|
|
74
|
+
PROPERTIES(
|
|
75
|
+
"scalar.type" = "f32",
|
|
76
|
+
"distance.function" = "cosine_distance"
|
|
77
|
+
);
|
|
78
|
+
|
|
79
|
+
-- 对存量数据构建索引
|
|
80
|
+
BUILD INDEX vec_idx ON public.test_vector;
|
|
81
|
+
```
|