@clickzetta/cz-cli-darwin-arm64 0.3.81 → 0.3.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/LICENSE +16 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-app-python-sdk/LICENSE +16 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +227 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/LICENSE +16 -0
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/LICENSE +16 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-data-science/LICENSE +16 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/LICENSE +16 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dba-guide/LICENSE +16 -0
- package/bin/skills/clickzetta-dba-guide/SKILL.md +542 -0
- package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-dw-modeling/LICENSE +16 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +351 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-dynamic-table/LICENSE +16 -0
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +230 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +253 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +135 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +15 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +185 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +427 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +260 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +80 -0
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +190 -0
- package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +27 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +118 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +225 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +182 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +98 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +76 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +109 -0
- package/bin/skills/clickzetta-external-catalog/LICENSE +16 -0
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-external-function/LICENSE +16 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-file-import-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +190 -0
- package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-java-sdk/LICENSE +16 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +769 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
- package/bin/skills/clickzetta-lakehouse-connect/LICENSE +16 -0
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
- package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
- package/bin/skills/clickzetta-manage-comments/LICENSE +16 -0
- package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
- package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-metadata/LICENSE +16 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +502 -0
- package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
- package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
- package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
- package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
- package/bin/skills/clickzetta-monitoring/LICENSE +16 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +215 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +562 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/LICENSE +16 -0
- package/bin/skills/clickzetta-overview/SKILL.md +102 -0
- package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
- package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
- package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
- package/bin/skills/clickzetta-pipeline-review/LICENSE +16 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-query-optimizer/LICENSE +16 -0
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/LICENSE +16 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/LICENSE +16 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +485 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
- package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +16 -0
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/LICENSE +16 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
- package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-vcluster-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
- package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
- package/bin/skills/clickzetta-volume-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
- package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
- package/bin/skills/clickzetta-zettapark/LICENSE +16 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/bin/skills/cz-cli/SKILL.md +313 -0
- package/bin/skills/cz-cli/references/profile-setup.md +120 -0
- package/package.json +1 -1
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-file-import-pipeline
|
|
3
|
+
description: |
|
|
4
|
+
从 URL、本地文件或 Volume 路径将数据导入到 ClickZetta 表中,覆盖文件下载、格式推断、
|
|
5
|
+
表创建、COPY INTO 导入、结果验证的完整流程。当用户说"导入数据"、"从 URL 加载"、
|
|
6
|
+
"上传 CSV 到表"、"文件导入"、"COPY INTO"时触发。包含 ClickZetta USER VOLUME 机制、
|
|
7
|
+
COPY INTO 语法、格式推断规则、写入模式语义等平台特有知识。
|
|
8
|
+
Keywords: file import, URL, CSV, JSON, Parquet, COPY INTO, Volume
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# URL/文件数据导入工作流
|
|
12
|
+
|
|
13
|
+
## 向导:收集必要信息
|
|
14
|
+
|
|
15
|
+
开始导入前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
question({
|
|
19
|
+
questions: [
|
|
20
|
+
{
|
|
21
|
+
question: "文件来源?",
|
|
22
|
+
options: [
|
|
23
|
+
{ label: "HTTP/HTTPS URL", description: "提供完整链接,自动下载" },
|
|
24
|
+
{ label: "本地文件", description: "本地路径,上传到 User Volume" },
|
|
25
|
+
{ label: "已在 Volume 上", description: "提供 Volume 名称和文件路径" },
|
|
26
|
+
{ label: "外部 Volume(OSS/S3/COS)", description: "提供外部存储路径" }
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
question: "写入模式?",
|
|
31
|
+
options: [
|
|
32
|
+
{ label: "create(自动建表)", description: "表不存在,推断 schema 后建表" },
|
|
33
|
+
{ label: "append(追加)", description: "追加到已有表,不删除历史数据" },
|
|
34
|
+
{ label: "overwrite(覆盖)", description: "清空已有表再写入" }
|
|
35
|
+
]
|
|
36
|
+
}
|
|
37
|
+
]
|
|
38
|
+
})
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
**如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
|
|
42
|
+
|
|
43
|
+
**如果用户已经提供了足够信息(如"把这个 URL 的 CSV 导入到 ods.orders 表"),直接进入步骤 1,不再重复询问。**
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 指令
|
|
48
|
+
|
|
49
|
+
### 步骤 1:获取源文件并上传到 Volume
|
|
50
|
+
根据数据来源选择对应方式:
|
|
51
|
+
- **HTTP/HTTPS URL**:需要先用外部工具下载到本地,然后用 `PUT` 命令上传到 User Volume
|
|
52
|
+
- **本地文件**:执行 SQL `PUT '/local/path/file.csv' TO USER VOLUME` 上传
|
|
53
|
+
- **Volume 路径**:文件已在 Volume 上,跳过此步骤
|
|
54
|
+
- **外部 Volume(OSS/S3/COS)**:文件已在外部 Volume,直接使用
|
|
55
|
+
- 无论采用哪种方式,都要记录文件所在的 Volume 名称和文件名,后续步骤需要
|
|
56
|
+
|
|
57
|
+
> ⚠️ **注意**:文件上传操作参考 `clickzetta-volume-manager` skill。
|
|
58
|
+
|
|
59
|
+
### 步骤 2:推断文件格式
|
|
60
|
+
根据文件扩展名推断格式(ClickZetta COPY INTO 支持的格式):
|
|
61
|
+
- `.csv`, `.tsv`, `.txt` → CSV 格式
|
|
62
|
+
- `.json`, `.jsonl`, `.ndjson` → JSON 格式
|
|
63
|
+
- `.parquet`, `.pq` → PARQUET 格式
|
|
64
|
+
- `.orc` → ORC 格式
|
|
65
|
+
- `.bson` → BSON 格式
|
|
66
|
+
如果扩展名不明确,执行 `SELECT * FROM VOLUME ... USING format ... LIMIT 5` 预览文件内容来确认格式和 schema。
|
|
67
|
+
|
|
68
|
+
### 步骤 3:确认或创建目标表
|
|
69
|
+
根据写入模式处理目标表:
|
|
70
|
+
- **create 模式**:表必须不存在。执行 `SELECT * FROM VOLUME ... LIMIT 5` 预览数据并推断 schema,然后执行 `CREATE TABLE` 创建表
|
|
71
|
+
- **append 模式**:表必须已存在。用 `DESC TABLE <table_name>` 确认表存在并检查列兼容性
|
|
72
|
+
- **overwrite 模式**:表存在则先清空。执行 `TRUNCATE TABLE table_name`,再执行 COPY INTO(⚠️ 不支持 `COPY OVERWRITE INTO` 语法)
|
|
73
|
+
|
|
74
|
+
### 步骤 4:执行 COPY INTO 导入数据
|
|
75
|
+
执行 COPY INTO 语句。核心语法:
|
|
76
|
+
|
|
77
|
+
```sql
|
|
78
|
+
COPY INTO target_table
|
|
79
|
+
FROM VOLUME volume_name
|
|
80
|
+
USING format_type
|
|
81
|
+
OPTIONS('option_name' = 'value')
|
|
82
|
+
FILES('filename');
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
对于 USER VOLUME(通过 PUT 命令上传的文件):
|
|
86
|
+
```sql
|
|
87
|
+
COPY INTO target_table
|
|
88
|
+
FROM USER VOLUME
|
|
89
|
+
USING CSV
|
|
90
|
+
OPTIONS('header' = 'true')
|
|
91
|
+
FILES('uploaded_filename');
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
CSV 格式可附加 OPTIONS:
|
|
95
|
+
```sql
|
|
96
|
+
COPY INTO target_table
|
|
97
|
+
FROM VOLUME vol
|
|
98
|
+
USING CSV
|
|
99
|
+
OPTIONS('header' = 'true', 'sep' = ',', 'quote' = '"', 'nullValue' = '')
|
|
100
|
+
FILES('data.csv');
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
⚠️ **语法顺序要求**:`OPTIONS` 必须在 `FILES` 之前,否则报错 `Syntax error - missing EQ at '('`
|
|
104
|
+
|
|
105
|
+
overwrite 模式(⚠️ 不支持 `COPY OVERWRITE INTO`):
|
|
106
|
+
```sql
|
|
107
|
+
-- 正确方式:先 TRUNCATE 再 COPY
|
|
108
|
+
TRUNCATE TABLE target_table;
|
|
109
|
+
COPY INTO target_table FROM VOLUME vol USING CSV FILES('data.csv');
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### 步骤 5:验证导入结果
|
|
113
|
+
执行验证查询:
|
|
114
|
+
```sql
|
|
115
|
+
SELECT COUNT(*) as row_count FROM target_table;
|
|
116
|
+
SELECT * FROM target_table LIMIT 5;
|
|
117
|
+
```
|
|
118
|
+
确认行数符合预期,数据内容正确。
|
|
119
|
+
|
|
120
|
+
## 示例
|
|
121
|
+
|
|
122
|
+
### 示例 1:从 URL 导入 CSV 到新表
|
|
123
|
+
```sql
|
|
124
|
+
-- 1. 下载 URL 文件到本地,然后上传到 User Volume
|
|
125
|
+
PUT '/tmp/data.csv' TO USER VOLUME;
|
|
126
|
+
|
|
127
|
+
-- 2. 预览文件内容推断 schema
|
|
128
|
+
SELECT * FROM USER VOLUME USING CSV OPTIONS('header' = 'true') FILES('data.csv') LIMIT 5;
|
|
129
|
+
-- 推断出列:id INT, name STRING, value DOUBLE
|
|
130
|
+
|
|
131
|
+
-- 3. 创建目标表
|
|
132
|
+
CREATE TABLE imported_data (id INT, name STRING, value DOUBLE);
|
|
133
|
+
|
|
134
|
+
-- 4. 执行 COPY INTO 导入(注意:OPTIONS 必须在 FILES 之前)
|
|
135
|
+
COPY INTO imported_data FROM USER VOLUME USING CSV OPTIONS('header' = 'true') FILES('data.csv');
|
|
136
|
+
|
|
137
|
+
-- 5. 验证导入结果
|
|
138
|
+
SELECT COUNT(*) FROM imported_data;
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### 示例 2:追加 Parquet 数据到已有表
|
|
142
|
+
```sql
|
|
143
|
+
-- 1. 上传本地文件到 User Volume
|
|
144
|
+
PUT '/local/new_batch.parquet' TO USER VOLUME;
|
|
145
|
+
|
|
146
|
+
-- 2. 确认目标表存在
|
|
147
|
+
DESC TABLE existing_table;
|
|
148
|
+
|
|
149
|
+
-- 3. 执行 COPY INTO 导入(Parquet 格式通常不需要 OPTIONS)
|
|
150
|
+
COPY INTO existing_table FROM USER VOLUME USING PARQUET FILES('new_batch.parquet');
|
|
151
|
+
|
|
152
|
+
-- 4. 验证导入结果
|
|
153
|
+
SELECT COUNT(*) FROM existing_table;
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### 示例 3:从外部 Volume(OSS)导入
|
|
157
|
+
```sql
|
|
158
|
+
-- 1. 查看 Volume 中的文件列表
|
|
159
|
+
SHOW VOLUME DIRECTORY my_oss_volume;
|
|
160
|
+
|
|
161
|
+
-- 2. 预览文件内容
|
|
162
|
+
SELECT * FROM VOLUME my_oss_volume USING CSV OPTIONS('header' = 'true') FILES('data.csv') LIMIT 5;
|
|
163
|
+
|
|
164
|
+
-- 3. 创建目标表并导入(注意:OPTIONS 必须在 FILES 之前)
|
|
165
|
+
CREATE TABLE imported_data (col1 INT, col2 STRING);
|
|
166
|
+
COPY INTO imported_data FROM VOLUME my_oss_volume USING CSV OPTIONS('header' = 'true') FILES('data.csv');
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 故障排除
|
|
170
|
+
|
|
171
|
+
| 错误 | 原因 | 解决方案 |
|
|
172
|
+
|------|------|----------|
|
|
173
|
+
| COPY INTO 报 "table not found" | create 模式下表未创建,或 append 模式下表名拼写错误 | 先用 `SHOW TABLES` 确认表是否存在 |
|
|
174
|
+
| COPY INTO 报 "file not found" | FILES 中的文件名与 Volume 上的实际文件名不匹配 | 执行 `SHOW VOLUME DIRECTORY vol_name` 或 `SHOW USER VOLUME DIRECTORY` 确认文件名,注意大小写敏感 |
|
|
175
|
+
| COPY INTO 报语法错误 "missing EQ at '('" | OPTIONS 放在了 FILES 之后 | 调整顺序,确保 `OPTIONS` 在 `FILES` 之前:`USING CSV OPTIONS(...) FILES(...)` |
|
|
176
|
+
| CSV 导入列数不匹配 | CSV 文件有 header 行但未指定 `OPTIONS('header'='true')`,导致 header 被当作数据行 | 添加 `OPTIONS('header' = 'true')`,或检查 CSV 分隔符是否正确(sep 参数) |
|
|
177
|
+
| COPY INTO 报 "schema mismatch" | 文件中的数据类型与目标表列定义不兼容 | 执行 `SELECT * FROM VOLUME ... USING format ... LIMIT 5` 预览实际数据,调整表定义或使用列映射 |
|
|
178
|
+
| overwrite 模式数据未清空 | 使用了 `COPY OVERWRITE INTO` 语法(不支持) | overwrite 模式应先用 `TRUNCATE TABLE` 清空表,再执行 `COPY INTO` |
|
|
179
|
+
| SELECT FROM VOLUME 报错 | 格式不匹配或多格式文件混合 | 确认 USING 后的格式与实际文件格式一致;使用 `FILES()` 指定文件或 `SUBDIRECTORY` 指定子目录 |
|
|
180
|
+
| PUT 命令失败 | 本地文件路径不存在 | 确认本地文件路径正确,文件存在 |
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## 依赖的 Skills
|
|
185
|
+
|
|
186
|
+
| 操作 | 需要加载的 Skill |
|
|
187
|
+
|------|-----------------|
|
|
188
|
+
| 文件上传/下载/删除 | `clickzetta-volume-manager` |
|
|
189
|
+
| 查询 Volume 文件内容 | `clickzetta-volume-manager` |
|
|
190
|
+
| COPY INTO 导入 | 本 Skill |
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"从 URL 导入文件到 Lakehouse 的步骤和语法是什么?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["Volume","COPY INTO"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"本地文件上传到 Lakehouse 表的流程是什么?需要哪些步骤?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["Volume","COPY INTO"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"COPY INTO 导入数据时 append 和 overwrite 写入模式有什么区别?请说明语法","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["append","overwrite"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"COPY INTO 导入前怎么推断文件格式?有哪些支持的格式类型?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["CSV","JSON","Parquet"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"CSV 有自定义分隔符,COPY INTO 的 OPTIONS 怎么写?","expected_skill":"clickzetta-file-import-pipeline","expected_output_contains":["CSV","OPTIONS"]}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
ClickZetta Skills License
|
|
2
|
+
© 2026 Yunqi Inc. All rights reserved.
|
|
3
|
+
LICENSE: Use of these materials (including all code, prompts, assets, files, and other components of these skills (collectively, "Skills")) is governed by your agreement with ClickZetta for the Service. If no separate agreement exists, use is governed by ClickZetta's Terms of Service (available at: https://yunqi.tech/documents/user-aggrement).
|
|
4
|
+
Your applicable agreement is referred to as the "Agreement." "Service" is as defined in the Agreement.
|
|
5
|
+
ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the contrary, you may not:
|
|
6
|
+
|
|
7
|
+
Extract from the Service or retain copies of the Skills outside use with the Service;
|
|
8
|
+
Reproduce or copy the Skills, except for temporary copies created automatically during authorized use of the Service;
|
|
9
|
+
Create derivative works based on the Skills;
|
|
10
|
+
Distribute, sublicense, or transfer the Skills to any third party;
|
|
11
|
+
Make, offer to sell, sell, or import any inventions embodied in the Skills; nor,
|
|
12
|
+
Reverse engineer, decompile, or disassemble the Skills.
|
|
13
|
+
|
|
14
|
+
The receipt, viewing, or possession of the Skills does not convey or imply any license or right beyond those expressly granted above.
|
|
15
|
+
Yunqi retains all rights, title, and interest in the Skills, including all copyrights, trademarks, patents, and all other applicable intellectual property rights.
|
|
16
|
+
THE SKILLS ARE PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SKILLS OR THE USE OR OTHER DEALINGS IN THE SKILLS.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-index-manager
|
|
3
|
+
description: |
|
|
4
|
+
管理 ClickZetta Lakehouse 的三类索引:Bloom Filter 索引(等值查询加速)、
|
|
5
|
+
倒排索引(全文检索)、向量索引(语义相似度搜索)。覆盖创建、构建存量数据、
|
|
6
|
+
删除、查看等完整生命周期,以及索引类型选择指南。
|
|
7
|
+
当用户说"创建索引"、"加索引"、"Bloom Filter"、"布隆过滤器"、"倒排索引"、
|
|
8
|
+
"全文检索"、"向量索引"、"向量搜索"、"相似度搜索"、"BUILD INDEX"、
|
|
9
|
+
"DROP INDEX"、"SHOW INDEX"、"查询加速"、"索引优化"时触发。
|
|
10
|
+
Keywords: index, bloom filter, inverted index, vector index, full-text search
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# ClickZetta 索引管理
|
|
14
|
+
|
|
15
|
+
## 索引类型选择
|
|
16
|
+
|
|
17
|
+
| 需求 | 推荐索引 | 参考文件 |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| 高基数列等值查询(ID、邮箱、手机号) | Bloom Filter | [references/bloomfilter-index.md](references/bloomfilter-index.md) |
|
|
20
|
+
| 文本关键词搜索、全文检索 | 倒排索引 | [references/inverted-index.md](references/inverted-index.md) |
|
|
21
|
+
| 向量相似度搜索、语义检索、RAG | 向量索引 | [references/vector-index.md](references/vector-index.md) |
|
|
22
|
+
| 存量数据补建索引、删除、查看 | — | [references/index-management.md](references/index-management.md) |
|
|
23
|
+
|
|
24
|
+
## ⚠️ 关键注意事项
|
|
25
|
+
|
|
26
|
+
- **所有索引只对新写入数据生效**:倒排索引和向量索引的旧数据需用 `BUILD INDEX` 补建;Bloom Filter 不支持 `BUILD INDEX`,旧数据需通过重写数据生效
|
|
27
|
+
- Bloom Filter 旧数据生效方法:`INSERT OVERWRITE table SELECT * FROM table`(重写数据)
|
|
28
|
+
- `BUILD INDEX` 是同步任务,大表建议按分区逐批执行
|
|
29
|
+
- **索引必须与表在同一 Schema 中**,跨 Schema 创建索引会报错(`index and table must in the same schema`)
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 步骤 1:选择索引类型并创建
|
|
34
|
+
|
|
35
|
+
### Bloom Filter(等值查询加速)
|
|
36
|
+
|
|
37
|
+
阅读 [references/bloomfilter-index.md](references/bloomfilter-index.md)
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
-- 建表时指定
|
|
41
|
+
CREATE TABLE orders (
|
|
42
|
+
order_id INT,
|
|
43
|
+
INDEX order_id_idx (order_id) BLOOMFILTER
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
-- 已有表添加
|
|
47
|
+
CREATE BLOOMFILTER INDEX idx_name
|
|
48
|
+
ON TABLE my_schema.orders(order_id)
|
|
49
|
+
COMMENT '订单ID布隆过滤器';
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 倒排索引(全文检索)
|
|
53
|
+
|
|
54
|
+
阅读 [references/inverted-index.md](references/inverted-index.md)
|
|
55
|
+
|
|
56
|
+
```sql
|
|
57
|
+
-- 数值/日期列(不需要 PROPERTIES)
|
|
58
|
+
CREATE INVERTED INDEX id_idx ON TABLE t(order_id);
|
|
59
|
+
|
|
60
|
+
-- 字符串列(必须指定分词器,否则报错)
|
|
61
|
+
-- ⚠️ 字符串列不指定 analyzer 会创建失败
|
|
62
|
+
CREATE INVERTED INDEX title_idx
|
|
63
|
+
ON TABLE articles(title)
|
|
64
|
+
PROPERTIES('analyzer'='chinese'); -- 中文内容用 chinese
|
|
65
|
+
|
|
66
|
+
-- 其他分词器选项:
|
|
67
|
+
-- 'keyword' → 不分词,整列作为一个词(适合精确匹配:状态码、标签)
|
|
68
|
+
-- 'english' → 英文分词
|
|
69
|
+
-- 'unicode' → 通用 Unicode 分词(中英混合)
|
|
70
|
+
-- 'chinese' → 中文分词(默认推荐)
|
|
71
|
+
|
|
72
|
+
-- 查询
|
|
73
|
+
SELECT * FROM articles WHERE match_any(title, '关键词', 'analyzer'='chinese');
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 向量索引(相似度搜索)
|
|
77
|
+
|
|
78
|
+
阅读 [references/vector-index.md](references/vector-index.md)
|
|
79
|
+
|
|
80
|
+
```sql
|
|
81
|
+
CREATE VECTOR INDEX vec_idx
|
|
82
|
+
ON TABLE embeddings(vec)
|
|
83
|
+
PROPERTIES(
|
|
84
|
+
"scalar.type" = "f32",
|
|
85
|
+
"distance.function" = "cosine_distance"
|
|
86
|
+
);
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 步骤 2:为存量数据构建索引
|
|
92
|
+
|
|
93
|
+
阅读 [references/index-management.md](references/index-management.md)
|
|
94
|
+
|
|
95
|
+
```sql
|
|
96
|
+
-- 全表构建(倒排索引和向量索引支持,Bloom Filter 不支持)
|
|
97
|
+
BUILD INDEX index_name ON my_schema.table_name;
|
|
98
|
+
|
|
99
|
+
-- 按分区构建(大表推荐)
|
|
100
|
+
BUILD INDEX index_name ON table_name WHERE dt = '2024-01-01';
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 步骤 3:查看和管理索引
|
|
106
|
+
|
|
107
|
+
```sql
|
|
108
|
+
-- 列出表的所有索引
|
|
109
|
+
SHOW INDEX FROM my_schema.orders;
|
|
110
|
+
|
|
111
|
+
-- 查看索引详情
|
|
112
|
+
DESC INDEX index_name;
|
|
113
|
+
DESC INDEX EXTENDED index_name; -- 含索引大小
|
|
114
|
+
|
|
115
|
+
-- 删除索引
|
|
116
|
+
DROP INDEX IF EXISTS index_name;
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## 常见问题
|
|
122
|
+
|
|
123
|
+
| 问题 | 原因 | 解决方案 |
|
|
124
|
+
|---|---|---|
|
|
125
|
+
| 加了索引但查询没变快 | 旧数据未建索引 | 执行 `BUILD INDEX`(倒排/向量)或重写数据(Bloom Filter) |
|
|
126
|
+
| BUILD INDEX 执行很慢 | 数据量大 | 按分区逐批执行 `BUILD INDEX ... WHERE partition=...` |
|
|
127
|
+
| 倒排索引字符串列报错 | 未指定分词器(字符串列必须指定) | 添加 `PROPERTIES('analyzer'='chinese')` 或其他分词器 |
|
|
128
|
+
| 向量索引查询结果不准 | ef.construction 太小 | 调大 `ef.construction`(默认 128,可调至 200-500) |
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 参考文档
|
|
133
|
+
|
|
134
|
+
- [CREATE BLOOMFILTER INDEX](https://www.yunqi.tech/documents/CREATE-BLOOMFILTER-INDEX)
|
|
135
|
+
- [CREATE INVERTED INDEX](https://www.yunqi.tech/documents/create-inverted-index)
|
|
136
|
+
- [CREATE VECTOR INDEX](https://www.yunqi.tech/documents/create-vector-index)
|
|
137
|
+
- [BUILD INDEX](https://www.yunqi.tech/documents/build-inverted-index)
|
|
138
|
+
- [DROP INDEX](https://www.yunqi.tech/documents/DROP-INDEX)
|
|
139
|
+
- [SHOW INDEX](https://www.yunqi.tech/documents/SHOW-INDEX)
|
|
140
|
+
- [DESC INDEX](https://www.yunqi.tech/documents/DESC-INDEX)
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"我想给 public.dim_studio_user_dmin_f 表的 user_id 列加个索引,加快等值查询速度","expected_skill":"clickzetta-index-manager","expected_output_contains":["BLOOMFILTER","INDEX"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"我想在 login_name 列上创建倒排索引,支持用户名的模糊搜索","expected_skill":"clickzetta-index-manager","expected_output_contains":["INVERTED","INDEX"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"Bloom Filter 索引和倒排索引分别适合什么场景?怎么选?","expected_skill":"clickzetta-index-manager","expected_output_contains":["Bloom","倒排"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"索引创建后存量数据怎么生效?BUILD INDEX 怎么用?","expected_skill":"clickzetta-index-manager","expected_output_contains":["BUILD INDEX"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"怎么查看表上有哪些索引?怎么删除不需要的索引?","expected_skill":"clickzetta-index-manager","expected_output_contains":["SHOW INDEX","DROP INDEX"]}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Bloom Filter 索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/CREATE-BLOOMFILTER-INDEX
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
高基数列(如 ID、邮箱、手机号)的**等值查询**加速。通过跳过不含目标值的数据文件,减少 I/O。
|
|
8
|
+
|
|
9
|
+
不支持的列类型:INTERVAL、STRUCT、MAP、ARRAY。
|
|
10
|
+
|
|
11
|
+
## 建表时创建
|
|
12
|
+
|
|
13
|
+
```sql
|
|
14
|
+
CREATE TABLE orders (
|
|
15
|
+
order_id INT,
|
|
16
|
+
customer_id INT,
|
|
17
|
+
amount DOUBLE,
|
|
18
|
+
INDEX order_id_idx (order_id) BLOOMFILTER COMMENT 'bloom filter on order_id',
|
|
19
|
+
INDEX customer_id_idx (customer_id) BLOOMFILTER
|
|
20
|
+
) USING parquet;
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## 已有表添加
|
|
24
|
+
|
|
25
|
+
```sql
|
|
26
|
+
CREATE BLOOMFILTER INDEX [IF NOT EXISTS] index_name
|
|
27
|
+
ON TABLE [schema.]table_name(column_name)
|
|
28
|
+
[COMMENT 'comment']
|
|
29
|
+
[PROPERTIES ('key' = 'value')];
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### ngram 分词器(用于字符串模糊匹配)
|
|
33
|
+
|
|
34
|
+
```sql
|
|
35
|
+
CREATE BLOOMFILTER INDEX idx_ngram
|
|
36
|
+
ON TABLE demo(col_name)
|
|
37
|
+
PROPERTIES ('analyzer' = 'ngram', 'n' = '3');
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`n` 为 ngram 长度(上例中取 3)。例如 n=4 时,"Lakehouse" 会被切分并索引为 "Lake"、"akeh"、"keho"、"ehou"、"hous"、"ouse"。
|
|
41
|
+
|
|
42
|
+
## 注意事项
|
|
43
|
+
|
|
44
|
+
- **只对新写入数据生效**,旧数据不生效
|
|
45
|
+
- 如需让旧数据也生效:执行 `INSERT OVERWRITE table_name SELECT * FROM table_name` 重写数据(Bloom Filter 索引不支持 `BUILD INDEX`)
|
|
46
|
+
- 一张表可以创建多个 Bloom Filter 索引
|
|
47
|
+
- 目前只支持**单列索引**
|
|
48
|
+
|
|
49
|
+
## 示例(完整流程)
|
|
50
|
+
|
|
51
|
+
```sql
|
|
52
|
+
-- 建表时指定
|
|
53
|
+
CREATE TABLE t (
|
|
54
|
+
order_id INT,
|
|
55
|
+
customer_id INT,
|
|
56
|
+
INDEX order_id_index (order_id) BLOOMFILTER COMMENT 'BLOOMFILTER'
|
|
57
|
+
);
|
|
58
|
+
|
|
59
|
+
-- 查看索引
|
|
60
|
+
SHOW INDEX FROM t;
|
|
61
|
+
|
|
62
|
+
-- 查看索引详情
|
|
63
|
+
DESC INDEX order_id_index;
|
|
64
|
+
|
|
65
|
+
-- 删除索引
|
|
66
|
+
DROP INDEX order_id_index;
|
|
67
|
+
```
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# 索引管理命令参考
|
|
2
|
+
|
|
3
|
+
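> 来源:[BUILD INDEX](https://www.yunqi.tech/documents/build-inverted-index)、[DROP INDEX](https://www.yunqi.tech/documents/DROP-INDEX)、[SHOW INDEX](https://www.yunqi.tech/documents/SHOW-INDEX)、[DESC INDEX](https://www.yunqi.tech/documents/DESC-INDEX)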
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## BUILD INDEX(为存量数据构建索引)
|
|
8
|
+
|
|
9
|
+
支持向量索引和倒排索引,**不支持 Bloom Filter**。
|
|
10
|
+
|
|
11
|
+
```sql
|
|
12
|
+
-- 全表构建
|
|
13
|
+
BUILD INDEX index_name ON [schema.]table_name;
|
|
14
|
+
|
|
15
|
+
-- 指定分区构建(支持 =, !=, >, >=, <, <=)
|
|
16
|
+
BUILD INDEX index_name ON table_name
|
|
17
|
+
WHERE partition_col1 = '2024-01-01' AND partition_col2 = 'us';
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
说明:
|
|
21
|
+
- `BUILD INDEX` 是**同步任务**,执行过程消耗计算资源
|
|
22
|
+
- 大分区表建议**按分区逐批**构建,避免单次消耗过多资源
|
|
23
|
+
- 进度可通过 Job Profile 查看
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## DROP INDEX(删除索引)
|
|
28
|
+
|
|
29
|
+
```sql
|
|
30
|
+
DROP INDEX [IF EXISTS] index_name;
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
注意:删除索引**不会立即释放存储空间**,后续新增数据不再构建该索引数据。
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## SHOW INDEX(列出表的所有索引)
|
|
38
|
+
|
|
39
|
+
```sql
|
|
40
|
+
SHOW INDEX [IN|FROM] [schema.]table_name [LIMIT num];
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
示例:
|
|
44
|
+
```sql
|
|
45
|
+
SHOW INDEX FROM orders;
|
|
46
|
+
SHOW INDEX FROM my_schema.orders;
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## DESC INDEX(查看索引详情)
|
|
52
|
+
|
|
53
|
+
```sql
|
|
54
|
+
DESC INDEX [EXTENDED] index_name;
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
- 基础模式:显示名称、创建时间、类型、所属表、列名
|
|
58
|
+
- `EXTENDED`:额外显示索引大小(倒排索引支持,Bloom Filter 暂不支持)
|
|
59
|
+
|
|
60
|
+
示例输出(`DESC INDEX EXTENDED`,倒排索引,末行 `total_index_size` 为 EXTENDED 额外显示的索引大小):
|
|
61
|
+
```
|
|
62
|
+
+--------------------------+--------------------------+
|
|
63
|
+
| info_name | info_value |
|
|
64
|
+
+--------------------------+--------------------------+
|
|
65
|
+
| name | order_year_index |
|
|
66
|
+
| creator | my_user |
|
|
67
|
+
| created_time | 2024-12-27 10:51:58.977 |
|
|
68
|
+
| index_type | inverted |
|
|
69
|
+
| table_name | t |
|
|
70
|
+
| table_column | order_year |
|
|
71
|
+
| total_index_size | 296 |
|
|
72
|
+
+--------------------------+--------------------------+
|
|
73
|
+
```
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# 倒排索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/create-inverted-index
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
文本搜索、关键词匹配。支持数值、日期、字符串列。字符串列必须指定分词器。
|
|
8
|
+
|
|
9
|
+
## 分词器选择
|
|
10
|
+
|
|
11
|
+
| 分词器 | 适用场景 | 说明 |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| `keyword` | 精确匹配 | 不分词,整个字符串作为一个词根 |
|
|
14
|
+
| `english` | 英文文本 | 识别连续 ASCII 字母和数字,转小写 |
|
|
15
|
+
| `chinese` | 中英文混合 | 识别中文和英文,过滤标点,英文转小写 |
|
|
16
|
+
| `unicode` | 多语言 | 基于 Unicode 文本分割算法,支持多语言 |
|
|
17
|
+
|
|
18
|
+
数值和日期类型**不需要**指定 PROPERTIES。
|
|
19
|
+
|
|
20
|
+
## 建表时创建
|
|
21
|
+
|
|
22
|
+
```sql
|
|
23
|
+
CREATE TABLE articles (
|
|
24
|
+
id INT,
|
|
25
|
+
title STRING,
|
|
26
|
+
content STRING,
|
|
27
|
+
INDEX id_idx (id) INVERTED,
|
|
28
|
+
INDEX title_idx (title) INVERTED PROPERTIES('analyzer'='chinese'),
|
|
29
|
+
INDEX content_idx (content) INVERTED PROPERTIES('analyzer'='english')
|
|
30
|
+
);
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 已有表添加
|
|
34
|
+
|
|
35
|
+
```sql
|
|
36
|
+
CREATE INVERTED INDEX [IF NOT EXISTS] index_name
|
|
37
|
+
ON TABLE [schema.]table_name(column_name)
|
|
38
|
+
[COMMENT 'comment']
|
|
39
|
+
[PROPERTIES('analyzer'='english|chinese|keyword|unicode')];
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## 注意事项
|
|
43
|
+
|
|
44
|
+
- **只对新写入数据生效**,旧数据需用 `BUILD INDEX` 命令补建
|
|
45
|
+
- 只支持**单列索引**
|
|
46
|
+
|
|
47
|
+
## 查询语法
|
|
48
|
+
|
|
49
|
+
```sql
|
|
50
|
+
-- 匹配任意词(OR)
|
|
51
|
+
SELECT * FROM articles WHERE match_any(content, 'keyword1 keyword2');
|
|
52
|
+
|
|
53
|
+
-- 匹配所有词(AND)
|
|
54
|
+
SELECT * FROM articles WHERE match_all(content, 'keyword1 keyword2');
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 完整示例
|
|
58
|
+
|
|
59
|
+
```sql
|
|
60
|
+
-- 建表
|
|
61
|
+
CREATE TABLE t (
|
|
62
|
+
order_id INT,
|
|
63
|
+
order_year STRING,
|
|
64
|
+
INDEX order_id_index (order_id) INVERTED COMMENT 'INVERTED'
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
-- 给已有列添加索引
|
|
68
|
+
CREATE INVERTED INDEX order_year_index
|
|
69
|
+
ON TABLE public.t(order_year)
|
|
70
|
+
PROPERTIES('analyzer'='chinese');
|
|
71
|
+
|
|
72
|
+
-- 对存量数据构建索引
|
|
73
|
+
BUILD INDEX order_year_index ON public.t;
|
|
74
|
+
|
|
75
|
+
-- 查询
|
|
76
|
+
SELECT * FROM t WHERE match_all(order_year, '2023');
|
|
77
|
+
|
|
78
|
+
-- 查看索引详情
|
|
79
|
+
DESC INDEX EXTENDED order_year_index;
|
|
80
|
+
```
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# 向量索引参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/create-vector-index
|
|
4
|
+
|
|
5
|
+
## 适用场景
|
|
6
|
+
|
|
7
|
+
语义相似度搜索、RAG 检索、推荐系统。基于 HNSW 算法。
|
|
8
|
+
|
|
9
|
+
## 建表时创建
|
|
10
|
+
|
|
11
|
+
```sql
|
|
12
|
+
CREATE TABLE embeddings (
|
|
13
|
+
id INT,
|
|
14
|
+
vec VECTOR(FLOAT, 512),
|
|
15
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES(
|
|
16
|
+
"scalar.type" = "f32",
|
|
17
|
+
"distance.function" = "l2_distance"
|
|
18
|
+
)
|
|
19
|
+
);
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 已有表添加
|
|
23
|
+
|
|
24
|
+
```sql
|
|
25
|
+
CREATE VECTOR INDEX [IF NOT EXISTS] index_name
|
|
26
|
+
ON TABLE [schema.]table_name(column_name)
|
|
27
|
+
PROPERTIES(
|
|
28
|
+
"property1" = "value1",
|
|
29
|
+
...
|
|
30
|
+
);
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## PROPERTIES 参数说明
|
|
34
|
+
|
|
35
|
+
| 参数 | 可选值 | 默认值 | 说明 |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| `distance.function` | `l2_distance`, `cosine_distance`, `jaccard_distance`, `hamming_distance` | `cosine_distance` | 距离函数 |
|
|
38
|
+
| `scalar.type` | `f32`, `f16`, `i8`, `b1` | `f32` | 向量元素类型 |
|
|
39
|
+
| `m` | 建议不超过 1000 | `16` | HNSW 最大邻居数 |
|
|
40
|
+
| `ef.construction` | 建议不超过 5000 | `128` | HNSW 构建时候选集大小 |
|
|
41
|
+
| `reuse.vector.column` | `true`, `false` | `false` | 复用 vector column 数据节省存储 |
|
|
42
|
+
| `compress.codec` | `uncompressed`, `zstd`, `lz4` | `uncompressed` | 压缩算法(复用 column 时不生效) |
|
|
43
|
+
| `compress.level` | `fastest`, `default`, `best` | `default` | 压缩级别 |
|
|
44
|
+
|
|
45
|
+
## 向量列类型与索引元素类型对应
|
|
46
|
+
|
|
47
|
+
| 索引元素类型(scalar.type) | 支持的向量列类型 |
|
|
48
|
+
|---|---|
|
|
49
|
+
| `f32` | int, float |
|
|
50
|
+
| `f16` | int, float |
|
|
51
|
+
| `i8` | tinyint, int, float |
|
|
52
|
+
| `b1` | tinyint, int, float(按位建索引需设 `conversion.rule=as_bits`) |
|
|
53
|
+
|
|
54
|
+
## 注意事项
|
|
55
|
+
|
|
56
|
+
- **只对新写入数据生效**,旧数据需用 `BUILD INDEX` 命令补建
|
|
57
|
+
|
|
58
|
+
## 完整示例
|
|
59
|
+
|
|
60
|
+
```sql
|
|
61
|
+
-- 建表时创建向量索引
|
|
62
|
+
CREATE TABLE test_vector (
|
|
63
|
+
vec VECTOR(FLOAT, 4),
|
|
64
|
+
id INT,
|
|
65
|
+
INDEX vec_idx (vec) USING VECTOR PROPERTIES(
|
|
66
|
+
"scalar.type" = "f32",
|
|
67
|
+
"distance.function" = "l2_distance"
|
|
68
|
+
)
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
-- 已有表添加向量索引(与上面“建表时创建”二选一;若表上已存在 vec_idx,请换一个索引名)
|
|
72
|
+
CREATE VECTOR INDEX vec_idx
|
|
73
|
+
ON TABLE public.test_vector(vec)
|
|
74
|
+
PROPERTIES(
|
|
75
|
+
"scalar.type" = "f32",
|
|
76
|
+
"distance.function" = "cosine_distance"
|
|
77
|
+
);
|
|
78
|
+
|
|
79
|
+
-- 对存量数据构建索引
|
|
80
|
+
BUILD INDEX vec_idx ON public.test_vector;
|
|
81
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
ClickZetta Skills License
|
|
2
|
+
© 2026 Yunqi Inc. All rights reserved.
|
|
3
|
+
LICENSE: Use of these materials (including all code, prompts, assets, files, and other components of these skills (collectively, "Skills")) is governed by your agreement with ClickZetta for the Service. If no separate agreement exists, use is governed by ClickZetta's Terms of Service (available at: https://yunqi.tech/documents/user-aggrement).
|
|
4
|
+
Your applicable agreement is referred to as the "Agreement." "Service" is as defined in the Agreement.
|
|
5
|
+
ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the contrary, you may not:
|
|
6
|
+
|
|
7
|
+
Extract from the Service or retain copies of the Skills outside use with the Service;
|
|
8
|
+
Reproduce or copy the Skills, except for temporary copies created automatically during authorized use of the Service;
|
|
9
|
+
Create derivative works based on the Skills;
|
|
10
|
+
Distribute, sublicense, or transfer the Skills to any third party;
|
|
11
|
+
Make, offer to sell, sell, or import any inventions embodied in the Skills; nor,
|
|
12
|
+
Reverse engineer, decompile, or disassemble the Skills.
|
|
13
|
+
|
|
14
|
+
The receipt, viewing, or possession of the Skills does not convey or imply any license or right beyond those expressly granted above.
|
|
15
|
+
Yunqi retains all rights, title, and interest in the Skills, including all copyrights, trademarks, patents, and all other applicable intellectual property rights.
|
|
16
|
+
THE SKILLS ARE PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SKILLS OR THE USE OR OTHER DEALINGS IN THE SKILLS.
|