@clickzetta/cz-cli-linux-x64 0.3.2 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -450
- package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
- package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
- package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
- package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
- package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -86
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -117
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
- package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -531
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -186
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
- package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -402
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -353
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -173
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -160
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -123
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
- package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -155
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -249
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -194
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
|
@@ -1,402 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-oss-ingest-pipeline
|
|
3
|
-
description: |
|
|
4
|
-
搭建 ClickZetta 对象存储(OSS/S3/COS)数据导入管道,覆盖持续导入(PIPE)和批量一次性导入
|
|
5
|
-
两大场景。持续导入支持 LIST_PURGE 扫描模式和 EVENT_NOTIFICATION 消息通知模式;批量导入支持
|
|
6
|
-
Volume + INSERT INTO 和 Volume + COPY INTO 两种方式。当用户说"对象存储导入"、"OSS 数据管道"、
|
|
7
|
-
"S3 数据导入"、"PIPE 持续导入"、"文件自动加载"、"存储桶数据同步"、"COS 导入"、
|
|
8
|
-
"批量导入 OSS"、"从 OSS 加载数据"、"Volume 导入"时触发。
|
|
9
|
-
包含 PIPE 持续导入(两种 INGEST_MODE)、批量导入(Volume + COPY/INSERT)、Connection/Volume 创建、
|
|
10
|
-
监控管理等 ClickZetta 特有逻辑。
|
|
11
|
-
Keywords: OSS, S3, COS, object storage, PIPE, COPY INTO, file ingestion
|
|
12
|
-
---
|
|
13
|
-
|
|
14
|
-
# 对象存储数据管道搭建工作流
|
|
15
|
-
|
|
16
|
-
## 适用场景
|
|
17
|
-
|
|
18
|
-
- 从对象存储(阿里云 OSS / AWS S3 / 腾讯云 COS)持续自动导入数据到 Lakehouse(PIPE 模式)
|
|
19
|
-
- 从对象存储批量一次性导入数据到 Lakehouse(Volume + COPY/INSERT 模式)
|
|
20
|
-
- 需要微批处理方式加载新增文件,实现近实时数据同步
|
|
21
|
-
- 需要选择扫描模式(LIST_PURGE)或消息通知模式(EVENT_NOTIFICATION)
|
|
22
|
-
- 需要对导入数据进行过滤转换(WHERE 条件、指定文件)
|
|
23
|
-
- 关键词:OSS PIPE、S3 导入、对象存储管道、文件自动加载、PIPE 持续导入、COS 数据同步、批量导入、Volume 导入
|
|
24
|
-
|
|
25
|
-
## 前置依赖
|
|
26
|
-
|
|
27
|
-
- ClickZetta Lakehouse 账户,具备创建 PIPE、表、存储连接、Volume 等权限
|
|
28
|
-
- 对象存储桶可达(Endpoint、AccessKey 或 Role ARN)
|
|
29
|
-
- clickzetta-studio-mcp 工具可用(`LH_execute_query`、`LH_show_object_list` 等)
|
|
30
|
-
|
|
31
|
-
## 核心概念
|
|
32
|
-
|
|
33
|
-
### INGEST_MODE 选择指引
|
|
34
|
-
|
|
35
|
-
| 模式 | 触发方式 | 适用场景 | 云平台支持 | 授权方式 |
|
|
36
|
-
|------|---------|---------|-----------|---------|
|
|
37
|
-
| `LIST_PURGE` | 定期扫描目录 | 通用场景,导入后删除源文件 | 所有云平台 | 密钥 或 Role ARN |
|
|
38
|
-
| `EVENT_NOTIFICATION` | 消息服务通知 | 低延迟场景,文件上传即触发 | 仅阿里云 OSS + AWS S3 | 仅 Role ARN |
|
|
39
|
-
|
|
40
|
-
### 关键限制
|
|
41
|
-
|
|
42
|
-
- 每个 PIPE 需对应独立的 Volume,不可复用
|
|
43
|
-
- 不支持修改 COPY 语句逻辑,需删除 PIPE 重新创建
|
|
44
|
-
- PIPE 中的 COPY 语句不支持 `files` / `regexp` / `subdirectory` 参数
|
|
45
|
-
- 数据加载无法保证严格有序
|
|
46
|
-
- `load_history` 去重记录保留 7 天
|
|
47
|
-
- 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数
|
|
48
|
-
- **Volume PIPE 不支持 Kafka 专用参数**:`BATCH_INTERVAL_IN_SECONDS`、`BATCH_SIZE_PER_KAFKA_PARTITION`、`MAX_SKIP_BATCH_COUNT_ON_ERROR` 仅适用于 Kafka PIPE
|
|
49
|
-
- **`COPY_JOB_HINT` 必须是合法 JSON 格式**,键值都要用双引号:`'{"IGNORE_TMP_FILE": "true"}'`,不能用 `KEY=VALUE` 格式
|
|
50
|
-
|
|
51
|
-
### 文件大小建议
|
|
52
|
-
|
|
53
|
-
- gzip 压缩文件:≈ 50MB
|
|
54
|
-
- CSV / PARQUET 未压缩文件:128MB ~ 256MB
|
|
55
|
-
|
|
56
|
-
## 工作流
|
|
57
|
-
|
|
58
|
-
### 模式 A:LIST_PURGE 扫描模式(通用)
|
|
59
|
-
|
|
60
|
-
#### 步骤 1:创建存储连接(Storage Connection)
|
|
61
|
-
|
|
62
|
-
```sql
|
|
63
|
-
-- 使用 LH_execute_query 执行
|
|
64
|
-
-- 密钥方式(LIST_PURGE 模式支持)
|
|
65
|
-
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
|
|
66
|
-
TYPE OSS
|
|
67
|
-
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com'
|
|
68
|
-
ACCESS_KEY = '<your_access_key>'
|
|
69
|
-
SECRET_KEY = '<your_secret_key>'
|
|
70
|
-
COMMENT = 'OSS connection for data pipeline';
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
> **提示**:如果使用 Role ARN 方式(EVENT_NOTIFICATION 模式必须),参见下方"模式 B"中的 Connection 创建语法。
|
|
74
|
-
|
|
75
|
-
#### 步骤 2:创建外部 Volume
|
|
76
|
-
|
|
77
|
-
```sql
|
|
78
|
-
-- 使用 LH_execute_query 执行
|
|
79
|
-
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
|
|
80
|
-
STORAGE_CONNECTION = my_oss_connection
|
|
81
|
-
LOCATION = 'oss://my-bucket/data-path/'
|
|
82
|
-
COMMENT = 'Volume for OSS PIPE ingestion';
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
> **关键参数**:
|
|
86
|
-
> - 如需递归扫描子目录:添加 `recursive = true`
|
|
87
|
-
> - 如需自动刷新目录元数据:添加 `directory = (enable = true, auto_refresh = true)`
|
|
88
|
-
|
|
89
|
-
#### 步骤 3:验证 COPY INTO 可独立运行
|
|
90
|
-
|
|
91
|
-
在创建 PIPE 之前,先用 COPY INTO 验证数据能正常加载:
|
|
92
|
-
|
|
93
|
-
```sql
|
|
94
|
-
-- 使用 LH_execute_query 执行
|
|
95
|
-
COPY INTO my_schema.target_table
|
|
96
|
-
FROM VOLUME pipe_volume
|
|
97
|
-
USING CSV
|
|
98
|
-
OPTIONS (
|
|
99
|
-
'header' = 'true',
|
|
100
|
-
'delimiter' = ','
|
|
101
|
-
);
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
> **重要**:PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数。确保此处验证时也不使用这些参数。
|
|
105
|
-
|
|
106
|
-
#### 步骤 4:创建 PIPE(LIST_PURGE 模式)
|
|
107
|
-
|
|
108
|
-
```sql
|
|
109
|
-
-- 使用 LH_execute_query 执行
|
|
110
|
-
CREATE PIPE IF NOT EXISTS my_oss_pipe
|
|
111
|
-
INGEST_MODE = 'LIST_PURGE'
|
|
112
|
-
VIRTUAL_CLUSTER = 'my_vc'
|
|
113
|
-
COMMENT = 'OSS data pipeline - scan mode'
|
|
114
|
-
AS
|
|
115
|
-
COPY INTO my_schema.target_table
|
|
116
|
-
FROM VOLUME pipe_volume
|
|
117
|
-
USING CSV
|
|
118
|
-
OPTIONS (
|
|
119
|
-
'header' = 'true',
|
|
120
|
-
'delimiter' = ',',
|
|
121
|
-
'purge' = 'true'
|
|
122
|
-
);
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
> **参数说明**:
|
|
126
|
-
> - `INGEST_MODE = 'LIST_PURGE'`:定期扫描 Volume 目录,发现新文件即加载
|
|
127
|
-
> - `purge = true`:**LIST_PURGE 模式必须设置**。加载成功后会删除源文件,PIPE 依靠这一点避免重复导入;如果不设置,已加载的文件会在后续扫描中被再次导入。即使希望保留源文件,此参数也不能省略(需要保留时请另行备份源文件)
|
|
128
|
-
> - `VIRTUAL_CLUSTER`:指定执行 PIPE 任务的虚拟集群
|
|
129
|
-
|
|
130
|
-
#### 步骤 5:验证 PIPE 状态
|
|
131
|
-
|
|
132
|
-
```sql
|
|
133
|
-
-- 使用 LH_execute_query 执行
|
|
134
|
-
DESC PIPE EXTENDED my_oss_pipe;
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
确认 `pipe_execution_paused = false`(PIPE 已启动运行)。
|
|
138
|
-
|
|
139
|
-
---
|
|
140
|
-
|
|
141
|
-
### 模式 B:EVENT_NOTIFICATION 消息通知模式(低延迟)
|
|
142
|
-
|
|
143
|
-
> 仅支持阿里云 OSS 和 AWS S3(不支持腾讯云 COS)。文件上传到桶后,通过对应云平台的消息服务(阿里云 MNS / AWS SQS)通知 Lakehouse 立即加载。
|
|
144
|
-
|
|
145
|
-
#### 前置准备(阿里云 OSS 示例)
|
|
146
|
-
|
|
147
|
-
1. **开通阿里云 MNS 消息服务**:在阿里云控制台开通消息服务 MNS
|
|
148
|
-
2. **配置 OSS 事件通知**:在 OSS 桶 → 事件通知 → 创建规则,事件类型选择 `ObjectCreated`,目标选择 MNS 队列
|
|
149
|
-
3. **授权 OSS 读取权限**:创建 RAM 角色,授予 `oss:GetObject`、`oss:ListBucket` 权限,记录 Role ARN
|
|
150
|
-
4. **授权 MNS 到 Lakehouse**:将 Lakehouse 服务账号添加到 MNS 队列的授权策略中
|
|
151
|
-
|
|
152
|
-
#### 步骤 1:创建存储连接(Role ARN 方式)
|
|
153
|
-
|
|
154
|
-
```sql
|
|
155
|
-
-- 使用 LH_execute_query 执行
|
|
156
|
-
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
|
|
157
|
-
TYPE OSS
|
|
158
|
-
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com'
|
|
159
|
-
ROLE_ARN = 'acs:ram::1234567890:role/clickzetta-oss-role'
|
|
160
|
-
REGION = 'cn-hangzhou'
|
|
161
|
-
COMMENT = 'OSS connection via Role ARN for event notification mode';
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
#### 步骤 2:创建外部 Volume
|
|
165
|
-
|
|
166
|
-
```sql
|
|
167
|
-
-- 使用 LH_execute_query 执行
|
|
168
|
-
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
|
|
169
|
-
STORAGE_CONNECTION = my_oss_role_connection
|
|
170
|
-
LOCATION = 'oss://my-bucket/data-path/';
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
#### 步骤 3:创建 PIPE(EVENT_NOTIFICATION 模式)
|
|
174
|
-
|
|
175
|
-
```sql
|
|
176
|
-
-- 使用 LH_execute_query 执行
|
|
177
|
-
CREATE PIPE IF NOT EXISTS my_oss_event_pipe
|
|
178
|
-
INGEST_MODE = 'EVENT_NOTIFICATION'
|
|
179
|
-
VIRTUAL_CLUSTER = 'my_vc'
|
|
180
|
-
ALICLOUD_MNS_QUEUE = 'my-mns-queue-name'
|
|
181
|
-
COMMENT = 'OSS data pipeline - event notification mode'
|
|
182
|
-
AS
|
|
183
|
-
COPY INTO my_schema.target_table
|
|
184
|
-
FROM VOLUME pipe_event_volume
|
|
185
|
-
USING CSV
|
|
186
|
-
OPTIONS (
|
|
187
|
-
'header' = 'true',
|
|
188
|
-
'delimiter' = ','
|
|
189
|
-
);
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
> **参数说明**:
|
|
193
|
-
> - `INGEST_MODE = 'EVENT_NOTIFICATION'`:通过消息通知触发加载
|
|
194
|
-
> - `ALICLOUD_MNS_QUEUE`:阿里云 MNS 队列名称(AWS 使用 `AWS_SQS_QUEUE`)
|
|
195
|
-
> - 此模式下不需要 `purge = true`,因为是事件驱动而非扫描
|
|
196
|
-
|
|
197
|
-
---
|
|
198
|
-
|
|
199
|
-
### 模式 C:批量导入(一次性 Volume + COPY/INSERT)
|
|
200
|
-
|
|
201
|
-
> 适用于一次性或定期批量加载对象存储中的文件,无需创建 PIPE。支持阿里云 OSS、腾讯云 COS 和 AWS S3。
|
|
202
|
-
> 推荐使用 GENERAL PURPOSE 类型的虚拟集群执行批量加载。
|
|
203
|
-
|
|
204
|
-
#### 使用限制
|
|
205
|
-
|
|
206
|
-
- 不支持跨云导入(源存储与 Lakehouse 环境需在同一云平台)
|
|
207
|
-
- 同地域建议使用内网 Endpoint(如 `oss-cn-shanghai-internal.aliyuncs.com`)以提升速度和稳定性
|
|
208
|
-
|
|
209
|
-
#### 步骤 1:创建目标表
|
|
210
|
-
|
|
211
|
-
```sql
|
|
212
|
-
-- 使用 LH_execute_query 执行
|
|
213
|
-
CREATE TABLE IF NOT EXISTS my_schema.target_table (
|
|
214
|
-
id STRING,
|
|
215
|
-
name STRING,
|
|
216
|
-
amount DECIMAL(10,2),
|
|
217
|
-
created_date STRING
|
|
218
|
-
);
|
|
219
|
-
```
|
|
220
|
-
|
|
221
|
-
#### 步骤 2:创建存储连接(access_id/access_key 语法)
|
|
222
|
-
|
|
223
|
-
```sql
|
|
224
|
-
-- 使用 LH_execute_query 执行
|
|
225
|
-
-- 批量导入场景使用 access_id / access_key 语法
|
|
226
|
-
CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
|
|
227
|
-
TYPE OSS
|
|
228
|
-
ENDPOINT = 'oss-cn-shanghai-internal.aliyuncs.com'
|
|
229
|
-
access_id = '<your_access_id>'
|
|
230
|
-
access_key = '<your_access_key>'
|
|
231
|
-
COMMENTS = 'OSS batch import connection';
|
|
232
|
-
```
|
|
233
|
-
|
|
234
|
-
> **注意**:批量导入场景中 Connection 参数使用 `access_id` / `access_key`(小写),与 PIPE 模式中的 `ACCESS_KEY` / `SECRET_KEY` 写法不同,两种写法均可使用。
|
|
235
|
-
|
|
236
|
-
#### 步骤 3:创建外部 Volume(启用目录自动刷新)
|
|
237
|
-
|
|
238
|
-
```sql
|
|
239
|
-
-- 使用 LH_execute_query 执行
|
|
240
|
-
CREATE EXTERNAL VOLUME IF NOT EXISTS my_batch_volume
|
|
241
|
-
LOCATION 'oss://my-bucket/data-path/'
|
|
242
|
-
USING CONNECTION my_batch_conn
|
|
243
|
-
DIRECTORY = (enable=true, auto_refresh=true);
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
> **关键参数**:
|
|
247
|
-
> - `LOCATION`:对象存储路径,格式为 `oss://bucket/path/`
|
|
248
|
-
> - `USING CONNECTION`:引用已创建的存储连接
|
|
249
|
-
> - `DIRECTORY = (enable=true, auto_refresh=true)`:启用目录元数据并自动刷新,便于查询 Volume 中的文件列表
|
|
250
|
-
>
|
|
251
|
-
> **注意**:批量导入 Volume 使用 `LOCATION ... USING CONNECTION ...` 语法;PIPE 模式 Volume 使用 `STORAGE_CONNECTION = ... LOCATION = ...` 语法。两种语法均有效,适用于不同场景,不可混用。
|
|
252
|
-
|
|
253
|
-
#### 步骤 4a:INSERT INTO 从 Volume 导入(支持过滤转换)
|
|
254
|
-
|
|
255
|
-
```sql
|
|
256
|
-
-- 使用 LH_execute_query 执行
|
|
257
|
-
INSERT INTO my_schema.target_table
|
|
258
|
-
SELECT * FROM VOLUME my_batch_volume (
|
|
259
|
-
id STRING,
|
|
260
|
-
name STRING,
|
|
261
|
-
amount DECIMAL(10,2),
|
|
262
|
-
created_date STRING
|
|
263
|
-
) USING CSV OPTIONS ('header'='true', 'sep'=',')
|
|
264
|
-
FILES ('data_file_01.csv')
|
|
265
|
-
WHERE amount > 0;
|
|
266
|
-
```
|
|
267
|
-
|
|
268
|
-
> **参数说明**:
|
|
269
|
-
> - `VOLUME my_batch_volume (...)`:指定 Volume 及列定义(Schema-on-Read)
|
|
270
|
-
> - `USING CSV OPTIONS (...)`:指定文件格式和解析选项
|
|
271
|
-
> - `FILES ('file1.csv', 'file2.csv')`:指定要加载的文件名(可选,不指定则加载全部)
|
|
272
|
-
> - `WHERE ...`:对数据进行过滤转换(可选)
|
|
273
|
-
> - INSERT INTO 方式支持 `FILES` 和 `WHERE` 参数,适合需要精细控制的场景
|
|
274
|
-
|
|
275
|
-
#### 步骤 4b:COPY INTO 从 Volume 导入(简洁语法)
|
|
276
|
-
|
|
277
|
-
```sql
|
|
278
|
-
-- 使用 LH_execute_query 执行
|
|
279
|
-
COPY INTO my_schema.target_table
|
|
280
|
-
FROM VOLUME my_batch_volume (
|
|
281
|
-
id STRING,
|
|
282
|
-
name STRING,
|
|
283
|
-
amount DECIMAL(10,2),
|
|
284
|
-
created_date STRING
|
|
285
|
-
) USING CSV OPTIONS ('header'='true', 'sep'=',');
|
|
286
|
-
```
|
|
287
|
-
|
|
288
|
-
> **INSERT INTO vs COPY INTO 选择**:
|
|
289
|
-
> - `INSERT INTO`:支持 `FILES()` 指定文件、`WHERE` 过滤转换,适合精细控制
|
|
290
|
-
> - `COPY INTO`:语法更简洁,适合全量加载
|
|
291
|
-
> - 两者都支持 Schema-on-Read(在 FROM VOLUME 中定义列)
|
|
292
|
-
|
|
293
|
-
#### 步骤 5:验证导入结果
|
|
294
|
-
|
|
295
|
-
```sql
|
|
296
|
-
-- 使用 LH_execute_query 执行
|
|
297
|
-
SELECT COUNT(*) AS total_rows FROM my_schema.target_table;
|
|
298
|
-
SELECT * FROM my_schema.target_table LIMIT 10;
|
|
299
|
-
```
|
|
300
|
-
|
|
301
|
-
---
|
|
302
|
-
|
|
303
|
-
## 监控与运维
|
|
304
|
-
|
|
305
|
-
### 查看 PIPE 详细状态
|
|
306
|
-
|
|
307
|
-
```sql
|
|
308
|
-
-- 使用 LH_execute_query 执行
|
|
309
|
-
DESC PIPE EXTENDED my_oss_pipe;
|
|
310
|
-
```
|
|
311
|
-
|
|
312
|
-
关键字段:
|
|
313
|
-
- `pipe_execution_paused`:是否暂停
|
|
314
|
-
- `ingest_mode`:导入模式
|
|
315
|
-
- `virtual_cluster`:执行集群
|
|
316
|
-
- `definition`:COPY 语句定义
|
|
317
|
-
|
|
318
|
-
### 查看加载历史
|
|
319
|
-
|
|
320
|
-
```sql
|
|
321
|
-
-- 使用 LH_execute_query 执行
|
|
322
|
-
SELECT * FROM TABLE(load_history('my_schema.target_table'))
|
|
323
|
-
ORDER BY last_load_time DESC
|
|
324
|
-
LIMIT 20;
|
|
325
|
-
```
|
|
326
|
-
|
|
327
|
-
> `load_history` 去重记录保留 7 天。
|
|
328
|
-
|
|
329
|
-
### 通过 query_tag 过滤 PIPE 作业
|
|
330
|
-
|
|
331
|
-
PIPE 执行的作业会自动打上 `query_tag`,格式为:`pipe.<workspace_name>.<schema_name>.<pipe_name>`
|
|
332
|
-
|
|
333
|
-
```sql
|
|
334
|
-
-- 使用 LH_execute_query 执行
|
|
335
|
-
-- 在 JOBS 列表中过滤 PIPE 相关作业
|
|
336
|
-
SHOW JOBS WHERE query_tag = 'pipe.my_workspace.my_schema.my_oss_pipe';
|
|
337
|
-
```
|
|
338
|
-
|
|
339
|
-
---
|
|
340
|
-
|
|
341
|
-
## PIPE 管理操作
|
|
342
|
-
|
|
343
|
-
### 暂停 / 恢复 PIPE
|
|
344
|
-
|
|
345
|
-
```sql
|
|
346
|
-
-- 暂停 PIPE
|
|
347
|
-
ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = true;
|
|
348
|
-
|
|
349
|
-
-- 恢复 PIPE
|
|
350
|
-
ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = false;
|
|
351
|
-
```
|
|
352
|
-
|
|
353
|
-
### 修改 PIPE 属性
|
|
354
|
-
|
|
355
|
-
```sql
|
|
356
|
-
-- 修改虚拟集群
|
|
357
|
-
ALTER PIPE my_oss_pipe SET VIRTUAL_CLUSTER = 'new_vc';
|
|
358
|
-
|
|
359
|
-
-- 修改 COPY_JOB_HINT(注意:会覆盖所有已有 hints,需一次性设置全部参数)
|
|
360
|
-
-- 必须是合法 JSON 格式,键值都要用双引号
|
|
361
|
-
ALTER PIPE my_oss_pipe SET COPY_JOB_HINT = '{"max_file_count":"100","force":"false"}';
|
|
362
|
-
```
|
|
363
|
-
|
|
364
|
-
> **限制**:每次 ALTER PIPE 只能修改一个属性。不支持修改 COPY 语句逻辑,需删除 PIPE 重新创建。
|
|
365
|
-
|
|
366
|
-
### 删除 PIPE
|
|
367
|
-
|
|
368
|
-
```sql
|
|
369
|
-
DROP PIPE IF EXISTS my_oss_pipe;
|
|
370
|
-
```
|
|
371
|
-
|
|
372
|
-
---
|
|
373
|
-
|
|
374
|
-
## 故障排除
|
|
375
|
-
|
|
376
|
-
| 问题 | 排查方向 |
|
|
377
|
-
|------|---------|
|
|
378
|
-
| PIPE 创建后无数据加载 | 1. `DESC PIPE EXTENDED` 检查是否暂停 2. 确认 Volume 路径下有新文件 3. 检查 COPY INTO 是否能独立运行 |
|
|
379
|
-
| LIST_PURGE 模式文件未被删除 | 确认 `purge = true` 已设置;检查 Connection 使用的凭证(AccessKey 或 Role ARN)是否具备删除对象的权限 |
|
|
380
|
-
| EVENT_NOTIFICATION 模式无触发 | 1. 检查 MNS/SQS 队列是否收到消息 2. 确认 OSS 事件通知规则配置正确 3. 检查 Role ARN 授权 |
|
|
381
|
-
| 重复加载数据 | `load_history` 去重记录仅保留 7 天,超过 7 天的同名文件会被重新加载 |
|
|
382
|
-
| COPY_JOB_HINT 修改后部分参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需在一次 ALTER 中设置全部参数 |
|
|
383
|
-
|
|
384
|
-
## 注意事项
|
|
385
|
-
|
|
386
|
-
### PIPE 持续导入(模式 A / B)
|
|
387
|
-
|
|
388
|
-
- 每个 PIPE 需对应独立的 Volume,不可多个 PIPE 共用同一 Volume
|
|
389
|
-
- PIPE 中的 COPY 语句不支持 `files` / `regexp` / `subdirectory` 参数
|
|
390
|
-
- 数据加载无法保证严格有序(多文件并行加载)
|
|
391
|
-
- 推荐文件大小:gzip 压缩 ≈ 50MB,CSV/Parquet 未压缩 128MB ~ 256MB
|
|
392
|
-
- `load_history` 去重记录保留 7 天,超期后同名文件可能被重复加载
|
|
393
|
-
- 修改 COPY 逻辑需删除 PIPE 重新创建,ALTER PIPE 不支持修改 COPY 语句
|
|
394
|
-
|
|
395
|
-
### 批量导入(模式 C)
|
|
396
|
-
|
|
397
|
-
- Volume 支持阿里云 OSS、腾讯云 COS 和 AWS S3
|
|
398
|
-
- 不支持跨云导入(源存储与 Lakehouse 环境需在同一云平台)
|
|
399
|
-
- 同地域建议使用内网 Endpoint 以提升传输速度和稳定性
|
|
400
|
-
- 推荐使用 GENERAL PURPOSE 类型虚拟集群执行批量加载任务
|
|
401
|
-
- INSERT INTO 方式支持 `FILES()` 和 `WHERE` 参数,COPY INTO 不支持
|
|
402
|
-
- Connection 参数 `access_id`/`access_key` 和 `ACCESS_KEY`/`SECRET_KEY` 两种写法均可使用
|
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-query-optimizer
|
|
3
|
-
description: |
|
|
4
|
-
诊断和优化 ClickZetta Lakehouse SQL 查询性能。覆盖执行计划分析、慢查询排查、
|
|
5
|
-
结果缓存、小文件合并、Map Join 优化、Sort Key 推荐等完整调优工作流。
|
|
6
|
-
当用户说"查询慢"、"SQL 性能优化"、"执行计划"、"EXPLAIN"、"查看 Job"、
|
|
7
|
-
"慢查询"、"小文件"、"OPTIMIZE"、"结果缓存"、"Result Cache"、
|
|
8
|
-
"Map Join"、"排序列"、"sort key"、"查询调优"、"性能诊断"时触发。
|
|
9
|
-
Keywords: query optimization, EXPLAIN, execution plan, slow query, cache, Map Join, Sort Key
|
|
10
|
-
---
|
|
11
|
-
|
|
12
|
-
# ClickZetta 查询性能优化
|
|
13
|
-
|
|
14
|
-
## ⚠️ 注意事项
|
|
15
|
-
|
|
16
|
-
- `OPTIMIZE` 命令只能在**通用型(GENERAL PURPOSE)计算集群**运行,分析型集群不生效
|
|
17
|
-
- Result Cache 默认未开启,需手动 `SET cz.sql.enable.shortcut.result.cache = true`
|
|
18
|
-
- Map Join 小表限制为 **1GB**,超过则失败
|
|
19
|
-
|
|
20
|
-
---
|
|
21
|
-
|
|
22
|
-
## 诊断流程
|
|
23
|
-
|
|
24
|
-
```
|
|
25
|
-
查询慢
|
|
26
|
-
├── 1. 先看执行计划(EXPLAIN)
|
|
27
|
-
│ ├── 发现全表扫描 → 考虑加索引或设置 sort key
|
|
28
|
-
│ ├── 发现大表 JOIN 小表(小表 <1GB)→ 考虑 MAPJOIN hint
|
|
29
|
-
│ └── 发现大量 Sort → 检查 ORDER BY 是否必要
|
|
30
|
-
├── 2. 查看 Job 历史(SHOW JOBS)
|
|
31
|
-
│ └── 找到慢 Job → 在 Studio Job Profile 查看详细执行统计
|
|
32
|
-
├── 3. 检查小文件问题
|
|
33
|
-
│ └── 频繁写入的表 → OPTIMIZE 合并小文件
|
|
34
|
-
└── 4. 利用缓存
|
|
35
|
-
└── 重复查询 → 开启 Result Cache
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
---
|
|
39
|
-
|
|
40
|
-
## 步骤 1:分析执行计划
|
|
41
|
-
|
|
42
|
-
阅读 [references/explain.md](references/explain.md)
|
|
43
|
-
|
|
44
|
-
```sql
|
|
45
|
-
-- 快速查看物理执行计划
|
|
46
|
-
EXPLAIN SELECT ...;
|
|
47
|
-
|
|
48
|
-
-- 详细查看逻辑+物理执行计划
|
|
49
|
-
EXPLAIN EXTENDED SELECT ...;
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
重点关注:
|
|
53
|
-
- `PhysicalTableScan` 是否扫描了过多数据
|
|
54
|
-
- `PhysicalJoin` 的策略(是否触发 MapJoin)
|
|
55
|
-
- `PhysicalSort` 是否可以避免
|
|
56
|
-
|
|
57
|
-
---
|
|
58
|
-
|
|
59
|
-
## 步骤 2:查看慢查询 Job
|
|
60
|
-
|
|
61
|
-
阅读 [references/show-jobs.md](references/show-jobs.md)
|
|
62
|
-
|
|
63
|
-
```sql
|
|
64
|
-
-- 查看执行超过 2 分钟的 Job
|
|
65
|
-
SHOW JOBS IN VCLUSTER default_ap WHERE execution_time > interval 2 minute;
|
|
66
|
-
|
|
67
|
-
-- 查看最近 50 条 Job
|
|
68
|
-
SHOW JOBS LIMIT 50;
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
找到 Job ID 后,在 Studio → Job Profile 查看详细执行统计和执行计划图。
|
|
72
|
-
|
|
73
|
-
---
|
|
74
|
-
|
|
75
|
-
## 步骤 3:小文件优化
|
|
76
|
-
|
|
77
|
-
阅读 [references/optimize.md](references/optimize.md)
|
|
78
|
-
|
|
79
|
-
```sql
|
|
80
|
-
-- 手动合并小文件(异步,立即返回)
|
|
81
|
-
OPTIMIZE my_schema.orders;
|
|
82
|
-
|
|
83
|
-
-- 指定分区合并
|
|
84
|
-
OPTIMIZE my_schema.orders WHERE dt = '2024-01-01';
|
|
85
|
-
|
|
86
|
-
-- 同步执行(等待完成)
|
|
87
|
-
OPTIMIZE my_schema.orders OPTIONS('cz.sql.optimize.table.async' = 'false');
|
|
88
|
-
|
|
89
|
-
-- 写入时自动触发合并
|
|
90
|
-
SET cz.sql.compaction.after.commit = true;
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
---
|
|
94
|
-
|
|
95
|
-
## 步骤 4:开启结果缓存
|
|
96
|
-
|
|
97
|
-
阅读 [references/result-cache.md](references/result-cache.md)
|
|
98
|
-
|
|
99
|
-
```sql
|
|
100
|
-
-- 开启 Result Cache(SESSION 级别)
|
|
101
|
-
SET cz.sql.enable.shortcut.result.cache = true;
|
|
102
|
-
|
|
103
|
-
-- 关闭
|
|
104
|
-
SET cz.sql.enable.shortcut.result.cache = false;
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
命中缓存的查询通常在 15ms 内返回。在 Job Profile 中可看到 `JOB RESULT REUSE` 标记。
|
|
108
|
-
|
|
109
|
-
---
|
|
110
|
-
|
|
111
|
-
## 步骤 5:Map Join 与 Sort Key
|
|
112
|
-
|
|
113
|
-
阅读 [references/hints-and-sortkey.md](references/hints-and-sortkey.md)
|
|
114
|
-
|
|
115
|
-
```sql
|
|
116
|
-
-- Map Join:小表(<1GB)与大表 JOIN 时使用
|
|
117
|
-
SELECT /*+ MAPJOIN (t2) */ *
|
|
118
|
-
FROM large_table t1
|
|
119
|
-
JOIN small_table t2 ON t1.id = t2.id;
|
|
120
|
-
|
|
121
|
-
-- 查看系统推荐的 Sort Key
|
|
122
|
-
SELECT * FROM information_schema.sortkey_candidates;
|
|
123
|
-
|
|
124
|
-
-- 应用推荐(直接执行 statement 列中的 SQL)
|
|
125
|
-
ALTER TABLE schema.table_name SET PROPERTIES("hint.sort.columns"="column_name");
|
|
126
|
-
|
|
127
|
-
-- 开启自动收集 Sort Key 推荐
|
|
128
|
-
ALTER WORKSPACE my_workspace SET PROPERTIES (auto_index='day');
|
|
129
|
-
|
|
130
|
-
-- 收集表统计信息(Sort Key 推荐为空时先执行)
|
|
131
|
-
ANALYZE TABLE schema.table_name;
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
---
|
|
135
|
-
|
|
136
|
-
## 常见问题
|
|
137
|
-
|
|
138
|
-
| 问题 | 排查方向 |
|
|
139
|
-
|---|---|
|
|
140
|
-
| 查询慢但执行计划看起来正常 | 检查小文件数量(`SHOW PARTITIONS EXTENDED`),考虑 OPTIMIZE |
|
|
141
|
-
| Result Cache 未命中 | 检查 SQL 是否完全一致、是否含 UDF 或非确定性函数、表数据是否有变更 |
|
|
142
|
-
| OPTIMIZE 无效 | 确认使用的是通用型(GP)集群,不是分析型集群 |
|
|
143
|
-
| Map Join 失败 | 小表超过 1GB,改用普通 JOIN 或拆分查询 |
|
|
144
|
-
| Sort Key 推荐为空 | 先执行 `ANALYZE TABLE`,再等待自动收集周期 |
|
|
145
|
-
|
|
146
|
-
---
|
|
147
|
-
|
|
148
|
-
## 参考文档
|
|
149
|
-
|
|
150
|
-
- [EXPLAIN](https://www.yunqi.tech/documents/EXPLAIN)
|
|
151
|
-
- [SHOW JOBS](https://www.yunqi.tech/documents/show-jobs)
|
|
152
|
-
- [Result Cache](https://www.yunqi.tech/documents/result_cache)
|
|
153
|
-
- [OPTIMIZE](https://www.yunqi.tech/documents/OPTIMIZE)
|
|
154
|
-
- [小文件优化](https://www.yunqi.tech/documents/small_file_optimization)
|
|
155
|
-
- [Map Join](https://www.yunqi.tech/documents/mapjoin)
|
|
156
|
-
- [推荐排序列](https://www.yunqi.tech/documents/auto-index)
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# EXPLAIN 命令参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/EXPLAIN
|
|
4
|
-
|
|
5
|
-
## 语法
|
|
6
|
-
|
|
7
|
-
```sql
|
|
8
|
-
EXPLAIN [EXTENDED] query_statement
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
## 两种模式
|
|
12
|
-
|
|
13
|
-
### 基础模式(EXPLAIN)
|
|
14
|
-
|
|
15
|
-
显示物理执行计划,用于快速理解查询执行方式。
|
|
16
|
-
|
|
17
|
-
```sql
|
|
18
|
-
EXPLAIN SELECT * FROM orders LIMIT 5;
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
输出示例:
|
|
22
|
-
```
|
|
23
|
-
Type: DML
|
|
24
|
-
Plan: PhysicalTableSink() name=TableSink0 stage=stg0
|
|
25
|
-
PhysicalTableScan(orders, a) as [0] name=TableScan1
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
### 扩展模式(EXPLAIN EXTENDED)
|
|
29
|
-
|
|
30
|
-
显示完整的逻辑执行计划 + 物理执行计划,包含表达式转换、系统列、优化过程。
|
|
31
|
-
|
|
32
|
-
```sql
|
|
33
|
-
EXPLAIN EXTENDED SELECT * FROM orders LIMIT 5;
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
输出包含:
|
|
37
|
-
- `[LogicalPlan]`:逻辑执行计划
|
|
38
|
-
- `[PhysicalPlan]`:物理执行计划
|
|
39
|
-
- 系统隐藏列信息(`__commit_version`、`__change_type` 等)
|
|
40
|
-
|
|
41
|
-
## 常见操作符说明
|
|
42
|
-
|
|
43
|
-
| 操作符 | 说明 | 性能特征 |
|
|
44
|
-
|---|---|---|
|
|
45
|
-
| PhysicalTableScan | 从表读取数据 | 基础 I/O 操作 |
|
|
46
|
-
| PhysicalTableSink | 输出查询结果 | 固定开销 |
|
|
47
|
-
| PhysicalSort | 对数据排序 | O(n log n),可能成为瓶颈 |
|
|
48
|
-
| PhysicalFilter | 条件过滤 | 线性操作,早期过滤是最佳实践 |
|
|
49
|
-
| PhysicalHashAggregate | 聚合操作 | 根据 GROUP BY 基数变化 |
|
|
50
|
-
| PhysicalJoin | JOIN 操作 | 复杂度取决于 JOIN 策略和数据量 |
|
|
51
|
-
|
|
52
|
-
## 使用建议
|
|
53
|
-
|
|
54
|
-
- 先用 `EXPLAIN` 快速确认执行路径
|
|
55
|
-
- 发现异常(如全表扫描、大量 Sort)再用 `EXPLAIN EXTENDED` 深入分析
|
|
56
|
-
- 关注 PhysicalJoin 的策略:是否触发了 MapJoin(小表广播)
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
# Map Join 与 Sort Key 推荐参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/mapjoin 和 https://www.yunqi.tech/documents/auto-index
|
|
4
|
-
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
## Map Join(小表广播优化)
|
|
8
|
-
|
|
9
|
-
### 语法
|
|
10
|
-
|
|
11
|
-
```sql
|
|
12
|
-
SELECT /*+ MAPJOIN (small_table_alias) */ *
|
|
13
|
-
FROM large_table t1
|
|
14
|
-
JOIN small_table t2 ON t1.id = t2.id;
|
|
15
|
-
```
|
|
16
|
-
|
|
17
|
-
### 说明
|
|
18
|
-
|
|
19
|
-
- 将小表广播到各节点,在 Map 阶段完成 JOIN,避免 Shuffle
|
|
20
|
-
- **小表大小限制:1GB**,超过则 Map Join 失败或退化为普通 JOIN
|
|
21
|
-
- 适用于小表 JOIN 大表,不适用于大表 JOIN 大表
|
|
22
|
-
|
|
23
|
-
### 示例
|
|
24
|
-
|
|
25
|
-
```sql
|
|
26
|
-
-- 员工与部门关联
|
|
27
|
-
SELECT /*+ MAPJOIN (dept) */ *
|
|
28
|
-
FROM employees emp
|
|
29
|
-
JOIN departments dept ON emp.dept_id = dept.dept_id;
|
|
30
|
-
|
|
31
|
-
-- 订单与客户关联
|
|
32
|
-
SELECT /*+ MAPJOIN (customer) */ *
|
|
33
|
-
FROM orders o
|
|
34
|
-
JOIN customers customer ON o.customer_id = customer.customer_id;
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
---
|
|
38
|
-
|
|
39
|
-
## Sort Key 推荐(自动索引建议)
|
|
40
|
-
|
|
41
|
-
### 启用自动收集
|
|
42
|
-
|
|
43
|
-
```sql
|
|
44
|
-
-- 按天收集(推荐)
|
|
45
|
-
ALTER WORKSPACE workspace_name SET PROPERTIES (auto_index='day');
|
|
46
|
-
|
|
47
|
-
-- 自定义参数:天/月, 最近N分钟job, 最少重复次数, 最多job数
|
|
48
|
-
ALTER WORKSPACE workspace_name SET PROPERTIES (auto_index='day,150,5,100');
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
参数说明:
|
|
52
|
-
- 第 1 个参数:`day`(每天)或 `month`(每月 1 号),收集时间为晚上 6 点
|
|
53
|
-
- 第 2 个参数:使用最近多少分钟的 job(默认 150)
|
|
54
|
-
- 第 3 个参数:job 需要重复多少次才被采用(默认 5)
|
|
55
|
-
- 第 4 个参数:每列最多使用的 job 数(默认 100)
|
|
56
|
-
|
|
57
|
-
### 查询推荐结果
|
|
58
|
-
|
|
59
|
-
```sql
|
|
60
|
-
SELECT * FROM information_schema.sortkey_candidates;
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
返回字段:`table_name`、`col`(推荐列)、`statement`(可直接执行的 ALTER 语句)、`ratio`(估算提升效果百分比)
|
|
64
|
-
|
|
65
|
-
### 应用推荐
|
|
66
|
-
|
|
67
|
-
```sql
|
|
68
|
-
-- 直接执行 statement 列中的 SQL 即可设置 sort key
|
|
69
|
-
ALTER TABLE schema.table_name SET PROPERTIES("hint.sort.columns"="column_name");
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
### 建议
|
|
73
|
-
|
|
74
|
-
执行前先对表收集统计信息,提高推荐准确性:
|
|
75
|
-
|
|
76
|
-
```sql
|
|
77
|
-
ANALYZE TABLE schema.table_name;
|
|
78
|
-
```
|