@clickzetta/cz-cli-darwin-arm64 0.3.40 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +31 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +28 -30
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +39 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# 数据分享 DDL 参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/datasharing 等
|
|
4
|
+
|
|
5
|
+
## 概念
|
|
6
|
+
|
|
7
|
+
数据分享(Share)是 Lakehouse 提供的**无复制**跨账户/跨实例数据共享功能:
|
|
8
|
+
- 数据提供方创建 Share 对象,将表或视图授权给指定服务实例
|
|
9
|
+
- 数据消费方通过 `CREATE SCHEMA FROM SHARE` 在本地创建只读 Schema 访问共享数据
|
|
10
|
+
- 数据实时更新,消费方无需同步,无需为存储付费
|
|
11
|
+
|
|
12
|
+
**限制:**
|
|
13
|
+
- 共享数据为**只读**,消费方不可修改
|
|
14
|
+
- 一个 Share 只能包含同一工作空间下的数据
|
|
15
|
+
- 一个 Share 最多包含 1000 个 table/view
|
|
16
|
+
- 不支持二次分享
|
|
17
|
+
- 需要 `instance_admin` 角色才能创建 Share
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 提供方操作(OUTBOUND)
|
|
22
|
+
|
|
23
|
+
### 1. 创建 Share
|
|
24
|
+
|
|
25
|
+
```sql
|
|
26
|
+
CREATE SHARE share_demo;
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### 2. 将表/视图加入 Share
|
|
30
|
+
|
|
31
|
+
```sql
|
|
32
|
+
-- 添加单张表
|
|
33
|
+
GRANT SELECT, READ METADATA ON TABLE public.orders TO SHARE share_demo;
|
|
34
|
+
|
|
35
|
+
-- 添加视图(推荐:用视图控制分享范围)
|
|
36
|
+
GRANT SELECT, READ METADATA ON VIEW public.orders_summary TO SHARE share_demo;
|
|
37
|
+
|
|
38
|
+
-- 添加多张表
|
|
39
|
+
GRANT SELECT, READ METADATA ON TABLE public.orders, public.customers TO SHARE share_demo;
|
|
40
|
+
|
|
41
|
+
-- 添加 Schema 下所有表(含未来新建的表,谨慎使用)
|
|
42
|
+
GRANT SELECT, READ METADATA ON ALL TABLES IN SCHEMA public TO SHARE share_demo;
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### 3. 指定接收实例
|
|
46
|
+
|
|
47
|
+
```sql
|
|
48
|
+
-- 添加接收方实例
|
|
49
|
+
ALTER SHARE share_demo ADD INSTANCE consumer_instance_name;
|
|
50
|
+
|
|
51
|
+
-- 移除接收方实例(立即撤销访问权限)
|
|
52
|
+
ALTER SHARE share_demo REMOVE INSTANCE consumer_instance_name;
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### 4. 从 Share 移除数据对象
|
|
56
|
+
|
|
57
|
+
```sql
|
|
58
|
+
-- 撤销表的分享权限
|
|
59
|
+
REVOKE SELECT, READ METADATA ON TABLE public.orders FROM SHARE share_demo;
|
|
60
|
+
|
|
61
|
+
-- 撤销视图的分享权限
|
|
62
|
+
REVOKE SELECT, READ METADATA ON VIEW public.orders_summary FROM SHARE share_demo;
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 5. 查看与管理
|
|
66
|
+
|
|
67
|
+
```sql
|
|
68
|
+
-- 查看所有 Share
|
|
69
|
+
SHOW SHARES;
|
|
70
|
+
|
|
71
|
+
-- 查看本实例分享出去的 Share
|
|
72
|
+
SHOW SHARES WHERE kind = 'OUTBOUND';
|
|
73
|
+
|
|
74
|
+
-- 查看 Share 包含的数据对象
|
|
75
|
+
DESC SHARE share_demo;
|
|
76
|
+
|
|
77
|
+
-- 删除 Share
|
|
78
|
+
DROP SHARE IF EXISTS share_demo;
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## 消费方操作(INBOUND)
|
|
84
|
+
|
|
85
|
+
### 1. 查看收到的 Share
|
|
86
|
+
|
|
87
|
+
```sql
|
|
88
|
+
-- 查看所有 Share(含 INBOUND)
|
|
89
|
+
SHOW SHARES;
|
|
90
|
+
|
|
91
|
+
-- 只看收到的 Share
|
|
92
|
+
SHOW SHARES WHERE kind = 'INBOUND';
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2. 查看 Share 内容
|
|
96
|
+
|
|
97
|
+
```sql
|
|
98
|
+
-- 格式:DESC SHARE <provider_instance>.<share_name>
|
|
99
|
+
DESC SHARE provider_instance.share_demo;
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
返回字段:`kind`(schema/table/view)、`name`(对象名)、`shared_on`(共享时间)
|
|
103
|
+
|
|
104
|
+
### 3. 创建本地只读 Schema(消费数据)
|
|
105
|
+
|
|
106
|
+
```sql
|
|
107
|
+
-- 格式:CREATE SCHEMA <local_schema> FROM SHARE <instance>.<share>.<schema>
|
|
108
|
+
CREATE SCHEMA data_from_provider FROM SHARE provider_instance.share_demo.public;
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
创建后即可直接查询:
|
|
112
|
+
|
|
113
|
+
```sql
|
|
114
|
+
SELECT * FROM data_from_provider.orders LIMIT 10;
|
|
115
|
+
|
|
116
|
+
-- 与本地表关联查询
|
|
117
|
+
SELECT o.*, c.name
|
|
118
|
+
FROM data_from_provider.orders o
|
|
119
|
+
JOIN my_schema.customers c ON o.customer_id = c.id;
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## SHOW SHARES 返回字段说明
|
|
125
|
+
|
|
126
|
+
| 字段 | 说明 |
|
|
127
|
+
|---|---|
|
|
128
|
+
| share_name | Share 名称 |
|
|
129
|
+
| provider | 提供方租户名 |
|
|
130
|
+
| provider_instance | 提供方服务实例名 |
|
|
131
|
+
| provider_workspace | Share 所属工作空间 |
|
|
132
|
+
| scope | 分享范围(当前仅 PRIVATE) |
|
|
133
|
+
| to_instance | 接收方实例名(逗号分隔) |
|
|
134
|
+
| kind | OUTBOUND(分享出)/ INBOUND(收到) |
|
|
@@ -108,6 +108,16 @@ B. 大奖牌架构(Medallion)
|
|
|
108
108
|
- 实时看板(分钟级延迟)→ 告诉我,方案会有调整
|
|
109
109
|
```
|
|
110
110
|
|
|
111
|
+
### 4. 方案确认时主动提示成本注意事项
|
|
112
|
+
|
|
113
|
+
给出方案选项时,提醒用户以下成本相关决策点,让用户做知情选择:
|
|
114
|
+
|
|
115
|
+
- **Dynamic Table 刷新频率**:按业务时效性选择,不要默认最高频率。T+1 用 `1 DAY`,小时级用 `1 HOUR`,分钟级用 `10~30 MINUTE`
|
|
116
|
+
- **分层数量**:评估 DWS 和 ADS 是否都必要,每多一层 DT 就多一份刷新成本
|
|
117
|
+
- **VCluster 规格**:建议从小规格开始,按需扩容
|
|
118
|
+
|
|
119
|
+
> 具体 CRU 消耗无法在方案设计阶段精确预估,上线后加载 `clickzetta-cost-management` skill 监控实际消耗,按需调整刷新频率和集群规格。
|
|
120
|
+
|
|
111
121
|
---
|
|
112
122
|
|
|
113
123
|
## 第三阶段:方案确认后的完整输出
|
|
@@ -173,35 +183,113 @@ CLUSTERED BY (user_id) INTO 32 BUCKETS
|
|
|
173
183
|
|
|
174
184
|
```
|
|
175
185
|
ODS/Bronze → DWD/Silver:SQL 任务(Studio 调度,清洗逻辑需手动控制)
|
|
176
|
-
DWD/Silver → DWS/Gold:Dynamic Table(
|
|
186
|
+
DWD/Silver → DWS/Gold:Dynamic Table(REFRESH INTERVAL 控制延迟,自动增量)
|
|
177
187
|
DWS → ADS:Dynamic Table 或直接查询
|
|
178
188
|
```
|
|
179
189
|
|
|
180
190
|
加载 `clickzetta-sql-pipeline-manager` 获取 Dynamic Table 详细语法。
|
|
181
191
|
|
|
192
|
+
### 建管分离原则(重要)
|
|
193
|
+
|
|
194
|
+
Studio 任务按类型严格区分,**不同类型任务的调度策略完全不同**:
|
|
195
|
+
|
|
196
|
+
| 任务类型 | 示例 | 调度配置 | 说明 |
|
|
197
|
+
|---|---|---|---|
|
|
198
|
+
| DDL 建表任务 | CREATE TABLE、CREATE SCHEMA | **DRAFT,禁止配 Cron,禁止配依赖** | 一次性执行,手动触发,不参与调度链 |
|
|
199
|
+
| ETL 转换任务 | ODS→DWD 清洗 SQL | 配置 Cron + 依赖上游同步任务 | 周期性执行,构成调度 DAG |
|
|
200
|
+
| 数据同步任务 | MySQL→ODS 整库同步 | 配置 Cron,作为 ETL 任务的上游 | 周期性执行,ETL 任务的触发前提 |
|
|
201
|
+
| DWS/ADS 聚合层 | 指标汇总、报表宽表 | **使用 Dynamic Table,不建调度任务** | 系统自动刷新,额外建任务是冗余计算 |
|
|
202
|
+
|
|
203
|
+
> ⚠️ **常见错误**:为 DDL 任务配置了 Cron,导致建表语句被重复执行,引发 `SCHEDULE_TASK_HAD_CHILDREN_NODES_EXCEPTION` 等调度冲突。DDL 任务完成后应立即降级为 DRAFT 状态。
|
|
204
|
+
|
|
205
|
+
### Studio 任务目录组织规范
|
|
206
|
+
|
|
207
|
+
每个数仓项目在 Studio 中创建独立任务目录,统一管理所有任务资产:
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
<业务域>_dw/ ← 项目任务目录(如 shenyu_gateway_dw)
|
|
211
|
+
├── 00_sync_<source>_to_ods ← 数据同步(Cron,最早执行)
|
|
212
|
+
├── 01_ddl_ods ← ODS 建表(DRAFT,不调度)
|
|
213
|
+
├── 02_ddl_dwd ← DWD 建表(DRAFT,不调度)
|
|
214
|
+
├── 03_ddl_dws_ads ← DWS/ADS 动态表建表(DRAFT,不调度)
|
|
215
|
+
├── 04_transform_ods_to_dwd ← ODS→DWD 清洗(Cron,依赖 00)
|
|
216
|
+
└── 05_dqc_check ← 数据质量检查(Cron,依赖 04,可选)
|
|
217
|
+
(DWS/ADS 层由 Dynamic Table 自动刷新,无需任务)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
> 任务编号规范:`00~` 同步层,`01~03` DDL 层(DRAFT),`04~` ETL 层(调度),DWS/ADS 无任务。
|
|
221
|
+
|
|
182
222
|
### 数据质量卡点
|
|
183
223
|
|
|
184
224
|
| 层次 | 检查重点 | 时机 |
|
|
185
225
|
|---|---|---|
|
|
186
|
-
| ODS/Bronze | NULL 比例、CDC _op
|
|
187
|
-
| DWD/Silver |
|
|
188
|
-
| DWS/Gold/ADS |
|
|
226
|
+
| ODS/Bronze | NULL 比例、CDC _op 分布、行数与源端一致 | 入库后 |
|
|
227
|
+
| DWD/Silver | 唯一性、LEFT JOIN 匹配率(结果行数 ≥ 左表行数)、关键字段非空率 | ETL 后 |
|
|
228
|
+
| DWS/Gold/ADS | 指标环比异常、汇总一致性、Dynamic Table 刷新历史为 SUCCESS | Dynamic Table 刷新后 |
|
|
229
|
+
|
|
230
|
+
> ⚠️ **LEFT JOIN 陷阱**:`LEFT JOIN ... WHERE 右表字段 = 值` 会退化为 INNER JOIN,导致数据丢失。过滤右表字段必须放在 `ON` 子句:`LEFT JOIN ... ON ... AND 右表字段 = 值`。
|
|
231
|
+
|
|
232
|
+
### 交付验证 Checklist
|
|
233
|
+
|
|
234
|
+
方案上线前必须逐项确认:
|
|
235
|
+
|
|
236
|
+
- [ ] 各层行数与预期一致(ODS 行数 ≈ 源端,DWD 行数 ≤ ODS,DWS/ADS 行数符合聚合逻辑)
|
|
237
|
+
- [ ] Dynamic Table 刷新历史显示 `SUCCESS`(`SHOW DYNAMIC TABLE REFRESH HISTORY`)
|
|
238
|
+
- [ ] 关键字段 NULL 率在可接受范围内
|
|
239
|
+
- [ ] LEFT JOIN 结果行数 ≥ 左表行数(否则检查过滤条件是否误放在 WHERE)
|
|
240
|
+
- [ ] DWS/ADS 层无冗余调度任务(Dynamic Table 不需要额外 Cron)
|
|
241
|
+
- [ ] 所有 DDL 任务已降级为 DRAFT 状态
|
|
242
|
+
|
|
243
|
+
验证通过后,如需对整个管道做全面健康检查(调度依赖、DT 反模式、分层跳层等),加载 `clickzetta-pipeline-review` skill。
|
|
189
244
|
|
|
190
245
|
### 调度 DAG
|
|
191
246
|
|
|
192
247
|
```
|
|
193
248
|
日批场景:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
249
|
+
00_sync(Cron 02:00)→ 04_transform(Cron 02:30,依赖 00)→ 05_dqc(可选)
|
|
250
|
+
↓
|
|
251
|
+
DWS/ADS(Dynamic Table 自动刷新,无需调度)
|
|
197
252
|
|
|
198
253
|
实时场景:
|
|
199
|
-
CDC/Kafka 持续写入 Bronze → Silver(
|
|
254
|
+
CDC/Kafka 持续写入 Bronze → Silver(REFRESH INTERVAL 10 MINUTE)→ Gold(REFRESH INTERVAL 1 HOUR)
|
|
200
255
|
```
|
|
201
256
|
|
|
202
257
|
### DDL 模板
|
|
203
258
|
|
|
204
|
-
加载 `clickzetta-sql-syntax-guide` 确认语法,生成各层 DDL
|
|
259
|
+
加载 `clickzetta-sql-syntax-guide` 确认语法,生成各层 DDL。
|
|
260
|
+
|
|
261
|
+
**数仓开发代码资产化原则:每段 SQL 都应保存为 Studio 任务,作为可管理的代码资产。**
|
|
262
|
+
|
|
263
|
+
生成 DDL 后,按以下规范保存为 Studio 任务(先创建任务目录,再逐层保存):
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
# 创建项目任务目录
|
|
267
|
+
cz-cli task folder create <业务域>_dw
|
|
268
|
+
|
|
269
|
+
# 各层 DDL 保存为独立 DRAFT 任务(不配 Cron,不配依赖)
|
|
270
|
+
cz-cli task save-content 01_ddl_ods --content "<ods_ddl_sql>"
|
|
271
|
+
cz-cli task save-content 02_ddl_dwd --content "<dwd_ddl_sql>"
|
|
272
|
+
cz-cli task save-content 03_ddl_dws_ads --content "<dws_ads_ddl_sql>"
|
|
273
|
+
|
|
274
|
+
# ETL 转换 SQL 保存为调度任务(配 Cron + 依赖上游同步任务)
|
|
275
|
+
cz-cli task save-content 04_transform_ods_to_dwd --content "<etl_sql>"
|
|
276
|
+
cz-cli task save-cron 04_transform_ods_to_dwd --cron '0 30 2 * * ? *'
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
> 任务是代码的载体,不只是调度配置。即使是一次性执行的 DDL,也应保存为 DRAFT 任务,方便后续查阅、复用和多环境迁移。
|
|
280
|
+
|
|
281
|
+
**生成 Dynamic Table DDL 前,先确认可用的 GP 型 VCluster:**
|
|
282
|
+
|
|
283
|
+
```sql
|
|
284
|
+
-- 查看所有 VCluster 及状态,找到 type=GENERAL 且 status=RUNNING 的集群
|
|
285
|
+
SHOW VCLUSTERS;
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
- `type = GENERAL`(GP 型)且 `status = RUNNING` → 直接使用该集群名
|
|
289
|
+
- `status = STOPPED` → 先执行 `ALTER VCLUSTER <name> RESUME;` 再建表
|
|
290
|
+
- 无 GP 型集群 → 参考 `clickzetta-vcluster-manager` 创建
|
|
291
|
+
|
|
292
|
+
将查到的集群名替换下方 DDL 中的 `<gp_vcluster_name>`。
|
|
205
293
|
|
|
206
294
|
```sql
|
|
207
295
|
-- ODS/Bronze(以 CDC 接入为例)
|
|
@@ -233,8 +321,7 @@ COMMENT 'DWD 订单事实表,清洗标准化';
|
|
|
233
321
|
|
|
234
322
|
-- DWS/Gold(Dynamic Table,不用物化视图)
|
|
235
323
|
CREATE DYNAMIC TABLE IF NOT EXISTS dws.user_order_daily
|
|
236
|
-
REFRESH
|
|
237
|
-
VCLUSTER default_ap
|
|
324
|
+
REFRESH INTERVAL 1 HOUR vcluster <gp_vcluster_name>
|
|
238
325
|
AS
|
|
239
326
|
SELECT
|
|
240
327
|
user_id,
|
|
@@ -245,6 +332,9 @@ SELECT
|
|
|
245
332
|
FROM dwd.fact_orders
|
|
246
333
|
WHERE status_code = 1
|
|
247
334
|
GROUP BY user_id, order_date;
|
|
335
|
+
|
|
336
|
+
-- 创建后立即执行首次刷新,重置刷新基准时间
|
|
337
|
+
REFRESH DYNAMIC TABLE dws.user_order_daily;
|
|
248
338
|
```
|
|
249
339
|
|
|
250
340
|
---
|
|
@@ -257,3 +347,5 @@ GROUP BY user_id, order_date;
|
|
|
257
347
|
4. **建模和管道一体**——DDL 和管道配置同步输出
|
|
258
348
|
5. **分区用转换函数**:`days(col)` 不是 `col`
|
|
259
349
|
6. **ODS/Bronze 零转换**,保留原始数据方便回溯
|
|
350
|
+
7. **建管分离**——DDL 任务 DRAFT 不调度,DWS/ADS 层不建调度任务
|
|
351
|
+
8. **创建 Dynamic Table 后立即 REFRESH**——重置刷新基准,实现开箱即用
|
|
@@ -14,6 +14,25 @@ description: |
|
|
|
14
14
|
|
|
15
15
|
# Dynamic Table 使用指南 — 目录索引
|
|
16
16
|
|
|
17
|
+
## 前置检查:确认可用的 GP 型 VCluster
|
|
18
|
+
|
|
19
|
+
**创建动态表前必须先确认 VCluster 存在且运行正常。** 动态表的 `vcluster` 参数必须填写实际存在的 GP 型集群名称,写死 `default` 可能导致刷新失败。
|
|
20
|
+
|
|
21
|
+
```sql
|
|
22
|
+
-- 查看所有 VCluster 及其状态
|
|
23
|
+
SHOW VCLUSTERS;
|
|
24
|
+
-- 关注列:name, type, status
|
|
25
|
+
-- type = GENERAL(GP 型,推荐用于动态表)
|
|
26
|
+
-- status = RUNNING(正常)或 STOPPED(已停止,需先启动)
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
根据查询结果:
|
|
30
|
+
- 找到 `type = GENERAL` 且 `status = RUNNING` 的集群名称,用于 `vcluster <name>`
|
|
31
|
+
- 如果目标集群 `status = STOPPED`,先执行:`ALTER VCLUSTER <name> RESUME;`
|
|
32
|
+
- 如果没有 GP 型集群,需先创建:参考 `clickzetta-vcluster-manager` skill
|
|
33
|
+
|
|
34
|
+
> ⚠️ 不要用 `type = ANALYSIS`(AP 型)的集群创建动态表,AP 型不支持小文件合并,长期运行会导致查询性能下降。
|
|
35
|
+
|
|
17
36
|
## 快速入门
|
|
18
37
|
|
|
19
38
|
```sql
|
|
@@ -51,8 +70,44 @@ SHOW TABLES IN silver WHERE table_name = 'orders_daily';
|
|
|
51
70
|
|
|
52
71
|
INTERVAL 支持的单位:`SECOND`、`MINUTE`、`HOUR`、`DAY`,最小值为 1 分钟。
|
|
53
72
|
|
|
54
|
-
>
|
|
55
|
-
|
|
73
|
+
> ⚠️ **VCluster 类型**:始终使用 GP 型集群(`vcluster <gp_vcluster_name>`,集群名以 `SHOW VCLUSTERS` 查到的实际 GP 型集群为准,不要写死 `default`),不要用 AP 型(如 `default_ap`)。AP 型集群不支持小文件合并,长期运行会导致查询性能下降。
|
|
74
|
+
|
|
75
|
+
### ⚠️ 刷新周期的时间基准
|
|
76
|
+
|
|
77
|
+
**`REFRESH INTERVAL N DAY/HOUR` 以动态表的创建时间(或上次刷新时间)为基准计算下次触发时间,不是从零点或整点开始对齐。**
|
|
78
|
+
|
|
79
|
+
例如:动态表在 23:17 创建,设置 `REFRESH INTERVAL 1 DAY`,则后续每次刷新约在 23:17 触发,而不是次日 00:00 或业务期望的 03:00。`START WITH TIMESTAMP` 仅影响首次刷新时间,不改变后续周期的基准。
|
|
80
|
+
|
|
81
|
+
**如需控制刷新时间窗口,有三种方案:**
|
|
82
|
+
|
|
83
|
+
1. **在目标时间点附近创建动态表**(最简单)
|
|
84
|
+
2. **创建后立即执行 `REFRESH` 重置基准**(推荐,见下方最佳实践)
|
|
85
|
+
3. **改用短间隔**(如 `4 HOUR`)减少偏差,业务容忍度允许时可接受
|
|
86
|
+
|
|
87
|
+
### 创建动态表的最佳实践:创建后立即执行首次刷新
|
|
88
|
+
|
|
89
|
+
```sql
|
|
90
|
+
-- ✅ 推荐写法:创建后立即 REFRESH,重置刷新基准时间,实现"开箱即用"
|
|
91
|
+
CREATE DYNAMIC TABLE IF NOT EXISTS dws.user_order_daily
|
|
92
|
+
REFRESH INTERVAL 1 DAY vcluster <gp_vcluster_name>
|
|
93
|
+
AS
|
|
94
|
+
SELECT user_id, DATE(created_at) AS dt, COUNT(*) AS order_cnt
|
|
95
|
+
FROM dwd.fact_orders
|
|
96
|
+
GROUP BY 1, 2;
|
|
97
|
+
|
|
98
|
+
REFRESH DYNAMIC TABLE dws.user_order_daily;
|
|
99
|
+
-- 立即触发首次计算,同时将刷新基准时间重置为当前时刻
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 手动刷新命令
|
|
103
|
+
|
|
104
|
+
```sql
|
|
105
|
+
-- ✅ 推荐:手动触发刷新
|
|
106
|
+
REFRESH DYNAMIC TABLE schema.table_name;
|
|
107
|
+
|
|
108
|
+
-- ⚠️ 不推荐:ALTER DYNAMIC TABLE ... REFRESH 语法不标准,建议使用上面的 REFRESH 命令
|
|
109
|
+
-- ALTER DYNAMIC TABLE schema.table_name REFRESH;
|
|
110
|
+
```
|
|
56
111
|
|
|
57
112
|
### 开启增量刷新的前提
|
|
58
113
|
|
|
@@ -110,3 +165,4 @@ Dynamic Table 最佳实践与避坑指南(维度表 JOIN 场景、性能优化
|
|
|
110
165
|
| 状态表损坏 | 系统异常 | `SET cz.optimizer.incremental.rebuild.rule.based.state.table = true` |
|
|
111
166
|
| 手动 REFRESH 后历史未显示 | 刷新历史有短暂延迟 | 等待几秒后重新查询 `SHOW DYNAMIC TABLE REFRESH HISTORY` |
|
|
112
167
|
| AP 集群刷新后查询变慢 | AP 集群不支持小文件合并 | 改用 GP 型集群(`CREATE OR REPLACE` 重建) |
|
|
168
|
+
| 刷新时间与预期不符(如期望 03:00 实际 23:00) | REFRESH INTERVAL 以创建时间为基准,不对齐整点 | 在目标时间点附近创建 DT,或创建后立即执行 `REFRESH DYNAMIC TABLE` 重置基准 |
|
|
@@ -12,7 +12,7 @@ description: |
|
|
|
12
12
|
## 指令
|
|
13
13
|
|
|
14
14
|
### 步骤 1:确认动态表存在并获取当前定义
|
|
15
|
-
|
|
15
|
+
执行 `SHOW CREATE TABLE schema_name.table_name` 获取动态表当前定义。
|
|
16
16
|
如果不确定是否为动态表,先用 `SHOW TABLES WHERE is_dynamic` 查看列表。
|
|
17
17
|
|
|
18
18
|
### 步骤 2:判断操作类型并选择执行方式
|
|
@@ -69,11 +69,11 @@ ALTER DYNAMIC TABLE dt_name UNSET PROPERTIES('key');
|
|
|
69
69
|
|
|
70
70
|
### 步骤 3:执行 CREATE OR REPLACE 重建(仅 B 类操作)
|
|
71
71
|
|
|
72
|
-
1.
|
|
72
|
+
1. 执行 `SHOW CREATE TABLE schema_name.table_name` 获取原始 DDL
|
|
73
73
|
> ⚠️ `SHOW CREATE TABLE` 不支持 LIMIT/WHERE 子句,直接执行即可
|
|
74
74
|
2. 解析出:列定义、REFRESH 子句、AS SELECT 子句、COMMENT 等
|
|
75
75
|
3. 根据操作修改对应部分
|
|
76
|
-
4.
|
|
76
|
+
4. 执行重建 SQL
|
|
77
77
|
|
|
78
78
|
**关于全量刷新的触发**:
|
|
79
79
|
- 简单的删除列 / 添加列(添加的列只是从源表 SELECT 透传,不参与 JOIN key、GROUP key 等计算)→ **增量刷新**
|
|
@@ -186,5 +186,5 @@ REFRESH DYNAMIC TABLE change_table;
|
|
|
186
186
|
| `UNDROP DYNAMIC TABLE` 报错 | UNDROP 不支持 DYNAMIC TABLE 关键字 | 改为 `UNDROP TABLE dt_name` |
|
|
187
187
|
| `DESC DYNAMIC TABLE ... EXTENDED` 报错 | 不支持 EXTENDED 参数 | 改为 `DESC TABLE dt_name`(不加 EXTENDED) |
|
|
188
188
|
| UPDATE/DELETE 报 "MV__KEY" 相关错误 | 动态表有隐藏列 MV__KEY,默认禁止 DML | 先执行 `SET cz.sql.dt.allow.dml = true;` |
|
|
189
|
-
| CREATE OR REPLACE 后数据为空 | AS SELECT 子句引用的源表或列不正确 |
|
|
189
|
+
| CREATE OR REPLACE 后数据为空 | AS SELECT 子句引用的源表或列不正确 | 先验证 SELECT 子句是否返回数据 |
|
|
190
190
|
| CREATE OR REPLACE 后全量刷新 | 新增列参与了计算逻辑(JOIN key、GROUP key 等) | 预期行为,等待全量刷新完成 |
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-external-catalog
|
|
3
|
+
description: |
|
|
4
|
+
配置和使用 ClickZetta Lakehouse External Catalog,实现对 Hive、Iceberg、Databricks、
|
|
5
|
+
Snowflake Open Catalog 等外部数据源的联邦查询(只读)。覆盖完整创建流程:存储连接 →
|
|
6
|
+
Catalog Connection → External Catalog,以及 SHOW/DESC/查询外部表等操作。
|
|
7
|
+
支持数据源:Hive(OSS/COS/S3/HDFS)、Databricks Unity Catalog、
|
|
8
|
+
Snowflake Open Catalog(通过 Iceberg,OAuth 认证)。
|
|
9
|
+
当用户说"外部数据目录"、"External Catalog"、"联邦查询"、"Hive 联邦"、
|
|
10
|
+
"访问 Hive 数据"、"Databricks 联邦"、"Iceberg 联邦"、"跨数据源查询"、
|
|
11
|
+
"不迁移数据直接查询"、"Catalog Connection"、"Snowflake 联邦"、
|
|
12
|
+
"访问 Snowflake 数据"时触发。
|
|
13
|
+
Keywords: external catalog, Hive, Iceberg, Databricks, Snowflake, federation, read-only
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
# ClickZetta External Catalog
|
|
17
|
+
|
|
18
|
+
> ⚠️ 创建 External Catalog 需要 `instance_admin` 角色。查询权限可通过 GRANT 授予其他用户。
|
|
19
|
+
|
|
20
|
+
阅读 [references/external-catalog-ddl.md](references/external-catalog-ddl.md) 了解完整语法。
|
|
21
|
+
|
|
22
|
+
## 概述
|
|
23
|
+
|
|
24
|
+
External Catalog 让 Lakehouse 可以**不迁移数据**,直接对外部数据系统(Hive、Iceberg、Databricks)执行只读联邦查询。
|
|
25
|
+
|
|
26
|
+
**支持数据源**:Apache Hive · Iceberg REST Catalog · Databricks Unity Catalog
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## 创建流程(三步)
|
|
31
|
+
|
|
32
|
+
### 步骤 1:创建存储连接
|
|
33
|
+
|
|
34
|
+
```sql
|
|
35
|
+
-- 阿里云 OSS
|
|
36
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS catalog_storage_oss
|
|
37
|
+
TYPE OSS
|
|
38
|
+
ACCESS_ID = 'LTAIxxxxxxxxxxxx'
|
|
39
|
+
ACCESS_KEY = 'T8Gexxxxxxmtxxxxxx'
|
|
40
|
+
ENDPOINT = 'oss-cn-hangzhou-internal.aliyuncs.com';
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 步骤 2:创建 Catalog Connection
|
|
44
|
+
|
|
45
|
+
```sql
|
|
46
|
+
CREATE CATALOG CONNECTION IF NOT EXISTS hive_catalog_conn
|
|
47
|
+
TYPE hms
|
|
48
|
+
hive_metastore_uris = 'hms-host:9083'
|
|
49
|
+
storage_connection = 'catalog_storage_oss';
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 步骤 3:创建 External Catalog
|
|
53
|
+
|
|
54
|
+
```sql
|
|
55
|
+
-- ⚠️ CREATE EXTERNAL CATALOG 不支持 COMMENT 子句,加了会报错
|
|
56
|
+
-- ❌ 错误:CREATE EXTERNAL CATALOG my_hive_catalog CONNECTION hive_catalog_conn COMMENT '...';
|
|
57
|
+
-- ✅ 正确:
|
|
58
|
+
CREATE EXTERNAL CATALOG my_hive_catalog
|
|
59
|
+
CONNECTION hive_catalog_conn;
|
|
60
|
+
|
|
61
|
+
-- 如需带选项(如 Iceberg REST):
|
|
62
|
+
CREATE EXTERNAL CATALOG my_iceberg_catalog
|
|
63
|
+
CONNECTION iceberg_conn
|
|
64
|
+
OPTIONS ('key1' = 'value1', 'key2' = 'value2');
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## 验证连通性
|
|
70
|
+
|
|
71
|
+
```sql
|
|
72
|
+
-- 查看 Schema 列表(验证连通)
|
|
73
|
+
SHOW SCHEMAS IN my_hive_catalog;
|
|
74
|
+
|
|
75
|
+
-- 查看表列表
|
|
76
|
+
SHOW TABLES IN my_hive_catalog.my_schema;
|
|
77
|
+
|
|
78
|
+
-- 查询数据
|
|
79
|
+
SELECT * FROM my_hive_catalog.my_schema.my_table LIMIT 10;
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 查看与管理
|
|
85
|
+
|
|
86
|
+
```sql
|
|
87
|
+
-- 列出所有 Catalog
|
|
88
|
+
SHOW CATALOGS;
|
|
89
|
+
|
|
90
|
+
-- 查看 Catalog 详情
|
|
91
|
+
DESC CATALOG my_hive_catalog;
|
|
92
|
+
|
|
93
|
+
-- 查看表结构
|
|
94
|
+
DESC TABLE my_hive_catalog.my_schema.my_table;
|
|
95
|
+
|
|
96
|
+
-- 删除 Catalog
|
|
97
|
+
DROP CATALOG IF EXISTS my_hive_catalog;
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 联邦查询示例
|
|
103
|
+
|
|
104
|
+
```sql
|
|
105
|
+
-- 外部 Hive 表 JOIN 内部 Lakehouse 表
|
|
106
|
+
SELECT h.order_id, h.amount, d.region_name
|
|
107
|
+
FROM my_hive_catalog.sales.orders h
|
|
108
|
+
JOIN public.dim_region d ON h.region_id = d.id
|
|
109
|
+
WHERE h.order_date >= '2024-01-01';
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
⚠️ 必须使用三层命名空间语法:`catalog.schema.table`
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 常见问题
|
|
117
|
+
|
|
118
|
+
| 问题 | 原因 | 解决方案 |
|
|
119
|
+
|---|---|---|
|
|
120
|
+
| 无法连接 HMS | 网络未打通 | 通过 PrivateLink 打通 Lakehouse 与 HMS 服务器网络 |
|
|
121
|
+
| 权限不足 | 非 instance_admin | 联系管理员授予 instance_admin 角色 |
|
|
122
|
+
| 查询报错找不到表 | 未使用三层语法 | 使用 `catalog.schema.table` 格式 |
|
|
123
|
+
| Databricks 连接失败 | 不在同一云平台 | 确保 Databricks 存储与 Lakehouse 在同一云平台 |
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么在 ClickZetta 里直接查询 Hive 的数据?","expected_skill":"clickzetta-external-catalog","expected_output_contains":["External Catalog","CREATE CATALOG CONNECTION"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"怎么创建 External Catalog 连接 Iceberg REST Catalog?","expected_skill":"clickzetta-external-catalog","expected_output_contains":["CREATE EXTERNAL CATALOG","iceberg"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"联邦查询外部表的三层命名空间怎么写?","expected_skill":"clickzetta-external-catalog","expected_output_contains":["catalog.schema.table"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"怎么不迁移数据直接查询 Databricks 的表?","expected_skill":"clickzetta-external-catalog","expected_output_contains":["Databricks","External Catalog"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"创建 External Catalog 需要什么权限?","expected_skill":"clickzetta-external-catalog","expected_output_contains":["instance_admin"]}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# External Catalog 参考
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/external-catalog-summary 等
|
|
4
|
+
|
|
5
|
+
> ⚠️ External Catalog 当前处于公开预览阶段。目前只有 `instance_admin` 角色可以查询 Catalog。
|
|
6
|
+
|
|
7
|
+
## 概述
|
|
8
|
+
|
|
9
|
+
External Catalog 映射外部数据系统(Hive、Iceberg、Databricks)的数据库,使 Lakehouse 可对其执行**只读**联邦查询。
|
|
10
|
+
|
|
11
|
+
**支持的数据源**:
|
|
12
|
+
- Apache Hive(通过 Hive Metastore)
|
|
13
|
+
- Iceberg REST Catalog(如 Snowflake OpenCatalog)
|
|
14
|
+
- Databricks Unity Catalog
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## 创建流程(以 Hive 为例)
|
|
19
|
+
|
|
20
|
+
### 步骤 1:创建存储连接
|
|
21
|
+
|
|
22
|
+
```sql
|
|
23
|
+
-- OSS
|
|
24
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS catalog_storage_oss
|
|
25
|
+
TYPE OSS
|
|
26
|
+
ACCESS_ID = 'LTAIxxxxxxxxxxxx'
|
|
27
|
+
ACCESS_KEY = 'T8Gexxxxxxmtxxxxxx'
|
|
28
|
+
ENDPOINT = 'oss-cn-hangzhou-internal.aliyuncs.com';
|
|
29
|
+
|
|
30
|
+
-- COS
|
|
31
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS catalog_storage_cos
|
|
32
|
+
TYPE COS
|
|
33
|
+
ACCESS_KEY = '<access_key>'
|
|
34
|
+
SECRET_KEY = '<secret_key>'
|
|
35
|
+
REGION = 'ap-shanghai'
|
|
36
|
+
APP_ID = '1310000503';
|
|
37
|
+
|
|
38
|
+
-- S3
|
|
39
|
+
CREATE STORAGE CONNECTION IF NOT EXISTS catalog_storage_s3
|
|
40
|
+
TYPE S3
|
|
41
|
+
ACCESS_KEY = '<access_key>'
|
|
42
|
+
SECRET_KEY = '<secret_key>'
|
|
43
|
+
REGION = 'us-east-1';
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 步骤 2:创建 Catalog Connection
|
|
47
|
+
|
|
48
|
+
```sql
|
|
49
|
+
-- Hive Metastore
|
|
50
|
+
CREATE CATALOG CONNECTION IF NOT EXISTS catalog_api_connection
|
|
51
|
+
TYPE hms
|
|
52
|
+
hive_metastore_uris = 'host:9083'
|
|
53
|
+
storage_connection = 'catalog_storage_oss';
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
参数说明:
|
|
57
|
+
- `type`:连接类型,目前支持 `hms`(Hive Metastore Service)
|
|
58
|
+
- `hive_metastore_uris`:HMS 服务地址,格式 `host:port`,端口通常为 9083
|
|
59
|
+
- `storage_connection`:已创建的存储连接名称
|
|
60
|
+
|
|
61
|
+
### 步骤 3:创建 External Catalog
|
|
62
|
+
|
|
63
|
+
```sql
|
|
64
|
+
CREATE EXTERNAL CATALOG my_external_catalog
|
|
65
|
+
CONNECTION catalog_api_connection;
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 查看 Catalog
|
|
71
|
+
|
|
72
|
+
```sql
|
|
73
|
+
-- 列出所有 Catalog
|
|
74
|
+
SHOW CATALOGS;
|
|
75
|
+
|
|
76
|
+
-- 查看 Catalog 详情
|
|
77
|
+
DESC CATALOG my_external_catalog;
|
|
78
|
+
DESC CATALOG EXTENDED my_external_catalog;
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## 查看 Catalog 下的对象
|
|
84
|
+
|
|
85
|
+
```sql
|
|
86
|
+
-- 查看 Schema 列表
|
|
87
|
+
SHOW SCHEMAS IN my_external_catalog;
|
|
88
|
+
|
|
89
|
+
-- 查看 Schema 列表(含类型:managed/external)
|
|
90
|
+
SHOW SCHEMAS EXTENDED IN my_external_catalog;
|
|
91
|
+
|
|
92
|
+
-- 查看表列表
|
|
93
|
+
SHOW TABLES IN my_external_catalog.my_schema;
|
|
94
|
+
|
|
95
|
+
-- 查看表结构
|
|
96
|
+
DESC TABLE my_external_catalog.my_schema.my_table;
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 查询外部数据
|
|
102
|
+
|
|
103
|
+
```sql
|
|
104
|
+
-- 三层命名空间语法(必须)
|
|
105
|
+
SELECT * FROM my_external_catalog.my_schema.my_table;
|
|
106
|
+
|
|
107
|
+
-- 联邦查询(外部表 JOIN 内部表)
|
|
108
|
+
SELECT e.*, i.region
|
|
109
|
+
FROM my_external_catalog.hive_schema.orders e
|
|
110
|
+
JOIN public.dim_region i ON e.region_id = i.id;
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
⚠️ 查询 External Catalog 下的表**必须**使用三层结构语法(catalog.schema.table),不支持 `USE` 切换 catalog。
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## 删除 Catalog
|
|
118
|
+
|
|
119
|
+
```sql
|
|
120
|
+
DROP CATALOG IF EXISTS my_external_catalog;
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 注意事项
|
|
126
|
+
|
|
127
|
+
- External Catalog 为**只读**,不支持写入操作
|
|
128
|
+
- HMS 所在服务器网络需与 Lakehouse 打通(可通过 PrivateLink 实现)
|
|
129
|
+
- 目前只有 `instance_admin` 角色可以创建和查询 External Catalog
|
|
130
|
+
- Databricks Unity Catalog 要求与 Lakehouse 在同一云平台(如同在 AWS 上)
|