@clickzetta/cz-cli-darwin-arm64 0.3.40 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +31 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +28 -30
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +39 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
|
@@ -1,36 +1,14 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: clickzetta-metadata
|
|
3
3
|
description: |
|
|
4
|
-
查询 ClickZetta Lakehouse
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
覆盖所有 SHOW 命令(TABLES/SCHEMAS/CATALOGS/COLUMNS/VOLUMES/CONNECTIONS/JOBS/VCLUSTERS/
|
|
13
|
-
PIPES/SHARES/USERS/ROLES/GRANTS/FUNCTIONS/TABLE STREAMS/PARTITIONS/SYNONYMS/INDEX/
|
|
14
|
-
DYNAMIC TABLE REFRESH HISTORY/TABLES HISTORY),所有 DESC 命令(TABLE/SCHEMA/HISTORY/
|
|
15
|
-
VCLUSTER/VOLUME/CONNECTION/FUNCTION/VIEW/DYNAMIC TABLE/SHARE/INDEX/TABLE STREAM),
|
|
16
|
-
SHOW CREATE TABLE,load_history(),FROM (SHOW ...) 子查询,上下文函数,
|
|
17
|
-
以及 INFORMATION_SCHEMA 空间级和实例级视图(TABLES/COLUMNS/JOB_HISTORY/USERS/ROLES/
|
|
18
|
-
VOLUMES/CONNECTIONS/MATERIALIZED_VIEW_REFRESH_HISTORY/STORAGE_METERING/INSTANCE_USAGE 等)。
|
|
19
|
-
|
|
20
|
-
当用户说"查看表列表"、"查看字段"、"查看字段信息"、"查看作业"、"查看作业历史"、
|
|
21
|
-
"查看 JOB 历史"、"SHOW TABLES"、"DESC TABLE"、"查看分区"、"查看历史版本"、
|
|
22
|
-
"查看删除的表"、"查看导入历史"、"load_history"、"SHOW JOBS"、"查看集群状态"、
|
|
23
|
-
"查看连接"、"查看权限"、"SHOW GRANTS"、"查看函数"、"查看 Volume"、
|
|
24
|
-
"查看 Volume 列表"、"查看 Share"、"查看 Catalog"、"查看慢查询"、
|
|
25
|
-
"查看 CRU 消耗"、"费用分析"、"成本分析"、"计算费用"、"存储费用"、
|
|
26
|
-
"用量统计"、"成本归因"、"哪个用户消耗最多"、"存储用量排行"、
|
|
27
|
-
"查看用户列表"、"查看角色"、"查看 Connection"、"查看物化视图刷新历史"、
|
|
28
|
-
"元数据查询"、"information_schema"、"查看所有表"、"查看 Schema 列表"、
|
|
29
|
-
"统计存储用量"、"SHOW/DESC 和 information_schema 哪个更快"时触发。
|
|
30
|
-
|
|
31
|
-
注意:本 skill 仅覆盖元数据的只读查询(SHOW/DESC/information_schema)。
|
|
32
|
-
权限变更(GRANT/REVOKE/创建用户/角色管理/数据脱敏)请使用 clickzetta-access-control skill。
|
|
33
|
-
Keywords: SHOW, DESC, DESCRIBE, metadata, load_history, information_schema, table info, column info, job history, system view, cost analysis, CRU
|
|
4
|
+
查询 ClickZetta Lakehouse 元数据,覆盖两种方式:
|
|
5
|
+
SHOW/DESC 命令族(实时,适合单个对象即时查询)和
|
|
6
|
+
INFORMATION_SCHEMA 视图(支持复杂 SQL 分析、费用归因、跨对象统计)。
|
|
7
|
+
当用户说"查看表列表"、"查看字段"、"查看作业历史"、"SHOW TABLES"、
|
|
8
|
+
"DESC TABLE"、"查看分区"、"查看权限"、"SHOW GRANTS"、"查看 Volume"、
|
|
9
|
+
"费用分析"、"成本归因"、"用量统计"、"元数据查询"、"information_schema"时触发。
|
|
10
|
+
注意:本 skill 仅覆盖只读元数据查询;权限变更请使用 clickzetta-access-control。
|
|
11
|
+
Keywords: SHOW, DESC, metadata, load_history, information_schema, job history, cost analysis, CRU
|
|
34
12
|
---
|
|
35
13
|
|
|
36
14
|
# ClickZetta 元数据查询指南
|
|
@@ -72,6 +50,26 @@ cz-cli sql "SELECT * FROM load_history('my_schema.my_table') LIMIT 20" --sync -o
|
|
|
72
50
|
|
|
73
51
|
---
|
|
74
52
|
|
|
53
|
+
## 支持的命令与视图全览
|
|
54
|
+
|
|
55
|
+
**SHOW 命令**:TABLES / SCHEMAS / CATALOGS / COLUMNS / VOLUMES / CONNECTIONS / JOBS / VCLUSTERS /
|
|
56
|
+
PIPES / SHARES / USERS / ROLES / GRANTS / FUNCTIONS / TABLE STREAMS / PARTITIONS / SYNONYMS / INDEX /
|
|
57
|
+
DYNAMIC TABLE REFRESH HISTORY / TABLES HISTORY
|
|
58
|
+
|
|
59
|
+
**DESC 命令**:TABLE / SCHEMA / HISTORY / VCLUSTER / VOLUME / CONNECTION / FUNCTION / VIEW /
|
|
60
|
+
DYNAMIC TABLE / SHARE / INDEX / TABLE STREAM
|
|
61
|
+
|
|
62
|
+
**其他**:SHOW CREATE TABLE、load_history()、FROM (SHOW ...) 子查询、上下文函数
|
|
63
|
+
|
|
64
|
+
**INFORMATION_SCHEMA 视图**(空间级):TABLES / COLUMNS / JOB_HISTORY / USERS / ROLES /
|
|
65
|
+
VOLUMES / CONNECTIONS / MATERIALIZED_VIEW_REFRESH_HISTORY / AUTOMV_REFRESH_HISTORY / SORTKEY_CANDIDATES
|
|
66
|
+
|
|
67
|
+
**INFORMATION_SCHEMA 视图**(实例级,需 INSTANCE ADMIN):WORKSPACES / SCHEMAS / TABLES / COLUMNS /
|
|
68
|
+
VIEWS / USERS / ROLES / JOB_HISTORY / VOLUMES / CONNECTIONS / OBJECT_PRIVILEGES /
|
|
69
|
+
STORAGE_METERING / INSTANCE_USAGE
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
75
73
|
## 参考文档
|
|
76
74
|
|
|
77
75
|
- [SHOW/DESC 完整语法](references/show-desc-reference.md)
|
|
@@ -13,6 +13,45 @@ description: |
|
|
|
13
13
|
|
|
14
14
|
# 对象存储数据管道搭建工作流
|
|
15
15
|
|
|
16
|
+
## 向导:收集必要信息
|
|
17
|
+
|
|
18
|
+
开始搭建对象存储管道前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
question({
|
|
22
|
+
questions: [
|
|
23
|
+
{
|
|
24
|
+
question: "云平台?",
|
|
25
|
+
options: [
|
|
26
|
+
{ label: "阿里云 OSS", description: "支持 LIST_PURGE 和 EVENT_NOTIFICATION 两种模式" },
|
|
27
|
+
{ label: "AWS S3", description: "支持 LIST_PURGE 和 EVENT_NOTIFICATION 两种模式" },
|
|
28
|
+
{ label: "腾讯云 COS", description: "仅支持 LIST_PURGE 模式" }
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
question: "导入模式?",
|
|
33
|
+
options: [
|
|
34
|
+
{ label: "持续导入(PIPE)", description: "新文件自动触发导入,近实时" },
|
|
35
|
+
{ label: "批量一次性导入", description: "手动或定时执行 COPY INTO" }
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
question: "文件格式?",
|
|
40
|
+
options: [
|
|
41
|
+
{ label: "CSV", description: "逗号分隔文本" },
|
|
42
|
+
{ label: "JSON / JSONL", description: "JSON 或换行分隔 JSON" },
|
|
43
|
+
{ label: "Parquet", description: "列式存储格式" },
|
|
44
|
+
{ label: "ORC", description: "列式存储格式" }
|
|
45
|
+
]
|
|
46
|
+
}
|
|
47
|
+
]
|
|
48
|
+
})
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
16
55
|
## 适用场景
|
|
17
56
|
|
|
18
57
|
- 从对象存储(阿里云 OSS / AWS S3 / 腾讯云 COS)持续自动导入数据到 Lakehouse(PIPE 模式)
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-pipeline-review
|
|
3
|
+
description: |
|
|
4
|
+
对 ClickZetta Lakehouse 数据管道进行全面 Review 与诊断。从任意入口(任务名/schema/表名/
|
|
5
|
+
业务域关键词)出发,自主发现管道涉及的全部对象(Studio 任务、Lakehouse 表、管道对象、
|
|
6
|
+
运行记录),识别调度依赖缺失、DDL 幂等问题、分层跳层、DT 反模式等常见问题,
|
|
7
|
+
给出优先级排序的修复建议并执行。
|
|
8
|
+
当用户说"Review 管道"、"检查数据管道"、"管道诊断"、"管道有问题"、"任务跑失败了"、
|
|
9
|
+
"数据不对"、"管道 Review"、"pipeline review"、"检查 ETL"、"管道健康检查"、
|
|
10
|
+
"数据链路检查"、"管道全貌"、"管道梳理"时触发。
|
|
11
|
+
Keywords: pipeline review, diagnosis, task dependency, data lineage, DT health, pipeline discovery
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ClickZetta 数据管道 Review 指南
|
|
15
|
+
|
|
16
|
+
## 向导:收集必要信息
|
|
17
|
+
|
|
18
|
+
收到 Review 请求后,**不要立即开始探索**。先通过向导收集必要信息,再启动五阶段流程。
|
|
19
|
+
|
|
20
|
+
### 第 0 步:信息收集(信息不足时必须完成,不得跳过)
|
|
21
|
+
|
|
22
|
+
优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
question({
|
|
26
|
+
questions: [
|
|
27
|
+
{
|
|
28
|
+
question: "Review 范围?",
|
|
29
|
+
options: [
|
|
30
|
+
{ label: "全量 Review", description: "发现所有问题,给出完整报告" },
|
|
31
|
+
{ label: "专项诊断", description: "只看某类问题,如任务依赖、DT 刷新失败、数据不一致" },
|
|
32
|
+
{ label: "快速健康检查", description: "只看 P0 问题,5 分钟内出结论" }
|
|
33
|
+
]
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
question: "执行权限?",
|
|
37
|
+
options: [
|
|
38
|
+
{ label: "可读写", description: "可以执行修复操作(推荐)" },
|
|
39
|
+
{ label: "只读", description: "只能查,输出报告不执行修复" }
|
|
40
|
+
]
|
|
41
|
+
}
|
|
42
|
+
]
|
|
43
|
+
})
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
管道入口(业务域/任务名/schema/表名)和已知症状可在用户回答后追问,或从上下文推断。
|
|
47
|
+
|
|
48
|
+
### 根据回答调整策略
|
|
49
|
+
|
|
50
|
+
| Review 范围 | 执行权限 | 策略 |
|
|
51
|
+
|---|---|---|
|
|
52
|
+
| 全量 Review | 可读写 | 走完五阶段,发现问题后询问是否执行修复 |
|
|
53
|
+
| 全量 Review | 只读 | 走完五阶段,输出问题报告,不执行修复 |
|
|
54
|
+
| 专项诊断 | 任意 | 只执行对应阶段的检查项,跳过无关步骤 |
|
|
55
|
+
| 快速健康检查 | 任意 | 只检查 P0 问题(依赖缺失、DT 持续失败),5 分钟内出结论 |
|
|
56
|
+
|
|
57
|
+
**如果用户已经在请求中提供了足够信息(如"帮我 Review shenyu_gateway 管道,全量 Review,可以修复"),直接进入第一阶段,不再重复询问。**
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 工作模式:五阶段 Review 流程
|
|
62
|
+
|
|
63
|
+
收集到必要信息后,按以下五阶段执行:
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
发现 → 分析 → 识别问题 → 执行修复 → 验证
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## 第一阶段:发现(管道全貌探索)
|
|
72
|
+
|
|
73
|
+
### 入口识别
|
|
74
|
+
|
|
75
|
+
用户可能从任意层给出入口,从入口向上下游展开:
|
|
76
|
+
|
|
77
|
+
| 用户给出的入口 | 展开方向 |
|
|
78
|
+
|---|---|
|
|
79
|
+
| 业务域关键词(如"shenyu_gateway") | 同时搜索 Studio 任务和 Lakehouse schema |
|
|
80
|
+
| Studio 任务名/目录 | 读任务脚本 → 找涉及的表 → 找上下游任务 |
|
|
81
|
+
| Lakehouse 表名/schema | 找写入该表的任务 → 找读取该表的 DT/任务 |
|
|
82
|
+
| 管道对象(Pipe/DT/Stream) | 找源表和目标表 → 找关联任务 |
|
|
83
|
+
| 错误信息/运行 ID | 先定位任务 → 再展开全貌 |
|
|
84
|
+
|
|
85
|
+
### 探索四层
|
|
86
|
+
|
|
87
|
+
**无论入口是什么,都要探索以下四层,缺一不可:**
|
|
88
|
+
|
|
89
|
+
**层 1 — Studio 任务层**
|
|
90
|
+
```bash
|
|
91
|
+
# 按业务域关键词找任务目录
|
|
92
|
+
cz-cli task list-folders
|
|
93
|
+
|
|
94
|
+
# 列出目录下所有任务
|
|
95
|
+
cz-cli task list --folder <folder>
|
|
96
|
+
|
|
97
|
+
# 读每个任务的脚本和配置(重点看:task_type、cron_express、task_dependencies、edit_state)
|
|
98
|
+
cz-cli task content <task_id>
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**层 2 — Lakehouse 对象层**
|
|
102
|
+
```sql
|
|
103
|
+
-- 找相关 schema
|
|
104
|
+
SHOW SCHEMAS;
|
|
105
|
+
|
|
106
|
+
-- 列出各层表
|
|
107
|
+
SHOW TABLES IN <ods_schema>;
|
|
108
|
+
SHOW TABLES IN <dwd_schema>;
|
|
109
|
+
SHOW TABLES IN <dws_schema>;
|
|
110
|
+
SHOW TABLES IN <ads_schema>;
|
|
111
|
+
|
|
112
|
+
-- 找 Dynamic Table
|
|
113
|
+
SHOW TABLES IN <schema> WHERE is_dynamic;
|
|
114
|
+
|
|
115
|
+
-- 找 Pipe
|
|
116
|
+
SHOW PIPES;
|
|
117
|
+
|
|
118
|
+
-- 找 Table Stream
|
|
119
|
+
SHOW TABLE STREAMS;
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
**层 3 — 运行记录层**(配置是"应该怎样",运行记录是"实际怎样")
|
|
123
|
+
```bash
|
|
124
|
+
# 查每个关键任务的最近运行记录
|
|
125
|
+
cz-cli runs list --task <task_name> --limit 10
|
|
126
|
+
|
|
127
|
+
# 发现失败时查日志
|
|
128
|
+
cz-cli runs logs <run_id>
|
|
129
|
+
|
|
130
|
+
# 查运行统计(成功率、平均耗时)
|
|
131
|
+
cz-cli runs stats --task <task_name>
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**层 4 — 管道对象状态层**
|
|
135
|
+
```sql
|
|
136
|
+
-- Dynamic Table 刷新历史(每张 DT 都要查)
|
|
137
|
+
SHOW DYNAMIC TABLE REFRESH HISTORY <schema>.<table> LIMIT 10;
|
|
138
|
+
|
|
139
|
+
-- Pipe 状态
|
|
140
|
+
DESC PIPE <pipe_name>;
|
|
141
|
+
|
|
142
|
+
-- Table Stream 积压
|
|
143
|
+
SELECT COUNT(*) FROM <stream_name>;
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### 发现阶段输出
|
|
147
|
+
|
|
148
|
+
完成四层探索后,向用户呈现管道全貌摘要:
|
|
149
|
+
```
|
|
150
|
+
管道全貌:
|
|
151
|
+
- Studio 任务:N 个(列出名称、类型、状态、cron)
|
|
152
|
+
- ODS 层:N 张表
|
|
153
|
+
- DWD 层:N 张表
|
|
154
|
+
- DWS/ADS 层:N 张 Dynamic Table
|
|
155
|
+
- 管道对象:Pipe × N,Table Stream × N
|
|
156
|
+
- 运行记录:最近 N 次,成功率 X%
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## 第二阶段:分析(深度读取)
|
|
162
|
+
|
|
163
|
+
发现阶段只是"找到了什么",分析阶段要"读懂内容":
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
# 读每个任务的完整脚本
|
|
167
|
+
cz-cli task content <task_id>
|
|
168
|
+
|
|
169
|
+
# 重点关注:
|
|
170
|
+
# - task_dependencies:是否配置了上下游依赖
|
|
171
|
+
# - cron_express:调度时间是否合理
|
|
172
|
+
# - edit_state:20=DRAFT,30=PUBLISHED
|
|
173
|
+
# - task_type:SQL任务/同步任务/实时同步
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**同步任务运行模式判断(不能只看单一字段):**
|
|
177
|
+
|
|
178
|
+
| 字段 | 不能单独判断 | 需要综合判断 |
|
|
179
|
+
|---|---|---|
|
|
180
|
+
| `readMode: BINLOG` | ❌ 不代表一定是 CDC 实时同步 | 还需看 cron_express、pkWriteMode、运行记录 |
|
|
181
|
+
| `pkWriteMode: OVERWRITE` | 覆盖写 → 离线批量 | 结合 cron 和运行记录确认 |
|
|
182
|
+
| 运行记录只有 1 条手动触发 | → 定时调度可能未生效 | 需确认 cron 是否正常触发 |
|
|
183
|
+
|
|
184
|
+
**综合判断规则**:
|
|
185
|
+
- `cron_express` 有值 + `pkWriteMode: OVERWRITE` + 运行记录为定时触发 → **离线批量同步**
|
|
186
|
+
- `cron_express` 为空 + 任务持续运行状态 → **实时同步(CDC/Kafka)**
|
|
187
|
+
- 运行记录全是手动触发 → **调度未生效,需排查**
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## 第三阶段:识别问题
|
|
192
|
+
|
|
193
|
+
### 检查清单(按优先级)
|
|
194
|
+
|
|
195
|
+
**🔴 P0 — 调度依赖缺失**
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
# 检查每个 ETL/转换任务的依赖配置
|
|
199
|
+
cz-cli task content <task_id>
|
|
200
|
+
# 查看 task_dependencies 字段是否为空数组 []
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
- ETL 转换任务的 `task_dependencies` 为空 → **P0,必须修复**
|
|
204
|
+
- 上游同步任务未完成时下游就开始执行 → 读到旧数据或空数据
|
|
205
|
+
- 运行记录时间线混乱(多次手动触发、时间间隔异常)→ 依赖缺失的典型症状
|
|
206
|
+
|
|
207
|
+
**🔴 P0 — Dynamic Table 刷新持续失败**
|
|
208
|
+
|
|
209
|
+
```sql
|
|
210
|
+
SHOW DYNAMIC TABLE REFRESH HISTORY <schema>.<table> LIMIT 10;
|
|
211
|
+
-- status 连续出现 FAILED → P0
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
**🟡 P1 — DDL 幂等性问题**
|
|
215
|
+
|
|
216
|
+
Dynamic Table 的 DDL 应优先使用 `CREATE OR REPLACE`,避免用 `DROP + CREATE` 两步(仅当下文所述的字段类型变更无法用 `CAST` 兼容时才作为兜底):
|
|
217
|
+
- `DROP` 和 `CREATE` 之间存在竞态条件:两步之间表短暂不存在,此时读取该表的下游查询可能失败
|
|
218
|
+
- 如果 `CREATE` 失败,表已被删除,数据丢失
|
|
219
|
+
|
|
220
|
+
```sql
|
|
221
|
+
-- ❌ 有竞态风险
|
|
222
|
+
DROP DYNAMIC TABLE IF EXISTS schema.table;
|
|
223
|
+
CREATE DYNAMIC TABLE schema.table ...;
|
|
224
|
+
|
|
225
|
+
-- ✅ 原子操作
|
|
226
|
+
CREATE OR REPLACE DYNAMIC TABLE schema.table ...;
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
> ⚠️ `CREATE OR REPLACE` 有类型变更限制:字段类型变更(如 `TINYINT → BOOLEAN`)会报错。
|
|
230
|
+
> 解决方案:用 `CAST(col AS TINYINT)` 保持类型兼容,或先 `DROP` 再 `CREATE`。
|
|
231
|
+
|
|
232
|
+
**🟡 P1 — DWS 层跳过 DWD 直接读 ODS**
|
|
233
|
+
|
|
234
|
+
```sql
|
|
235
|
+
-- 检查 DWS 层 DT 的 SQL 定义,看 FROM 子句引用的是哪一层
|
|
236
|
+
SHOW CREATE TABLE <dws_schema>.<table>;
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
- DWS 层应从 DWD 层读取,不应直接读 ODS
|
|
240
|
+
- 跳层问题:重复计算(DWD 已做的 JSON 解析/类型转换在 DWS 又做一遍)、口径不一致、维护成本高
|
|
241
|
+
|
|
242
|
+
**🟡 P1 — Dynamic Table 定义中包含 ORDER BY**
|
|
243
|
+
|
|
244
|
+
```sql
|
|
245
|
+
-- 查看 DT 定义
|
|
246
|
+
SHOW CREATE TABLE <schema>.<dt_name>;
|
|
247
|
+
-- 如果 AS 子句中有 ORDER BY → 需要移除
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
- DT 定义中的 `ORDER BY` 不影响存储顺序;排序只在查询时才有意义
|
|
251
|
+
- 每次刷新额外消耗计算资源做排序,无实际收益
|
|
252
|
+
- 排序逻辑应放在查询端(BI 工具或下游 SQL)
|
|
253
|
+
|
|
254
|
+
**🟢 P2 — DDL 任务保留 Cron 配置**
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
cz-cli task content <ddl_task_id>
|
|
258
|
+
# edit_state=20(DRAFT)但 cron_express 不为空 → P2
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
- DRAFT 状态不会实际执行,但保留 Cron 配置容易误导维护者
|
|
262
|
+
- 建议清理,非紧急
|
|
263
|
+
|
|
264
|
+
**🟢 P2 — Studio 任务脚本与实际 DT 定义不一致**
|
|
265
|
+
|
|
266
|
+
直接通过 SQL 重建 DT 后,Studio 任务脚本不会自动同步:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
# 检查:读 Studio 任务脚本
|
|
270
|
+
cz-cli task content <task_id>
|
|
271
|
+
|
|
272
|
+
# 对比:读实际 DT 定义(执行以下 SQL)
|
|
273
|
+
# SHOW CREATE TABLE <schema>.<table>
|
|
274
|
+
|
|
275
|
+
# 如果不一致,同步 Studio 任务脚本
|
|
276
|
+
cz-cli task save-content <task_id> --content "<new_sql>"
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## 第四阶段:执行修复
|
|
282
|
+
|
|
283
|
+
### 修复依赖配置
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
# 为 ETL 任务配置上游依赖
|
|
287
|
+
cz-cli task save-config <task_id> --deps replace \
|
|
288
|
+
--dep-tasks '[{"taskId":<upstream_id>,"taskName":"<upstream_name>"}]'
|
|
289
|
+
|
|
290
|
+
# 部署生效
|
|
291
|
+
cz-cli task deploy <task_id> -y
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### 修复 DT DDL(统一为 CREATE OR REPLACE)
|
|
295
|
+
|
|
296
|
+
```sql
|
|
297
|
+
-- 先确认字段类型,避免类型变更报错
|
|
298
|
+
SHOW CREATE TABLE <schema>.<table>;
|
|
299
|
+
|
|
300
|
+
-- 执行重建(如有类型变更,用 CAST 保持兼容)
|
|
301
|
+
CREATE OR REPLACE DYNAMIC TABLE <schema>.<table>
|
|
302
|
+
REFRESH INTERVAL <n> <unit> vcluster <gp_cluster>
|
|
303
|
+
AS
|
|
304
|
+
SELECT ...
|
|
305
|
+
FROM <dwd_schema>.<table> -- 确保从 DWD 层读取,不跳层
|
|
306
|
+
...; -- 移除 ORDER BY
|
|
307
|
+
|
|
308
|
+
-- 立即触发首次刷新
|
|
309
|
+
REFRESH DYNAMIC TABLE <schema>.<table>;
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### 同步 Studio 任务脚本
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
# SQL 重建 DT 后,同步 Studio 任务脚本保持一致
|
|
316
|
+
cz-cli task save-content <task_id> --content "<updated_sql>"
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### 执行原则
|
|
320
|
+
|
|
321
|
+
- **直接 SQL 操作**(重建 DT、修改表结构)→ 执行对应 SQL,执行前向用户确认
|
|
322
|
+
- **Studio 任务配置**(依赖、Cron、脚本)→ 用 `cz-cli task save-*` + `deploy`
|
|
323
|
+
- **两者都改时**:先改 SQL(数据层),再同步 Studio(配置层)
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## 第五阶段:验证
|
|
328
|
+
|
|
329
|
+
修复完成后,**逐项验证**,不跳过:
|
|
330
|
+
|
|
331
|
+
```sql
|
|
332
|
+
-- 1. Dynamic Table 刷新状态
|
|
333
|
+
SHOW DYNAMIC TABLE REFRESH HISTORY <schema>.<table> LIMIT 5;
|
|
334
|
+
-- 确认最近一次 status = SUCCESS
|
|
335
|
+
|
|
336
|
+
-- 2. 各层行数
|
|
337
|
+
SELECT COUNT(*) FROM <ods_schema>.<table>;
|
|
338
|
+
SELECT COUNT(*) FROM <dwd_schema>.<table>;
|
|
339
|
+
SELECT COUNT(*) FROM <dws_schema>.<table>;
|
|
340
|
+
|
|
341
|
+
-- 3. 关键字段非空率
|
|
342
|
+
SELECT ROUND(COUNT(key_field) * 100.0 / COUNT(*), 2) AS non_null_pct
|
|
343
|
+
FROM <schema>.<table>;
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
# 4. 确认任务依赖已生效
|
|
348
|
+
cz-cli task content <task_id>
|
|
349
|
+
# 查看 task_dependencies 不再为空
|
|
350
|
+
|
|
351
|
+
# 5. 确认 Studio 任务脚本已同步
|
|
352
|
+
cz-cli task content <task_id>
|
|
353
|
+
# 对比脚本内容与实际 DT 定义一致
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
向用户输出 Review 结论:
|
|
357
|
+
```
|
|
358
|
+
Review 结论:
|
|
359
|
+
- 发现问题:P0 × N,P1 × N,P2 × N
|
|
360
|
+
- 已修复:(列出每项)
|
|
361
|
+
- 未修复/建议:(列出每项及原因)
|
|
362
|
+
- 验证结果:各层行数、DT 刷新状态
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## 常见问题速查
|
|
368
|
+
|
|
369
|
+
| 现象 | 根因 | 排查命令 |
|
|
370
|
+
|---|---|---|
|
|
371
|
+
| ETL 任务读到旧数据 | 依赖缺失,上游未完成就开始执行 | `cz-cli task content` 查 task_dependencies |
|
|
372
|
+
| 运行记录时间线混乱 | 依赖缺失,多次手动触发 | `cz-cli runs list` 看触发方式 |
|
|
373
|
+
| DT 刷新报"表已存在" | DROP+CREATE 竞态,或 CREATE OR REPLACE 类型冲突 | `SHOW CREATE TABLE` 确认字段类型 |
|
|
374
|
+
| DT 刷新时间与预期不符 | REFRESH INTERVAL 以创建时间为基准,不对齐整点 | 创建后立即执行 `REFRESH DYNAMIC TABLE` |
|
|
375
|
+
| Studio 脚本与实际 DT 不一致 | 直接 SQL 重建后未同步 Studio | `cz-cli task save-content` 同步 |
|
|
376
|
+
| 同步任务被误判为 CDC,实为离线批量同步 | 只看 readMode 字段,未综合判断 | 结合 cron、pkWriteMode、运行记录综合判断 |
|
|
377
|
+
| DWS 数据与 DWD 口径不一致 | DWS 跳层读 ODS,重复计算 | `SHOW CREATE TABLE` 检查 FROM 子句 |
|