@clickzetta/cz-cli-darwin-arm64 0.3.81 → 0.3.83
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-access-control/LICENSE +16 -0
- package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
- package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
- package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
- package/bin/skills/clickzetta-app-python-sdk/LICENSE +16 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +227 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/LICENSE +16 -0
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/LICENSE +16 -0
- package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
- package/bin/skills/clickzetta-data-science/LICENSE +16 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/LICENSE +16 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dba-guide/LICENSE +16 -0
- package/bin/skills/clickzetta-dba-guide/SKILL.md +542 -0
- package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-dw-modeling/LICENSE +16 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +351 -0
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
- package/bin/skills/clickzetta-dynamic-table/LICENSE +16 -0
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +230 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +253 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +135 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +15 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +185 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +427 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +260 -0
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +80 -0
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +190 -0
- package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +27 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +118 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +225 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +182 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +98 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +76 -0
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +109 -0
- package/bin/skills/clickzetta-external-catalog/LICENSE +16 -0
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-external-function/LICENSE +16 -0
- package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
- package/bin/skills/clickzetta-file-import-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +190 -0
- package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
- package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
- package/bin/skills/clickzetta-java-sdk/LICENSE +16 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +769 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
- package/bin/skills/clickzetta-lakehouse-connect/LICENSE +16 -0
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
- package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
- package/bin/skills/clickzetta-manage-comments/LICENSE +16 -0
- package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
- package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-metadata/LICENSE +16 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +502 -0
- package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
- package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
- package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
- package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
- package/bin/skills/clickzetta-monitoring/LICENSE +16 -0
- package/bin/skills/clickzetta-monitoring/SKILL.md +215 -0
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +562 -0
- package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/LICENSE +16 -0
- package/bin/skills/clickzetta-overview/SKILL.md +102 -0
- package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
- package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
- package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
- package/bin/skills/clickzetta-pipeline-review/LICENSE +16 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-query-optimizer/LICENSE +16 -0
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/LICENSE +16 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/LICENSE +16 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +485 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
- package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +16 -0
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/LICENSE +16 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +16 -0
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
- package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-vcluster-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
- package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
- package/bin/skills/clickzetta-volume-manager/LICENSE +16 -0
- package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
- package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
- package/bin/skills/clickzetta-zettapark/LICENSE +16 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/bin/skills/cz-cli/SKILL.md +313 -0
- package/bin/skills/cz-cli/references/profile-setup.md +120 -0
- package/package.json +1 -1
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# 数据科学统计分析函数参考
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## 近似聚合函数(大表高效统计)
|
|
6
|
+
|
|
7
|
+
### approx_count_distinct — 近似 UV
|
|
8
|
+
|
|
9
|
+
```sql
|
|
10
|
+
-- 使用 HyperLogLog 算法,误差约 2%,比 COUNT(DISTINCT) 快 10x+
|
|
11
|
+
SELECT approx_count_distinct(user_id) AS approx_uv
|
|
12
|
+
FROM my_schema.events;
|
|
13
|
+
|
|
14
|
+
-- 按天统计 DAU
|
|
15
|
+
SELECT
|
|
16
|
+
DATE(event_time) AS dt,
|
|
17
|
+
approx_count_distinct(user_id) AS dau
|
|
18
|
+
FROM my_schema.events
|
|
19
|
+
GROUP BY 1
|
|
20
|
+
ORDER BY 1;
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### approx_percentile — 近似分位数
|
|
24
|
+
|
|
25
|
+
```sql
|
|
26
|
+
-- 中位数、四分位数、P95、P99
|
|
27
|
+
SELECT
|
|
28
|
+
approx_percentile(amount, 0.25) AS p25,
|
|
29
|
+
approx_percentile(amount, 0.50) AS median,
|
|
30
|
+
approx_percentile(amount, 0.75) AS p75,
|
|
31
|
+
approx_percentile(amount, 0.95) AS p95,
|
|
32
|
+
approx_percentile(amount, 0.99) AS p99
|
|
33
|
+
FROM my_schema.orders;
|
|
34
|
+
|
|
35
|
+
-- 分组分位数
|
|
36
|
+
SELECT
|
|
37
|
+
category,
|
|
38
|
+
approx_percentile(price, 0.5) AS median_price
|
|
39
|
+
FROM my_schema.products
|
|
40
|
+
GROUP BY category;
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### approx_histogram — 近似直方图
|
|
44
|
+
|
|
45
|
+
```sql
|
|
46
|
+
-- 返回结构体数组:[{min, max, count}, ...]
|
|
47
|
+
SELECT approx_histogram(amount, 10) AS hist
|
|
48
|
+
FROM my_schema.orders;
|
|
49
|
+
|
|
50
|
+
-- 解析直方图(展开为行)
|
|
51
|
+
SELECT
|
|
52
|
+
bucket.min AS bucket_min,
|
|
53
|
+
bucket.max AS bucket_max,
|
|
54
|
+
bucket.count AS bucket_count
|
|
55
|
+
FROM (
|
|
56
|
+
SELECT EXPLODE(approx_histogram(amount, 10)) AS bucket
|
|
57
|
+
FROM my_schema.orders
|
|
58
|
+
);
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### approx_top_k — 近似 TOP-K 高频值
|
|
62
|
+
|
|
63
|
+
```sql
|
|
64
|
+
-- 找出出现最多的前 10 个城市
|
|
65
|
+
SELECT approx_top_k(city, 10) AS top_cities
|
|
66
|
+
FROM my_schema.orders;
|
|
67
|
+
|
|
68
|
+
-- 返回结构体数组:[{value, count}, ...]
|
|
69
|
+
-- 解析展开(字段名是 value 和 count)
|
|
70
|
+
SELECT item.value AS city, item.count AS cnt
|
|
71
|
+
FROM (
|
|
72
|
+
SELECT EXPLODE(approx_top_k(city, 10)) AS item
|
|
73
|
+
FROM my_schema.orders
|
|
74
|
+
)
|
|
75
|
+
ORDER BY cnt DESC;
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 精确统计函数
|
|
81
|
+
|
|
82
|
+
### percentile / median
|
|
83
|
+
|
|
84
|
+
```sql
|
|
85
|
+
-- 精确中位数(小表用,大表用 approx_percentile)
|
|
86
|
+
SELECT
|
|
87
|
+
percentile(amount, 0.5) AS exact_median,
|
|
88
|
+
median(amount) AS median_alias -- 等价写法
|
|
89
|
+
FROM my_schema.orders;
|
|
90
|
+
|
|
91
|
+
-- 多分位数
|
|
92
|
+
SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
|
|
93
|
+
FROM my_schema.orders;
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## TABLESAMPLE 采样
|
|
99
|
+
|
|
100
|
+
```sql
|
|
101
|
+
-- ROW 模式:精确行级采样(适合 ML 训练集,< 1000万行)
|
|
102
|
+
SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- 精确 10%
|
|
103
|
+
SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- 精确 5 行
|
|
104
|
+
|
|
105
|
+
-- SYSTEM 模式:文件级采样(适合大表快速预览,> 1000万行)
|
|
106
|
+
SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- 约 0.1%
|
|
107
|
+
|
|
108
|
+
-- 分层采样(按类别等比例采样)
|
|
109
|
+
SELECT * FROM (
|
|
110
|
+
SELECT *,
|
|
111
|
+
ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
|
|
112
|
+
COUNT(*) OVER (PARTITION BY category) AS cat_total
|
|
113
|
+
FROM my_schema.products
|
|
114
|
+
)
|
|
115
|
+
WHERE rn <= CEIL(cat_total * 0.1); -- 每类取 10%
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
| 场景 | 推荐模式 | 说明 |
|
|
119
|
+
|---|---|---|
|
|
120
|
+
| 快速数据预览 | SYSTEM | 极快,适合 > 1000万行 |
|
|
121
|
+
| ML 训练集构建 | ROW | 精确随机,保证代表性 |
|
|
122
|
+
| 数据质量抽检 | SYSTEM | 快速抽样验证 |
|
|
123
|
+
| 统计分析 | ROW | 精确概率采样 |
|
|
124
|
+
|
|
125
|
+
> ⚠️ **注意**:TABLESAMPLE 在小表(< 数万行)上可能返回全部数据,百分比采样不精确。小表直接用 `LIMIT` 即可。
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## 窗口函数(时序/排名特征)
|
|
130
|
+
|
|
131
|
+
```sql
|
|
132
|
+
-- 移动平均(7日)
|
|
133
|
+
SELECT
|
|
134
|
+
dt,
|
|
135
|
+
revenue,
|
|
136
|
+
AVG(revenue) OVER (
|
|
137
|
+
ORDER BY dt
|
|
138
|
+
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
|
139
|
+
) AS revenue_7d_ma
|
|
140
|
+
FROM daily_stats;
|
|
141
|
+
|
|
142
|
+
-- 环比增长率
|
|
143
|
+
SELECT
|
|
144
|
+
dt,
|
|
145
|
+
revenue,
|
|
146
|
+
LAG(revenue, 1) OVER (ORDER BY dt) AS prev_revenue,
|
|
147
|
+
ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY dt))
|
|
148
|
+
/ NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
|
|
149
|
+
FROM daily_stats;
|
|
150
|
+
|
|
151
|
+
-- 用户行为排名(RFM 分析)
|
|
152
|
+
SELECT
|
|
153
|
+
user_id,
|
|
154
|
+
total_amount,
|
|
155
|
+
NTILE(5) OVER (ORDER BY total_amount DESC) AS monetary_quintile,
|
|
156
|
+
NTILE(5) OVER (ORDER BY order_cnt DESC) AS frequency_quintile,
|
|
157
|
+
NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
|
|
158
|
+
FROM user_rfm;
|
|
159
|
+
|
|
160
|
+
-- 去重保留最新(数据清洗常用)
|
|
161
|
+
SELECT * FROM (
|
|
162
|
+
SELECT *,
|
|
163
|
+
ROW_NUMBER() OVER (
|
|
164
|
+
PARTITION BY user_id
|
|
165
|
+
ORDER BY update_time DESC
|
|
166
|
+
) AS rn
|
|
167
|
+
FROM my_schema.users_raw
|
|
168
|
+
) WHERE rn = 1;
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## 数据质量检查模板
|
|
174
|
+
|
|
175
|
+
```sql
|
|
176
|
+
-- 一次性输出所有关键质量指标
|
|
177
|
+
SELECT
|
|
178
|
+
COUNT(*) AS total_rows,
|
|
179
|
+
COUNT(DISTINCT user_id) AS unique_users,
|
|
180
|
+
-- 缺失率
|
|
181
|
+
ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
|
|
182
|
+
/ COUNT(*), 2) AS user_id_null_pct,
|
|
183
|
+
ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
|
|
184
|
+
/ COUNT(*), 2) AS amount_null_pct,
|
|
185
|
+
-- 异常值
|
|
186
|
+
SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
|
|
187
|
+
SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
|
|
188
|
+
-- 时间范围
|
|
189
|
+
MIN(order_date) AS earliest_date,
|
|
190
|
+
MAX(order_date) AS latest_date,
|
|
191
|
+
-- 分布
|
|
192
|
+
approx_percentile(amount, 0.5) AS median_amount,
|
|
193
|
+
approx_percentile(amount, 0.99) AS p99_amount
|
|
194
|
+
FROM my_schema.orders;
|
|
195
|
+
```
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# 数据写入、特征工程、模型推理示例
|
|
2
|
+
|
|
3
|
+
## 数据写入
|
|
4
|
+
|
|
5
|
+
| 场景 | 方式 |
|
|
6
|
+
|------|------|
|
|
7
|
+
| ZettaPark 可用(Python 3.10+) | `save_as_table()` 或 `create_dataframe().write` |
|
|
8
|
+
| 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
|
|
9
|
+
| Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT(见下方) |
|
|
10
|
+
| **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
# 方式 A:ZettaPark(推荐)
|
|
14
|
+
session.sql("""
|
|
15
|
+
SELECT o.*, u.age_group FROM my_schema.orders_raw o
|
|
16
|
+
LEFT JOIN my_schema.users u ON o.user_id = u.user_id
|
|
17
|
+
WHERE o.amount > 0
|
|
18
|
+
""").write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
|
|
19
|
+
|
|
20
|
+
# 方式 B:pandas → Lakehouse
|
|
21
|
+
session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
|
|
22
|
+
|
|
23
|
+
# 方式 C:cursor 批量 INSERT(fallback)
|
|
24
|
+
import clickzetta, os
|
|
25
|
+
conn = clickzetta.connect(
|
|
26
|
+
service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
|
|
27
|
+
workspace=os.environ["CLICKZETTA_WORKSPACE"], username=os.environ["CLICKZETTA_USERNAME"],
|
|
28
|
+
password=os.environ["CLICKZETTA_PASSWORD"],
|
|
29
|
+
vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
|
|
30
|
+
schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
31
|
+
)
|
|
32
|
+
cursor = conn.cursor()
|
|
33
|
+
cursor.execute("CREATE TABLE IF NOT EXISTS ds_workspace.my_table (col1 STRING, col2 BIGINT, col3 DOUBLE)")
|
|
34
|
+
rows = local_df.values.tolist()
|
|
35
|
+
for i in range(0, len(rows), 500):
|
|
36
|
+
batch = rows[i:i+500]
|
|
37
|
+
vals = ",".join(f"({','.join(repr(v) for v in row)})" for row in batch)
|
|
38
|
+
cursor.execute(f"INSERT INTO ds_workspace.my_table VALUES {vals}")
|
|
39
|
+
conn.close()
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
```sql
|
|
43
|
+
-- 设置中间表生命周期(30 天自动清理)
|
|
44
|
+
ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 特征工程
|
|
50
|
+
|
|
51
|
+
```sql
|
|
52
|
+
-- SQL 侧(利用 Lakehouse 算力,推荐)
|
|
53
|
+
SELECT
|
|
54
|
+
user_id,
|
|
55
|
+
COUNT(*) AS order_cnt_30d,
|
|
56
|
+
SUM(amount) AS total_amount_30d,
|
|
57
|
+
AVG(amount) AS avg_amount_30d,
|
|
58
|
+
STDDEV(amount) AS std_amount_30d,
|
|
59
|
+
DATEDIFF('day', MIN(order_date), MAX(order_date)) AS active_days,
|
|
60
|
+
COUNT(DISTINCT DATE(order_date)) AS active_day_cnt,
|
|
61
|
+
NTILE(10) OVER (ORDER BY SUM(amount) DESC) AS revenue_decile
|
|
62
|
+
FROM my_schema.orders
|
|
63
|
+
WHERE order_date >= CURRENT_DATE - INTERVAL 30 DAY
|
|
64
|
+
GROUP BY user_id;
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# ZettaPark 侧(Python 逻辑)
|
|
69
|
+
from clickzetta.zettapark.functions import col, when
|
|
70
|
+
|
|
71
|
+
features = session.table("ds_workspace.orders_clean") \
|
|
72
|
+
.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
|
|
73
|
+
|
|
74
|
+
df = features.to_pandas()
|
|
75
|
+
|
|
76
|
+
from sklearn.preprocessing import StandardScaler
|
|
77
|
+
df[['amount_scaled']] = StandardScaler().fit_transform(df[['amount']])
|
|
78
|
+
|
|
79
|
+
session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace.features_final")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 模型推理上线
|
|
85
|
+
|
|
86
|
+
### BITMAP 用户画像
|
|
87
|
+
|
|
88
|
+
```sql
|
|
89
|
+
CREATE TABLE ds_workspace.user_tags AS
|
|
90
|
+
SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
|
|
91
|
+
FROM my_schema.user_behavior GROUP BY tag_name;
|
|
92
|
+
|
|
93
|
+
-- 人群交集
|
|
94
|
+
SELECT bitmap_count(bitmap_and(
|
|
95
|
+
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
|
|
96
|
+
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
|
|
97
|
+
)) AS target_user_count;
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### SQL UDF 批量推理
|
|
101
|
+
|
|
102
|
+
```sql
|
|
103
|
+
-- 调用已部署的模型 UDF(必须用完整 schema 路径)
|
|
104
|
+
INSERT INTO ds_workspace.predictions
|
|
105
|
+
SELECT user_id,
|
|
106
|
+
ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
|
|
107
|
+
CURRENT_TIMESTAMP() AS predict_time
|
|
108
|
+
FROM ds_workspace.features_final;
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 向量检索
|
|
112
|
+
|
|
113
|
+
```sql
|
|
114
|
+
SELECT candidate_id,
|
|
115
|
+
cosine_distance(
|
|
116
|
+
(SELECT embedding FROM ds_workspace.user_embeddings WHERE user_id = 'target'),
|
|
117
|
+
embedding
|
|
118
|
+
) AS similarity
|
|
119
|
+
FROM ds_workspace.user_embeddings
|
|
120
|
+
WHERE user_id != 'target'
|
|
121
|
+
ORDER BY similarity LIMIT 10;
|
|
122
|
+
```
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# ZettaPark API 数据科学常用操作
|
|
2
|
+
|
|
3
|
+
> 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
|
|
4
|
+
> **Python 版本**:推荐 3.12(最低 3.10)。安装:`python3.12 -m venv .venv && source .venv/bin/activate && pip install clickzetta_zettapark_python`
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Session 创建
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
from clickzetta.zettapark.session import Session
|
|
12
|
+
import os
|
|
13
|
+
from dotenv import load_dotenv
|
|
14
|
+
|
|
15
|
+
load_dotenv()
|
|
16
|
+
|
|
17
|
+
session = Session.builder.configs({
|
|
18
|
+
"service": os.environ["CLICKZETTA_SERVICE"],
|
|
19
|
+
"instance": os.environ["CLICKZETTA_INSTANCE"],
|
|
20
|
+
"workspace": os.environ["CLICKZETTA_WORKSPACE"],
|
|
21
|
+
"username": os.environ["CLICKZETTA_USERNAME"],
|
|
22
|
+
"password": os.environ["CLICKZETTA_PASSWORD"],
|
|
23
|
+
"vcluster": os.environ["CLICKZETTA_VCLUSTER"],
|
|
24
|
+
"schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
25
|
+
"hints": {
|
|
26
|
+
"sdk.job.timeout": 300,
|
|
27
|
+
"query_tag": "ds_notebook"
|
|
28
|
+
}
|
|
29
|
+
}).create()
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 数据读取
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
# 读取整张表
|
|
38
|
+
df = session.table("my_schema.orders")
|
|
39
|
+
|
|
40
|
+
# 执行 SQL 查询
|
|
41
|
+
df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
|
|
42
|
+
|
|
43
|
+
# 转为 pandas(小数据集)
|
|
44
|
+
pandas_df = df.to_pandas()
|
|
45
|
+
|
|
46
|
+
# 采样读取大表(只拉取 1% 样本,避免 OOM)
|
|
47
|
+
pandas_df = session.sql("""
|
|
48
|
+
SELECT * FROM my_schema.events
|
|
49
|
+
TABLESAMPLE ROW (1) -- 1% 精确采样
|
|
50
|
+
""").to_pandas()
|
|
51
|
+
|
|
52
|
+
# 只获取前 N 行
|
|
53
|
+
pandas_df = df.limit(10000).to_pandas()
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## DataFrame 变换
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
|
|
62
|
+
|
|
63
|
+
# 过滤
|
|
64
|
+
df_filtered = df.filter(col("amount") > 0)
|
|
65
|
+
df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
|
|
66
|
+
|
|
67
|
+
# 选择列
|
|
68
|
+
df_selected = df.select("user_id", "amount", "order_date")
|
|
69
|
+
|
|
70
|
+
# 新增列
|
|
71
|
+
df = df.with_column("log_amount", col("amount").cast("double"))
|
|
72
|
+
df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
|
|
73
|
+
|
|
74
|
+
# 聚合
|
|
75
|
+
agg_df = df.group_by("user_id").agg(
|
|
76
|
+
F_sum("amount").as_("total_amount"),
|
|
77
|
+
F_count("order_id").as_("order_cnt"),
|
|
78
|
+
F_avg("amount").as_("avg_amount")
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# JOIN
|
|
82
|
+
result = orders.join(users, orders["user_id"] == users["user_id"], "left")
|
|
83
|
+
|
|
84
|
+
# 排序
|
|
85
|
+
df_sorted = df.sort(col("amount").desc())
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 数据写回
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# 覆盖写入(常用于特征表更新)
|
|
94
|
+
df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
|
|
95
|
+
|
|
96
|
+
# 追加写入(常用于预测结果)
|
|
97
|
+
df.write.mode("append").save_as_table("ds_workspace.predictions")
|
|
98
|
+
|
|
99
|
+
# pandas DataFrame 写回
|
|
100
|
+
import pandas as pd
|
|
101
|
+
local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
|
|
102
|
+
session.create_dataframe(local_df).write.mode("overwrite") \
|
|
103
|
+
.save_as_table("ds_workspace.model_scores")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## 与 pandas/scikit-learn 集成
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
import pandas as pd
|
|
112
|
+
import numpy as np
|
|
113
|
+
from sklearn.preprocessing import StandardScaler
|
|
114
|
+
from sklearn.model_selection import train_test_split
|
|
115
|
+
from sklearn.ensemble import GradientBoostingClassifier
|
|
116
|
+
|
|
117
|
+
# 1. 从 Lakehouse 拉特征
|
|
118
|
+
features_df = session.sql("""
|
|
119
|
+
SELECT user_id, total_amount_30d, order_cnt_30d,
|
|
120
|
+
active_days, avg_amount_30d, label
|
|
121
|
+
FROM ds_workspace.features_final
|
|
122
|
+
""").to_pandas()
|
|
123
|
+
|
|
124
|
+
# 2. 本地处理
|
|
125
|
+
X = features_df.drop(["user_id", "label"], axis=1)
|
|
126
|
+
y = features_df["label"]
|
|
127
|
+
|
|
128
|
+
scaler = StandardScaler()
|
|
129
|
+
X_scaled = scaler.fit_transform(X)
|
|
130
|
+
|
|
131
|
+
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
|
|
132
|
+
|
|
133
|
+
# 3. 训练模型
|
|
134
|
+
model = GradientBoostingClassifier(n_estimators=100)
|
|
135
|
+
model.fit(X_train, y_train)
|
|
136
|
+
|
|
137
|
+
# 4. 预测并写回
|
|
138
|
+
features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
|
|
139
|
+
session.create_dataframe(
|
|
140
|
+
features_df[["user_id", "predicted_score"]]
|
|
141
|
+
).write.mode("overwrite").save_as_table("ds_workspace.predictions")
|
|
142
|
+
|
|
143
|
+
# 5. 保存模型
|
|
144
|
+
import joblib
|
|
145
|
+
joblib.dump(model, "models/gbm_model.pkl")
|
|
146
|
+
joblib.dump(scaler, "models/scaler.pkl")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## 注意事项
|
|
152
|
+
|
|
153
|
+
- `to_pandas()` 会把数据全部拉到本地内存,大表必须先 `TABLESAMPLE` 或 `LIMIT`
|
|
154
|
+
- `collect()` 返回 Row 对象列表,`to_pandas()` 返回 DataFrame,数据科学场景用后者
|
|
155
|
+
- ZettaPark 的 DataFrame 操作是懒执行,只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
|
|
156
|
+
- 写回时推荐用 `ds_workspace` 这样的专属 Schema,与生产数据隔离
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
ClickZetta Skills License
|
|
2
|
+
© 2026 Yunqi Inc. All rights reserved.
|
|
3
|
+
LICENSE: Use of these materials (including all code, prompts, assets, files, and other components of these skills (collectively, "Skills")) is governed by your agreement with ClickZetta for the Service. If no separate agreement exists, use is governed by ClickZetta's Terms of Service (available at: https://yunqi.tech/documents/user-aggrement).
|
|
4
|
+
Your applicable agreement is referred to as the "Agreement." "Service" is as defined in the Agreement.
|
|
5
|
+
ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the contrary, you may not:
|
|
6
|
+
|
|
7
|
+
Extract from the Service or retain copies of the Skills outside use with the Service;
|
|
8
|
+
Reproduce or copy the Skills, except for temporary copies created automatically during authorized use of the Service;
|
|
9
|
+
Create derivative works based on the Skills;
|
|
10
|
+
Distribute, sublicense, or transfer the Skills to any third party;
|
|
11
|
+
Make, offer to sell, sell, or import any inventions embodied in the Skills; nor,
|
|
12
|
+
Reverse engineer, decompile, or disassemble the Skills.
|
|
13
|
+
|
|
14
|
+
The receipt, viewing, or possession of the Skills does not convey or imply any license or right beyond those expressly granted above.
|
|
15
|
+
Yunqi retains all rights, title, and interest in the Skills, including all copyrights, trademarks, patents, and all other applicable intellectual property rights.
|
|
16
|
+
THE SKILLS ARE PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SKILLS OR THE USE OR OTHER DEALINGS IN THE SKILLS.
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-data-sharing
|
|
3
|
+
description: |
|
|
4
|
+
管理 ClickZetta Lakehouse 跨账户/跨实例数据分享(Share)。无需复制数据,
|
|
5
|
+
实时共享表或视图给其他服务实例。覆盖提供方完整流程(CREATE SHARE →
|
|
6
|
+
GRANT TO SHARE → ALTER SHARE ADD INSTANCE)和消费方流程
|
|
7
|
+
(SHOW SHARES → DESC SHARE → CREATE SCHEMA FROM SHARE → 查询)。
|
|
8
|
+
当用户说"数据分享"、"数据共享"、"Share"、"跨账户共享"、"跨实例共享"、
|
|
9
|
+
"CREATE SHARE"、"GRANT TO SHARE"、"CREATE SCHEMA FROM SHARE"、
|
|
10
|
+
"无复制共享"、"分享数据给其他公司"、"接收共享数据"、"INBOUND"、"OUTBOUND"时触发。
|
|
11
|
+
Keywords: data sharing, SHARE, cross-account, cross-instance, provider, consumer
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ClickZetta 数据分享
|
|
15
|
+
|
|
16
|
+
数据分享(Share)实现跨账户/跨实例的**无复制、实时只读**数据共享。提供方授权数据,消费方直接查询,无需数据同步。
|
|
17
|
+
|
|
18
|
+
阅读 [references/share-ddl.md](references/share-ddl.md) 了解完整语法。
|
|
19
|
+
|
|
20
|
+
> ⚠️ 创建 Share 需要 `instance_admin` 角色。
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 提供方:分享数据(3步)
|
|
25
|
+
|
|
26
|
+
### 步骤 1:创建 Share 对象
|
|
27
|
+
|
|
28
|
+
```sql
|
|
29
|
+
CREATE SHARE my_share;
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### 步骤 2:将表/视图加入 Share
|
|
33
|
+
|
|
34
|
+
```sql
|
|
35
|
+
-- 分享指定表
|
|
36
|
+
GRANT SELECT, READ METADATA ON TABLE public.orders TO SHARE my_share;
|
|
37
|
+
|
|
38
|
+
-- 分享视图(推荐:用视图控制分享字段和行范围)
|
|
39
|
+
GRANT SELECT, READ METADATA ON VIEW public.orders_public_view TO SHARE my_share;
|
|
40
|
+
|
|
41
|
+
-- 分享多张表
|
|
42
|
+
GRANT SELECT, READ METADATA ON TABLE public.orders, public.customers TO SHARE my_share;
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### 步骤 3:指定接收方实例
|
|
46
|
+
|
|
47
|
+
```sql
|
|
48
|
+
-- 添加接收方(将 consumer_instance_id 替换为消费方提供的实例名称)
|
|
49
|
+
ALTER SHARE my_share ADD INSTANCE consumer_instance_id;
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 消费方:使用共享数据(3步)
|
|
55
|
+
|
|
56
|
+
### 步骤 1:查看收到的 Share
|
|
57
|
+
|
|
58
|
+
```sql
|
|
59
|
+
SHOW SHARES WHERE kind = 'INBOUND';
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 步骤 2:查看 Share 内容
|
|
63
|
+
|
|
64
|
+
```sql
|
|
65
|
+
-- 格式:DESC SHARE <提供方实例名>.<share名>
|
|
66
|
+
DESC SHARE provider_instance.my_share;
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 步骤 3:创建本地只读 Schema
|
|
70
|
+
|
|
71
|
+
```sql
|
|
72
|
+
-- 格式:CREATE SCHEMA <本地名> FROM SHARE SHARE <实例>.<share>.<schema>
|
|
73
|
+
CREATE SCHEMA shared_data FROM SHARE SHARE provider_instance.my_share.public;
|
|
74
|
+
|
|
75
|
+
-- 直接查询
|
|
76
|
+
SELECT * FROM shared_data.orders LIMIT 10;
|
|
77
|
+
|
|
78
|
+
-- 与本地表关联
|
|
79
|
+
SELECT o.*, c.region
|
|
80
|
+
FROM shared_data.orders o
|
|
81
|
+
JOIN my_schema.dim_customer c ON o.customer_id = c.id;
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## 管理操作
|
|
87
|
+
|
|
88
|
+
```sql
|
|
89
|
+
-- 查看所有 Share(含 INBOUND/OUTBOUND)
|
|
90
|
+
SHOW SHARES;
|
|
91
|
+
|
|
92
|
+
-- 只看分享出去的
|
|
93
|
+
SHOW SHARES WHERE kind = 'OUTBOUND';
|
|
94
|
+
|
|
95
|
+
-- 查看 Share 包含的对象
|
|
96
|
+
DESC SHARE my_share;
|
|
97
|
+
|
|
98
|
+
-- 撤销某张表的分享
|
|
99
|
+
REVOKE SELECT, READ METADATA ON TABLE public.orders FROM SHARE my_share;
|
|
100
|
+
|
|
101
|
+
-- 移除接收方(立即生效)
|
|
102
|
+
ALTER SHARE my_share REMOVE INSTANCE consumer_instance_id;
|
|
103
|
+
|
|
104
|
+
-- 删除 Share
|
|
105
|
+
DROP SHARE IF EXISTS my_share;
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 典型场景
|
|
111
|
+
|
|
112
|
+
### 场景:A 公司向 B 公司分享数据
|
|
113
|
+
|
|
114
|
+
**A 公司(提供方)操作:**
|
|
115
|
+
|
|
116
|
+
```sql
|
|
117
|
+
-- 1. 创建 Share
|
|
118
|
+
CREATE SHARE partner_share;
|
|
119
|
+
|
|
120
|
+
-- 2. 创建视图控制分享范围(只暴露必要的列,且仅包含已完成的订单)
|
|
121
|
+
CREATE VIEW public.orders_for_partner AS
|
|
122
|
+
SELECT order_id, product_id, amount, order_date
|
|
123
|
+
FROM public.orders
|
|
124
|
+
WHERE status = 'completed';
|
|
125
|
+
|
|
126
|
+
-- 3. 将视图加入 Share
|
|
127
|
+
GRANT SELECT, READ METADATA ON VIEW public.orders_for_partner TO SHARE partner_share;
|
|
128
|
+
|
|
129
|
+
-- 4. 指定 B 公司实例(B 公司提供其实例名)
|
|
130
|
+
ALTER SHARE partner_share ADD INSTANCE b_company_instance;
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**B 公司(消费方)操作:**
|
|
134
|
+
|
|
135
|
+
```sql
|
|
136
|
+
-- 1. 查看收到的 Share
|
|
137
|
+
SHOW SHARES WHERE kind = 'INBOUND';
|
|
138
|
+
|
|
139
|
+
-- 2. 查看内容
|
|
140
|
+
DESC SHARE a_company_instance.partner_share;
|
|
141
|
+
|
|
142
|
+
-- 3. 创建本地 Schema
|
|
143
|
+
CREATE SCHEMA a_company_data FROM SHARE SHARE a_company_instance.partner_share.public;
|
|
144
|
+
|
|
145
|
+
-- 4. 查询使用
|
|
146
|
+
SELECT * FROM a_company_data.orders_for_partner
|
|
147
|
+
WHERE order_date >= '2024-01-01';
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 常见问题
|
|
153
|
+
|
|
154
|
+
| 问题 | 原因 | 解决方案 |
|
|
155
|
+
|---|---|---|
|
|
156
|
+
| CREATE SHARE 报权限不足 | 需要 instance_admin 角色 | 联系管理员授予 instance_admin |
|
|
157
|
+
| 消费方看不到 Share | 提供方未 ADD INSTANCE | 提供方执行 `ALTER SHARE <share名> ADD INSTANCE <消费方实例名>` |
|
|
158
|
+
| DESC SHARE 报错 | instance_name 填错 | 通过 `SHOW SHARES` 确认 provider_instance 字段 |
|
|
159
|
+
| 共享 Schema 下查不到表 | GRANT 时未包含该表 | 提供方重新 `GRANT ... TO SHARE` |
|
|
160
|
+
| 想只分享部分列/行 | 直接分享表会暴露全量数据 | 创建 VIEW 过滤后再分享 VIEW |
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"怎么把表数据分享给另一个实例?不复制数据的那种","expected_skill":"clickzetta-data-sharing","expected_output_contains":["CREATE SHARE","GRANT"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"作为消费方,怎么接收别人分享的数据?","expected_skill":"clickzetta-data-sharing","expected_output_contains":["SHOW SHARES","CREATE SCHEMA FROM SHARE"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"数据分享的提供方完整流程是什么?","expected_skill":"clickzetta-data-sharing","expected_output_contains":["SHARE","GRANT","INSTANCE"]}
|