@clickzetta/cz-cli-darwin-arm64 0.3.81 → 0.3.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-access-control/LICENSE +16 -0
  3. package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
  4. package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
  5. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
  6. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
  7. package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
  8. package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
  9. package/bin/skills/clickzetta-app-python-sdk/LICENSE +16 -0
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
  11. package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
  12. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
  13. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
  14. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
  15. package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +16 -0
  16. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +227 -0
  17. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -0
  18. package/bin/skills/clickzetta-bi-connect/LICENSE +16 -0
  19. package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
  20. package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
  21. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
  22. package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +16 -0
  23. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
  24. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
  25. package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +16 -0
  26. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
  27. package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
  28. package/bin/skills/clickzetta-data-retention/LICENSE +16 -0
  29. package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
  30. package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
  31. package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
  32. package/bin/skills/clickzetta-data-science/LICENSE +16 -0
  33. package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
  34. package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
  35. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
  36. package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
  37. package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
  38. package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
  39. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
  40. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
  41. package/bin/skills/clickzetta-data-sharing/LICENSE +16 -0
  42. package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
  43. package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
  44. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
  45. package/bin/skills/clickzetta-dba-guide/LICENSE +16 -0
  46. package/bin/skills/clickzetta-dba-guide/SKILL.md +542 -0
  47. package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +3 -0
  48. package/bin/skills/clickzetta-dw-modeling/LICENSE +16 -0
  49. package/bin/skills/clickzetta-dw-modeling/SKILL.md +351 -0
  50. package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
  51. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
  52. package/bin/skills/clickzetta-dynamic-table/LICENSE +16 -0
  53. package/bin/skills/clickzetta-dynamic-table/SKILL.md +230 -0
  54. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +253 -0
  55. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
  56. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
  58. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +135 -0
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +15 -0
  60. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +185 -0
  61. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +427 -0
  62. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +260 -0
  63. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +80 -0
  64. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +190 -0
  65. package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +5 -0
  66. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +27 -0
  67. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +118 -0
  68. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +225 -0
  69. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +182 -0
  70. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +98 -0
  71. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +76 -0
  72. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +109 -0
  73. package/bin/skills/clickzetta-external-catalog/LICENSE +16 -0
  74. package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
  75. package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
  76. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
  77. package/bin/skills/clickzetta-external-function/LICENSE +16 -0
  78. package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
  79. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
  80. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
  81. package/bin/skills/clickzetta-file-import-pipeline/LICENSE +16 -0
  82. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +190 -0
  83. package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +5 -0
  84. package/bin/skills/clickzetta-index-manager/LICENSE +16 -0
  85. package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
  86. package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
  87. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
  88. package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
  89. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
  90. package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
  91. package/bin/skills/clickzetta-java-sdk/LICENSE +16 -0
  92. package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
  93. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
  94. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
  95. package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
  96. package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +16 -0
  97. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +769 -0
  98. package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
  99. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
  100. package/bin/skills/clickzetta-lakehouse-connect/LICENSE +16 -0
  101. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
  102. package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
  103. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
  104. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
  105. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
  106. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
  107. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
  108. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
  109. package/bin/skills/clickzetta-manage-comments/LICENSE +16 -0
  110. package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
  111. package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
  112. package/bin/skills/clickzetta-metadata/LICENSE +16 -0
  113. package/bin/skills/clickzetta-metadata/SKILL.md +502 -0
  114. package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
  115. package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
  116. package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
  117. package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
  118. package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
  119. package/bin/skills/clickzetta-monitoring/LICENSE +16 -0
  120. package/bin/skills/clickzetta-monitoring/SKILL.md +215 -0
  121. package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
  122. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
  123. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
  124. package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +16 -0
  125. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +562 -0
  126. package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +5 -0
  127. package/bin/skills/clickzetta-overview/LICENSE +16 -0
  128. package/bin/skills/clickzetta-overview/SKILL.md +102 -0
  129. package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
  130. package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
  131. package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
  132. package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
  133. package/bin/skills/clickzetta-pipeline-review/LICENSE +16 -0
  134. package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
  135. package/bin/skills/clickzetta-query-optimizer/LICENSE +16 -0
  136. package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
  137. package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
  138. package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
  139. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
  140. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
  141. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
  142. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
  143. package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +16 -0
  144. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
  145. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
  146. package/bin/skills/clickzetta-semantic-view/LICENSE +16 -0
  147. package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
  148. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
  149. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
  150. package/bin/skills/clickzetta-spark-flink-connector/LICENSE +16 -0
  151. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
  152. package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
  153. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
  154. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
  155. package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +16 -0
  156. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +485 -0
  157. package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +12 -0
  158. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
  159. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
  160. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
  161. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
  162. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
  163. package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +16 -0
  164. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
  165. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
  166. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
  167. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
  168. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
  169. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
  170. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
  171. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
  172. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
  173. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
  174. package/bin/skills/clickzetta-studio-task-manager/LICENSE +16 -0
  175. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
  176. package/bin/skills/clickzetta-table-lineage/LICENSE +16 -0
  177. package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
  178. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
  179. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
  180. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
  181. package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
  182. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
  183. package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +16 -0
  184. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
  185. package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +5 -0
  186. package/bin/skills/clickzetta-vcluster-manager/LICENSE +16 -0
  187. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
  188. package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +5 -0
  189. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
  190. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
  191. package/bin/skills/clickzetta-volume-manager/LICENSE +16 -0
  192. package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
  193. package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +5 -0
  194. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
  195. package/bin/skills/clickzetta-zettapark/LICENSE +16 -0
  196. package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
  197. package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
  198. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
  199. package/bin/skills/cz-cli/SKILL.md +313 -0
  200. package/bin/skills/cz-cli/references/profile-setup.md +120 -0
  201. package/package.json +1 -1
@@ -0,0 +1,195 @@
1
+ # 数据科学统计分析函数参考
2
+
3
+ ---
4
+
5
+ ## 近似聚合函数(大表高效统计)
6
+
7
+ ### approx_count_distinct — 近似 UV
8
+
9
+ ```sql
10
+ -- 使用 HyperLogLog 算法,误差约 2%,比 COUNT(DISTINCT) 快 10x+
11
+ SELECT approx_count_distinct(user_id) AS approx_uv
12
+ FROM my_schema.events;
13
+
14
+ -- 按天统计 DAU
15
+ SELECT
16
+ DATE(event_time) AS dt,
17
+ approx_count_distinct(user_id) AS dau
18
+ FROM my_schema.events
19
+ GROUP BY 1
20
+ ORDER BY 1;
21
+ ```
22
+
23
+ ### approx_percentile — 近似分位数
24
+
25
+ ```sql
26
+ -- 中位数、四分位数、P95、P99
27
+ SELECT
28
+ approx_percentile(amount, 0.25) AS p25,
29
+ approx_percentile(amount, 0.50) AS median,
30
+ approx_percentile(amount, 0.75) AS p75,
31
+ approx_percentile(amount, 0.95) AS p95,
32
+ approx_percentile(amount, 0.99) AS p99
33
+ FROM my_schema.orders;
34
+
35
+ -- 分组分位数
36
+ SELECT
37
+ category,
38
+ approx_percentile(price, 0.5) AS median_price
39
+ FROM my_schema.products
40
+ GROUP BY category;
41
+ ```
42
+
43
+ ### approx_histogram — 近似直方图
44
+
45
+ ```sql
46
+ -- 返回结构体数组:[{min, max, count}, ...]
47
+ SELECT approx_histogram(amount, 10) AS hist
48
+ FROM my_schema.orders;
49
+
50
+ -- 解析直方图(展开为行)
51
+ SELECT
52
+ bucket.min AS bucket_min,
53
+ bucket.max AS bucket_max,
54
+ bucket.count AS bucket_count
55
+ FROM (
56
+ SELECT EXPLODE(approx_histogram(amount, 10)) AS bucket
57
+ FROM my_schema.orders
58
+ );
59
+ ```
60
+
61
+ ### approx_top_k — 近似 TOP-K 高频值
62
+
63
+ ```sql
64
+ -- 找出出现最多的前 10 个城市
65
+ SELECT approx_top_k(city, 10) AS top_cities
66
+ FROM my_schema.orders;
67
+
68
+ -- 返回结构体数组:[{value, count}, ...]
69
+ -- 解析展开(字段名是 value 和 count)
70
+ SELECT item.value AS city, item.count AS cnt
71
+ FROM (
72
+ SELECT EXPLODE(approx_top_k(city, 10)) AS item
73
+ FROM my_schema.orders
74
+ )
75
+ ORDER BY cnt DESC;
76
+ ```
77
+
78
+ ---
79
+
80
+ ## 精确统计函数
81
+
82
+ ### percentile / median
83
+
84
+ ```sql
85
+ -- 精确中位数(小表用,大表用 approx_percentile)
86
+ SELECT
87
+ percentile(amount, 0.5) AS exact_median,
88
+ median(amount) AS median_alias -- 等价写法
89
+ FROM my_schema.orders;
90
+
91
+ -- 多分位数
92
+ SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
93
+ FROM my_schema.orders;
94
+ ```
95
+
96
+ ---
97
+
98
+ ## TABLESAMPLE 采样
99
+
100
+ ```sql
101
+ -- ROW 模式:精确行级采样(适合 ML 训练集,< 1000万行)
102
+ SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- 精确 10%
103
+ SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- 精确 5 行
104
+
105
+ -- SYSTEM 模式:文件级采样(适合大表快速预览,> 1000万行)
106
+ SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- 约 0.1%
107
+
108
+ -- 分层采样(按类别等比例采样)
109
+ SELECT * FROM (
110
+ SELECT *,
111
+ ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
112
+ COUNT(*) OVER (PARTITION BY category) AS cat_total
113
+ FROM my_schema.products
114
+ )
115
+ WHERE rn <= CEIL(cat_total * 0.1); -- 每类取 10%
116
+ ```
117
+
118
+ | 场景 | 推荐模式 | 说明 |
119
+ |---|---|---|
120
+ | 快速数据预览 | SYSTEM | 极快,适合 > 1000万行 |
121
+ | ML 训练集构建 | ROW | 精确随机,保证代表性 |
122
+ | 数据质量抽检 | SYSTEM | 快速抽样验证 |
123
+ | 统计分析 | ROW | 精确概率采样 |
124
+
125
+ > ⚠️ **注意**:TABLESAMPLE 在小表(< 数万行)上可能返回全部数据,百分比采样不精确。小表直接用 `LIMIT` 即可。
126
+
127
+ ---
128
+
129
+ ## 窗口函数(时序/排名特征)
130
+
131
+ ```sql
132
+ -- 移动平均(7日)
133
+ SELECT
134
+ dt,
135
+ revenue,
136
+ AVG(revenue) OVER (
137
+ ORDER BY dt
138
+ ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
139
+ ) AS revenue_7d_ma
140
+ FROM daily_stats;
141
+
142
+ -- 环比增长率
143
+ SELECT
144
+ dt,
145
+ revenue,
146
+ LAG(revenue, 1) OVER (ORDER BY dt) AS prev_revenue,
147
+ ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY dt))
148
+ / NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
149
+ FROM daily_stats;
150
+
151
+ -- 用户行为排名(RFM 分析)
152
+ SELECT
153
+ user_id,
154
+ total_amount,
155
+ NTILE(5) OVER (ORDER BY total_amount DESC) AS monetary_quintile,
156
+ NTILE(5) OVER (ORDER BY order_cnt DESC) AS frequency_quintile,
157
+ NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
158
+ FROM user_rfm;
159
+
160
+ -- 去重保留最新(数据清洗常用)
161
+ SELECT * FROM (
162
+ SELECT *,
163
+ ROW_NUMBER() OVER (
164
+ PARTITION BY user_id
165
+ ORDER BY update_time DESC
166
+ ) AS rn
167
+ FROM my_schema.users_raw
168
+ ) WHERE rn = 1;
169
+ ```
170
+
171
+ ---
172
+
173
+ ## 数据质量检查模板
174
+
175
+ ```sql
176
+ -- 一次性输出所有关键质量指标
177
+ SELECT
178
+ COUNT(*) AS total_rows,
179
+ COUNT(DISTINCT user_id) AS unique_users,
180
+ -- 缺失率
181
+ ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
182
+ / COUNT(*), 2) AS user_id_null_pct,
183
+ ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
184
+ / COUNT(*), 2) AS amount_null_pct,
185
+ -- 异常值
186
+ SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
187
+ SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
188
+ -- 时间范围
189
+ MIN(order_date) AS earliest_date,
190
+ MAX(order_date) AS latest_date,
191
+ -- 分布
192
+ approx_percentile(amount, 0.5) AS median_amount,
193
+ approx_percentile(amount, 0.99) AS p99_amount
194
+ FROM my_schema.orders;
195
+ ```
@@ -0,0 +1,122 @@
1
+ # 数据写入、特征工程、模型推理示例
2
+
3
+ ## 数据写入
4
+
5
+ | 场景 | 方式 |
6
+ |------|------|
7
+ | ZettaPark 可用(Python 3.10+) | `save_as_table()` 或 `create_dataframe().write` |
8
+ | 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
9
+ | Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT(见下方) |
10
+ | **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
11
+
12
+ ```python
13
+ # 方式 A:ZettaPark(推荐)
14
+ session.sql("""
15
+ SELECT o.*, u.age_group FROM my_schema.orders_raw o
16
+ LEFT JOIN my_schema.users u ON o.user_id = u.user_id
17
+ WHERE o.amount > 0
18
+ """).write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
19
+
20
+ # 方式 B:pandas → Lakehouse
21
+ session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
22
+
23
+ # 方式 C:cursor 批量 INSERT(fallback;注意:下方用 repr() 拼接 SQL,None/NaN 不会被转换为 NULL,且存在注入风险,仅适用于可信且已清洗的数据)
24
+ import clickzetta, os
25
+ conn = clickzetta.connect(
26
+ service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
27
+ workspace=os.environ["CLICKZETTA_WORKSPACE"], username=os.environ["CLICKZETTA_USERNAME"],
28
+ password=os.environ["CLICKZETTA_PASSWORD"],
29
+ vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
30
+ schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
31
+ )
32
+ cursor = conn.cursor()
33
+ cursor.execute("CREATE TABLE IF NOT EXISTS ds_workspace.my_table (col1 STRING, col2 BIGINT, col3 DOUBLE)")
34
+ rows = local_df.values.tolist()
35
+ for i in range(0, len(rows), 500):
36
+ batch = rows[i:i+500]
37
+ vals = ",".join(f"({','.join(repr(v) for v in row)})" for row in batch)
38
+ cursor.execute(f"INSERT INTO ds_workspace.my_table VALUES {vals}")
39
+ conn.close()
40
+ ```
41
+
42
+ ```sql
43
+ -- 设置中间表生命周期(30 天自动清理)
44
+ ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
45
+ ```
46
+
47
+ ---
48
+
49
+ ## 特征工程
50
+
51
+ ```sql
52
+ -- SQL 侧(利用 Lakehouse 算力,推荐)
53
+ SELECT
54
+ user_id,
55
+ COUNT(*) AS order_cnt_30d,
56
+ SUM(amount) AS total_amount_30d,
57
+ AVG(amount) AS avg_amount_30d,
58
+ STDDEV(amount) AS std_amount_30d,
59
+ DATEDIFF('day', MIN(order_date), MAX(order_date)) AS active_days,
60
+ COUNT(DISTINCT DATE(order_date)) AS active_day_cnt,
61
+ NTILE(10) OVER (ORDER BY SUM(amount) DESC) AS revenue_decile
62
+ FROM my_schema.orders
63
+ WHERE order_date >= CURRENT_DATE - INTERVAL 30 DAY
64
+ GROUP BY user_id;
65
+ ```
66
+
67
+ ```python
68
+ # ZettaPark 侧(Python 逻辑)
69
+ from clickzetta.zettapark.functions import col, when
70
+
71
+ features = session.table("ds_workspace.orders_clean") \
72
+ .with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
73
+
74
+ df = features.to_pandas()
75
+
76
+ from sklearn.preprocessing import StandardScaler
77
+ df[['amount_scaled']] = StandardScaler().fit_transform(df[['amount']])
78
+
79
+ session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace.features_final")
80
+ ```
81
+
82
+ ---
83
+
84
+ ## 模型推理上线
85
+
86
+ ### BITMAP 用户画像
87
+
88
+ ```sql
89
+ CREATE TABLE ds_workspace.user_tags AS
90
+ SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
91
+ FROM my_schema.user_behavior GROUP BY tag_name;
92
+
93
+ -- 人群交集
94
+ SELECT bitmap_count(bitmap_and(
95
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
96
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
97
+ )) AS target_user_count;
98
+ ```
99
+
100
+ ### SQL UDF 批量推理
101
+
102
+ ```sql
103
+ -- 调用已部署的模型 UDF(必须用完整 schema 路径)
104
+ INSERT INTO ds_workspace.predictions
105
+ SELECT user_id,
106
+ ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
107
+ CURRENT_TIMESTAMP() AS predict_time
108
+ FROM ds_workspace.features_final;
109
+ ```
110
+
111
+ ### 向量检索
112
+
113
+ ```sql
114
+ SELECT candidate_id,
115
+ cosine_distance(
116
+ (SELECT embedding FROM ds_workspace.user_embeddings WHERE user_id = 'target'),
117
+ embedding
118
+ ) AS similarity
119
+ FROM ds_workspace.user_embeddings
120
+ WHERE user_id != 'target'
121
+ ORDER BY similarity LIMIT 10;
122
+ ```
@@ -0,0 +1,156 @@
1
+ # ZettaPark API 数据科学常用操作
2
+
3
+ > 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
4
+ > **Python 版本**:推荐 3.12(最低 3.10)。安装:`python3.12 -m venv .venv && source .venv/bin/activate && pip install clickzetta_zettapark_python`
5
+
6
+ ---
7
+
8
+ ## Session 创建
9
+
10
+ ```python
11
+ from clickzetta.zettapark.session import Session
12
+ import os
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
+ session = Session.builder.configs({
18
+ "service": os.environ["CLICKZETTA_SERVICE"],
19
+ "instance": os.environ["CLICKZETTA_INSTANCE"],
20
+ "workspace": os.environ["CLICKZETTA_WORKSPACE"],
21
+ "username": os.environ["CLICKZETTA_USERNAME"],
22
+ "password": os.environ["CLICKZETTA_PASSWORD"],
23
+ "vcluster": os.environ["CLICKZETTA_VCLUSTER"],
24
+ "schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
25
+ "hints": {
26
+ "sdk.job.timeout": 300,
27
+ "query_tag": "ds_notebook"
28
+ }
29
+ }).create()
30
+ ```
31
+
32
+ ---
33
+
34
+ ## 数据读取
35
+
36
+ ```python
37
+ # 读取整张表
38
+ df = session.table("my_schema.orders")
39
+
40
+ # 执行 SQL 查询
41
+ df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
42
+
43
+ # 转为 pandas(小数据集)
44
+ pandas_df = df.to_pandas()
45
+
46
+ # 采样读取大表(避免 OOM;此处为 1% 采样,并非分批读取全量数据)
47
+ pandas_df = session.sql("""
48
+ SELECT * FROM my_schema.events
49
+ TABLESAMPLE ROW (1) -- 1% 精确采样
50
+ """).to_pandas()
51
+
52
+ # 只获取前 N 行
53
+ pandas_df = df.limit(10000).to_pandas()
54
+ ```
55
+
56
+ ---
57
+
58
+ ## DataFrame 变换
59
+
60
+ ```python
61
+ from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
62
+
63
+ # 过滤
64
+ df_filtered = df.filter(col("amount") > 0)
65
+ df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
66
+
67
+ # 选择列
68
+ df_selected = df.select("user_id", "amount", "order_date")
69
+
70
+ # 新增列
71
+ df = df.with_column("log_amount", col("amount").cast("double"))
72
+ df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
73
+
74
+ # 聚合
75
+ agg_df = df.group_by("user_id").agg(
76
+ F_sum("amount").as_("total_amount"),
77
+ F_count("order_id").as_("order_cnt"),
78
+ F_avg("amount").as_("avg_amount")
79
+ )
80
+
81
+ # JOIN
82
+ result = orders.join(users, orders["user_id"] == users["user_id"], "left")
83
+
84
+ # 排序
85
+ df_sorted = df.sort(col("amount").desc())
86
+ ```
87
+
88
+ ---
89
+
90
+ ## 数据写回
91
+
92
+ ```python
93
+ # 覆盖写入(常用于特征表更新)
94
+ df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
95
+
96
+ # 追加写入(常用于预测结果)
97
+ df.write.mode("append").save_as_table("ds_workspace.predictions")
98
+
99
+ # pandas DataFrame 写回
100
+ import pandas as pd
101
+ local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
102
+ session.create_dataframe(local_df).write.mode("overwrite") \
103
+ .save_as_table("ds_workspace.model_scores")
104
+ ```
105
+
106
+ ---
107
+
108
+ ## 与 pandas/scikit-learn 集成
109
+
110
+ ```python
111
+ import pandas as pd
112
+ import numpy as np
113
+ from sklearn.preprocessing import StandardScaler
114
+ from sklearn.model_selection import train_test_split
115
+ from sklearn.ensemble import GradientBoostingClassifier
116
+
117
+ # 1. 从 Lakehouse 拉特征
118
+ features_df = session.sql("""
119
+ SELECT user_id, total_amount_30d, order_cnt_30d,
120
+ active_days, avg_amount_30d, label
121
+ FROM ds_workspace.features_final
122
+ """).to_pandas()
123
+
124
+ # 2. 本地处理
125
+ X = features_df.drop(["user_id", "label"], axis=1)
126
+ y = features_df["label"]
127
+
128
+ scaler = StandardScaler()
129
+ X_scaled = scaler.fit_transform(X)
130
+
131
+ X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
132
+
133
+ # 3. 训练模型
134
+ model = GradientBoostingClassifier(n_estimators=100)
135
+ model.fit(X_train, y_train)
136
+
137
+ # 4. 预测并写回
138
+ features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
139
+ session.create_dataframe(
140
+ features_df[["user_id", "predicted_score"]]
141
+ ).write.mode("overwrite").save_as_table("ds_workspace.predictions")
142
+
143
+ # 5. 保存模型
144
+ import joblib
145
+ joblib.dump(model, "models/gbm_model.pkl")
146
+ joblib.dump(scaler, "models/scaler.pkl")
147
+ ```
148
+
149
+ ---
150
+
151
+ ## 注意事项
152
+
153
+ - `to_pandas()` 会把数据全部拉到本地内存,大表必须先 `TABLESAMPLE` 或 `LIMIT`
154
+ - `collect()` 返回 Row 对象列表,`to_pandas()` 返回 DataFrame,数据科学场景用后者
155
+ - ZettaPark 的 DataFrame 操作是懒执行,只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
156
+ - 写回时推荐用 `ds_workspace` 这样的专属 Schema,与生产数据隔离
@@ -0,0 +1,16 @@
1
+ ClickZetta Skills License
2
+ © 2026 Yunqi Inc. All rights reserved.
3
+ LICENSE: Use of these materials (including all code, prompts, assets, files, and other components of these skills (collectively, "Skills")) is governed by your agreement with ClickZetta for the Service. If no separate agreement exists, use is governed by ClickZetta's Terms of Service (available at: https://yunqi.tech/documents/user-aggrement).
4
+ Your applicable agreement is referred to as the "Agreement." "Service" is as defined in the Agreement.
5
+ ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the contrary, you may not:
6
+
7
+ Extract from the Service or retain copies of the Skills outside use with the Service;
8
+ Reproduce or copy the Skills, except for temporary copies created automatically during authorized use of the Service;
9
+ Create derivative works based on the Skills;
10
+ Distribute, sublicense, or transfer the Skills to any third party;
11
+ Make, offer to sell, sell, or import any inventions embodied in the Skills; nor,
12
+ Reverse engineer, decompile, or disassemble the Skills.
13
+
14
+ The receipt, viewing, or possession of the Skills does not convey or imply any license or right beyond those expressly granted above.
15
+ Yunqi retains all rights, title, and interest in the Skills, including all copyrights, trademarks, patents, and all other applicable intellectual property rights.
16
+ THE SKILLS ARE PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SKILLS OR THE USE OR OTHER DEALINGS IN THE SKILLS.
@@ -0,0 +1,160 @@
1
+ ---
2
+ name: clickzetta-data-sharing
3
+ description: |
4
+ 管理 ClickZetta Lakehouse 跨账户/跨实例数据分享(Share)。无需复制数据,
5
+ 实时共享表或视图给其他服务实例。覆盖提供方完整流程(CREATE SHARE →
6
+ GRANT TO SHARE → ALTER SHARE ADD INSTANCE)和消费方流程
7
+ (SHOW SHARES → DESC SHARE → CREATE SCHEMA FROM SHARE → 查询)。
8
+ 当用户说"数据分享"、"数据共享"、"Share"、"跨账户共享"、"跨实例共享"、
9
+ "CREATE SHARE"、"GRANT TO SHARE"、"CREATE SCHEMA FROM SHARE"、
10
+ "无复制共享"、"分享数据给其他公司"、"接收共享数据"、"INBOUND"、"OUTBOUND"时触发。
11
+ Keywords: data sharing, SHARE, cross-account, cross-instance, provider, consumer
12
+ ---
13
+
14
+ # ClickZetta 数据分享
15
+
16
+ 数据分享(Share)实现跨账户/跨实例的**无复制、实时只读**数据共享。提供方授权数据,消费方直接查询,无需数据同步。
17
+
18
+ 阅读 [references/share-ddl.md](references/share-ddl.md) 了解完整语法。
19
+
20
+ > ⚠️ 创建 Share 需要 `instance_admin` 角色。
21
+
22
+ ---
23
+
24
+ ## 提供方:分享数据(3步)
25
+
26
+ ### 步骤 1:创建 Share 对象
27
+
28
+ ```sql
29
+ CREATE SHARE my_share;
30
+ ```
31
+
32
+ ### 步骤 2:将表/视图加入 Share
33
+
34
+ ```sql
35
+ -- 分享指定表
36
+ GRANT SELECT, READ METADATA ON TABLE public.orders TO SHARE my_share;
37
+
38
+ -- 分享视图(推荐:用视图控制分享字段和行范围)
39
+ GRANT SELECT, READ METADATA ON VIEW public.orders_public_view TO SHARE my_share;
40
+
41
+ -- 分享多张表
42
+ GRANT SELECT, READ METADATA ON TABLE public.orders, public.customers TO SHARE my_share;
43
+ ```
44
+
45
+ ### 步骤 3:指定接收方实例
46
+
47
+ ```sql
48
+ -- 添加接收方(消费方提供其实例名称)
49
+ ALTER SHARE my_share ADD INSTANCE consumer_instance_id;
50
+ ```
51
+
52
+ ---
53
+
54
+ ## 消费方:使用共享数据(3步)
55
+
56
+ ### 步骤 1:查看收到的 Share
57
+
58
+ ```sql
59
+ SHOW SHARES WHERE kind = 'INBOUND';
60
+ ```
61
+
62
+ ### 步骤 2:查看 Share 内容
63
+
64
+ ```sql
65
+ -- 格式:DESC SHARE <提供方实例名>.<share名>
66
+ DESC SHARE provider_instance.my_share;
67
+ ```
68
+
69
+ ### 步骤 3:创建本地只读 Schema
70
+
71
+ ```sql
72
+ -- 格式:CREATE SCHEMA <本地名> FROM SHARE SHARE <实例>.<share>.<schema>
73
+ CREATE SCHEMA shared_data FROM SHARE SHARE provider_instance.my_share.public;
74
+
75
+ -- 直接查询
76
+ SELECT * FROM shared_data.orders LIMIT 10;
77
+
78
+ -- 与本地表关联
79
+ SELECT o.*, c.region
80
+ FROM shared_data.orders o
81
+ JOIN my_schema.dim_customer c ON o.customer_id = c.id;
82
+ ```
83
+
84
+ ---
85
+
86
+ ## 管理操作
87
+
88
+ ```sql
89
+ -- 查看所有 Share(含 INBOUND/OUTBOUND)
90
+ SHOW SHARES;
91
+
92
+ -- 只看分享出去的
93
+ SHOW SHARES WHERE kind = 'OUTBOUND';
94
+
95
+ -- 查看 Share 包含的对象
96
+ DESC SHARE my_share;
97
+
98
+ -- 撤销某张表的分享
99
+ REVOKE SELECT, READ METADATA ON TABLE public.orders FROM SHARE my_share;
100
+
101
+ -- 移除接收方(立即生效)
102
+ ALTER SHARE my_share REMOVE INSTANCE consumer_instance_id;
103
+
104
+ -- 删除 Share
105
+ DROP SHARE IF EXISTS my_share;
106
+ ```
107
+
108
+ ---
109
+
110
+ ## 典型场景
111
+
112
+ ### 场景:A 公司向 B 公司分享数据
113
+
114
+ **A 公司(提供方)操作:**
115
+
116
+ ```sql
117
+ -- 1. 创建 Share
118
+ CREATE SHARE partner_share;
119
+
120
+ -- 2. 创建视图控制分享范围(只暴露必要的列,且仅包含已完成的订单)
121
+ CREATE VIEW public.orders_for_partner AS
122
+ SELECT order_id, product_id, amount, order_date
123
+ FROM public.orders
124
+ WHERE status = 'completed';
125
+
126
+ -- 3. 将视图加入 Share
127
+ GRANT SELECT, READ METADATA ON VIEW public.orders_for_partner TO SHARE partner_share;
128
+
129
+ -- 4. 指定 B 公司实例(B 公司提供其实例名)
130
+ ALTER SHARE partner_share ADD INSTANCE b_company_instance;
131
+ ```
132
+
133
+ **B 公司(消费方)操作:**
134
+
135
+ ```sql
136
+ -- 1. 查看收到的 Share
137
+ SHOW SHARES WHERE kind = 'INBOUND';
138
+
139
+ -- 2. 查看内容
140
+ DESC SHARE a_company_instance.partner_share;
141
+
142
+ -- 3. 创建本地 Schema
143
+ CREATE SCHEMA a_company_data FROM SHARE SHARE a_company_instance.partner_share.public;
144
+
145
+ -- 4. 查询使用
146
+ SELECT * FROM a_company_data.orders_for_partner
147
+ WHERE order_date >= '2024-01-01';
148
+ ```
149
+
150
+ ---
151
+
152
+ ## 常见问题
153
+
154
+ | 问题 | 原因 | 解决方案 |
155
+ |---|---|---|
156
+ | CREATE SHARE 报权限不足 | 需要 instance_admin 角色 | 联系管理员授予 instance_admin |
157
+ | 消费方看不到 Share | 提供方未 ADD INSTANCE | 提供方执行 `ALTER SHARE <share名> ADD INSTANCE <消费方实例名>` |
158
+ | DESC SHARE 报错 | instance_name 填错 | 通过 `SHOW SHARES` 确认 provider_instance 字段 |
159
+ | 共享 Schema 下查不到表 | GRANT 时未包含该表 | 提供方重新 `GRANT ... TO SHARE` |
160
+ | 想只分享部分列/行 | 直接分享表会暴露全量数据 | 创建 VIEW 过滤后再分享 VIEW |
@@ -0,0 +1,3 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么把表数据分享给另一个实例?不复制数据的那种","expected_skill":"clickzetta-data-sharing","expected_output_contains":["CREATE SHARE","GRANT"]}
2
+ {"case_id":"002","type":"should_call","user_input":"作为消费方,怎么接收别人分享的数据?","expected_skill":"clickzetta-data-sharing","expected_output_contains":["SHOW SHARES","CREATE SCHEMA","FROM SHARE"]}
3
+ {"case_id":"003","type":"should_call","user_input":"数据分享的提供方完整流程是什么?","expected_skill":"clickzetta-data-sharing","expected_output_contains":["SHARE","GRANT","INSTANCE"]}