@clickzetta/cz-cli-darwin-x64 0.3.81 → 0.3.83

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-access-control/LICENSE +16 -0
  3. package/bin/skills/clickzetta-access-control/SKILL.md +243 -0
  4. package/bin/skills/clickzetta-access-control/eval_cases.jsonl +3 -0
  5. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +86 -0
  6. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +103 -0
  7. package/bin/skills/clickzetta-access-control/references/role-management.md +66 -0
  8. package/bin/skills/clickzetta-access-control/references/user-management.md +61 -0
  9. package/bin/skills/clickzetta-app-python-sdk/LICENSE +16 -0
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
  11. package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
  12. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
  13. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
  14. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
  15. package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +16 -0
  16. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +227 -0
  17. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -0
  18. package/bin/skills/clickzetta-bi-connect/LICENSE +16 -0
  19. package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
  20. package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
  21. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
  22. package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +16 -0
  23. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
  24. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
  25. package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +16 -0
  26. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +237 -0
  27. package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
  28. package/bin/skills/clickzetta-data-retention/LICENSE +16 -0
  29. package/bin/skills/clickzetta-data-retention/SKILL.md +160 -0
  30. package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +5 -0
  31. package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +175 -0
  32. package/bin/skills/clickzetta-data-science/LICENSE +16 -0
  33. package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
  34. package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
  35. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
  36. package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
  37. package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
  38. package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
  39. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
  40. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
  41. package/bin/skills/clickzetta-data-sharing/LICENSE +16 -0
  42. package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
  43. package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
  44. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
  45. package/bin/skills/clickzetta-dba-guide/LICENSE +16 -0
  46. package/bin/skills/clickzetta-dba-guide/SKILL.md +542 -0
  47. package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +3 -0
  48. package/bin/skills/clickzetta-dw-modeling/LICENSE +16 -0
  49. package/bin/skills/clickzetta-dw-modeling/SKILL.md +351 -0
  50. package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +4 -0
  51. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +100 -0
  52. package/bin/skills/clickzetta-dynamic-table/LICENSE +16 -0
  53. package/bin/skills/clickzetta-dynamic-table/SKILL.md +230 -0
  54. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +253 -0
  55. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +124 -0
  56. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +96 -0
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +109 -0
  58. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +135 -0
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +15 -0
  60. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +185 -0
  61. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +427 -0
  62. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +260 -0
  63. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +80 -0
  64. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +190 -0
  65. package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +5 -0
  66. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +27 -0
  67. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +118 -0
  68. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +225 -0
  69. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +182 -0
  70. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +98 -0
  71. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +76 -0
  72. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +109 -0
  73. package/bin/skills/clickzetta-external-catalog/LICENSE +16 -0
  74. package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
  75. package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
  76. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
  77. package/bin/skills/clickzetta-external-function/LICENSE +16 -0
  78. package/bin/skills/clickzetta-external-function/SKILL.md +203 -0
  79. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -0
  80. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +171 -0
  81. package/bin/skills/clickzetta-file-import-pipeline/LICENSE +16 -0
  82. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +190 -0
  83. package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +5 -0
  84. package/bin/skills/clickzetta-index-manager/LICENSE +16 -0
  85. package/bin/skills/clickzetta-index-manager/SKILL.md +140 -0
  86. package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +5 -0
  87. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +67 -0
  88. package/bin/skills/clickzetta-index-manager/references/index-management.md +73 -0
  89. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +80 -0
  90. package/bin/skills/clickzetta-index-manager/references/vector-index.md +81 -0
  91. package/bin/skills/clickzetta-java-sdk/LICENSE +16 -0
  92. package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
  93. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
  94. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
  95. package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
  96. package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +16 -0
  97. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +769 -0
  98. package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +5 -0
  99. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +324 -0
  100. package/bin/skills/clickzetta-lakehouse-connect/LICENSE +16 -0
  101. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +218 -0
  102. package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +3 -0
  103. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +35 -0
  104. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +435 -0
  105. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +478 -0
  106. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +225 -0
  107. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +468 -0
  108. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +445 -0
  109. package/bin/skills/clickzetta-manage-comments/LICENSE +16 -0
  110. package/bin/skills/clickzetta-manage-comments/SKILL.md +219 -0
  111. package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +3 -0
  112. package/bin/skills/clickzetta-metadata/LICENSE +16 -0
  113. package/bin/skills/clickzetta-metadata/SKILL.md +502 -0
  114. package/bin/skills/clickzetta-metadata/eval_cases.jsonl +5 -0
  115. package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +276 -0
  116. package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +137 -0
  117. package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +326 -0
  118. package/bin/skills/clickzetta-metadata/references/views-reference.md +271 -0
  119. package/bin/skills/clickzetta-monitoring/LICENSE +16 -0
  120. package/bin/skills/clickzetta-monitoring/SKILL.md +215 -0
  121. package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +5 -0
  122. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +97 -0
  123. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +48 -0
  124. package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +16 -0
  125. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +562 -0
  126. package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +5 -0
  127. package/bin/skills/clickzetta-overview/LICENSE +16 -0
  128. package/bin/skills/clickzetta-overview/SKILL.md +102 -0
  129. package/bin/skills/clickzetta-overview/eval_cases.jsonl +5 -0
  130. package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +79 -0
  131. package/bin/skills/clickzetta-overview/references/object-model.md +311 -0
  132. package/bin/skills/clickzetta-overview/references/studio-modules.md +173 -0
  133. package/bin/skills/clickzetta-pipeline-review/LICENSE +16 -0
  134. package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
  135. package/bin/skills/clickzetta-query-optimizer/LICENSE +16 -0
  136. package/bin/skills/clickzetta-query-optimizer/SKILL.md +156 -0
  137. package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +5 -0
  138. package/bin/skills/clickzetta-query-optimizer/references/explain.md +56 -0
  139. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +78 -0
  140. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +65 -0
  141. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +49 -0
  142. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +42 -0
  143. package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +16 -0
  144. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
  145. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
  146. package/bin/skills/clickzetta-semantic-view/LICENSE +16 -0
  147. package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
  148. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
  149. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
  150. package/bin/skills/clickzetta-spark-flink-connector/LICENSE +16 -0
  151. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
  152. package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
  153. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
  154. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
  155. package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +16 -0
  156. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +485 -0
  157. package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +12 -0
  158. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +166 -0
  159. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +185 -0
  160. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +129 -0
  161. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +222 -0
  162. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +125 -0
  163. package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +16 -0
  164. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
  165. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
  166. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
  167. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
  168. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
  169. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
  170. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
  171. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
  172. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
  173. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
  174. package/bin/skills/clickzetta-studio-task-manager/LICENSE +16 -0
  175. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
  176. package/bin/skills/clickzetta-table-lineage/LICENSE +16 -0
  177. package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
  178. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
  179. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
  180. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
  181. package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
  182. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
  183. package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +16 -0
  184. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +206 -0
  185. package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +5 -0
  186. package/bin/skills/clickzetta-vcluster-manager/LICENSE +16 -0
  187. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +212 -0
  188. package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +5 -0
  189. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +54 -0
  190. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +150 -0
  191. package/bin/skills/clickzetta-volume-manager/LICENSE +16 -0
  192. package/bin/skills/clickzetta-volume-manager/SKILL.md +292 -0
  193. package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +5 -0
  194. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +199 -0
  195. package/bin/skills/clickzetta-zettapark/LICENSE +16 -0
  196. package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
  197. package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
  198. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
  199. package/bin/skills/cz-cli/SKILL.md +313 -0
  200. package/bin/skills/cz-cli/references/profile-setup.md +120 -0
  201. package/package.json +1 -1
@@ -0,0 +1,125 @@
1
+ ---
2
+ name: clickzetta-data-science
3
+ description: |
4
+ 数据科学家使用 ClickZetta Lakehouse 的端到端工作流指南。按工作阶段组织:
5
+ 开发环境准备(Python 3.10+ 检查/搭建)、Jupyter Notebook 配置与使用、
6
+ 项目结构规范(Cookiecutter DS 标准)、数据发现、数据质量评估、
7
+ 数据清洗与整合、数据集构建、EDA 探索分析、
8
+ 特征工程(SQL + ZettaPark)、模型推理上线(BITMAP 用户画像/UDF 批量推理/向量检索)。
9
+ 当用户说"数据科学"、"机器学习"、"特征工程"、"EDA"、"数据探索"、
10
+ "ZettaPark 机器学习"、"Jupyter 连接 Lakehouse"、"notebook"、"ipynb"、
11
+ "jupyter kernel"、"%%sql"、"magic command"、"pandas 读取数据"、
12
+ "数据质量检查"、"数据采样"、"TABLESAMPLE"、"approx_percentile"、
13
+ "BITMAP 用户画像"、"人群圈选"、"批量推理"、"Python 3.10"、
14
+ "scikit-learn"、"项目目录结构"、"config.json"、".env"时触发。
15
+ Keywords: data science, Jupyter, EDA, feature engineering, ML, pandas, notebook
16
+ ---
17
+
18
+ # ClickZetta Lakehouse 数据科学工作流
19
+
20
+ ## 工作流全景
21
+
22
+ ```
23
+ 环境准备 → Jupyter 配置 → 项目结构 → 数据发现 → 数据质量评估 → 数据清洗整合
24
+
25
+ 模型推理上线 ← 特征工程 ← EDA ← 数据集构建
26
+ ```
27
+
28
+ ---
29
+
30
+ ## 硬性前提条件
31
+
32
+ **Python 3.10+**(ZettaPark 硬性要求)。用户环境是 3.9 或更低时,先给升级方案再继续(以下命令以 macOS + Homebrew 为例):
33
+
34
+ ```bash
35
+ brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9
36
+ python -m venv .venv && source .venv/bin/activate
37
+ ```
38
+
39
+ 详细搭建步骤见 [references/setup.md](references/setup.md)。
40
+
41
+ ---
42
+
43
+ ## 项目结构
44
+
45
+ ```
46
+ my-ds-project/
47
+ ├── notebooks/ # 00-env-check.ipynb 必须是第一个
48
+ │ ├── 00-env-check.ipynb
49
+ │ ├── 01-data-discovery.ipynb
50
+ │ ├── 02-data-quality.ipynb
51
+ │ ├── 03-eda.ipynb
52
+ │ ├── 04-feature-engineering.ipynb
53
+ │ └── 05-modeling.ipynb
54
+ ├── src/
55
+ │ ├── config.py # 连接配置,见 references/setup.md
56
+ │ ├── data/
57
+ │ └── features/
58
+ ├── sql/
59
+ ├── data/ # 全部加入 .gitignore
60
+ ├── models/ # 全部加入 .gitignore
61
+ ├── .env # 绝不入 git
62
+ └── .env.example # 入 git
63
+ ```
64
+
65
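+ 环境变量命名规范:`CLICKZETTA_SERVICE` / `CLICKZETTA_INSTANCE` / `CLICKZETTA_WORKSPACE` / `CLICKZETTA_USERNAME` / `CLICKZETTA_PASSWORD` / `CLICKZETTA_VCLUSTER` / `CLICKZETTA_SCHEMA`。前五个为必填;`CLICKZETTA_VCLUSTER` 和 `CLICKZETTA_SCHEMA` 可选,在 references/setup.md 的 `config.py` 模板中默认值分别为 `default_ap` 和 `public`。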
66
+
67
+ ---
68
+
69
+ ## 数据写入规则(禁止事项)
70
+
71
+ | 方式 | 结论 |
72
+ |------|------|
73
+ | `session.create_dataframe(df).write.save_as_table()` | ✅ 推荐 |
74
+ | `cursor` 批量 INSERT(每批 500 行) | ✅ Python 3.9 / ZettaPark 不可用时的 fallback |
75
+ | `df.to_sql(name, conn, ...)` | ❌ 禁止,报 `'list' object has no attribute 'keys'` |
76
+ | SQLAlchemy `clickzetta://...` | ❌ 禁止,dialect 不可靠 |
77
+
78
+ 代码模板见 [references/write-and-infer.md](references/write-and-infer.md)。
79
+
80
+ ---
81
+
82
+ ## 数据查看规则
83
+
84
+ - 快速查看用 `.show()`,不需要 pandas 时不要 `.to_pandas()`
85
+ - 大表操作默认加 `TABLESAMPLE ROW(10)` 采样,避免 OOM
86
+
87
+ ---
88
+
89
+ ## 数据验证规则
90
+
91
+ 导入数据后,**立即用已知基准值验证统计结果**,再进行后续分析。
92
+
93
+ 常见陷阱(以奖牌统计为例):原始数据是运动员/用户级别的明细,团体项目中每个参与者各有一条记录,直接 SUM 会把同一枚奖牌重复计算。正确做法:先 `SELECT DISTINCT event, medal, ...` 去重(每个项目的每枚奖牌只保留一条),再聚合。
94
+
95
+ ---
96
+
97
+ ## ClickZetta SQL 不支持的语法
98
+
99
+ | 不支持 | 替代方案 |
100
+ |--------|---------|
101
+ | `CREATE OR REPLACE TABLE` | `CREATE TABLE IF NOT EXISTS`(普通表不支持 OR REPLACE;注意 `IF NOT EXISTS` 不会覆盖已有表,需要重建时先 `DROP TABLE IF EXISTS` 再 `CREATE TABLE`) |
102
+ | `ARRAY_AGG(col IGNORE NULLS)` | `MAX(col)` 或 `COALESCE()` |
103
+ | `QUALIFY` 子句 | 子查询 + `WHERE rn = 1` |
104
+ | `UNION` / `INTERSECT` / `EXCEPT`(指去重的集合运算;`UNION ALL` 可以使用,references/bitmap-profile.md 的示例中即有用到) | JOIN + 应用层合并 |
105
+ | `BEGIN; COMMIT; ROLLBACK;` | 用 MERGE 实现原子操作 |
106
+ | `NOW()` | `CURRENT_TIMESTAMP()` |
107
+
108
+ 遇到其他语法报错,加载 `clickzetta-sql-syntax-guide` skill。
109
+
110
+ ---
111
+
112
+ ## Schema 上下文
113
+
114
+ Python 代码中 SQL 语句始终使用完整表名 `schema.table`,不依赖当前 schema 上下文。
115
+
116
+ ---
117
+
118
+ ## 参考文档
119
+
120
+ - [环境搭建与项目配置](references/setup.md) — 环境搭建、config.py 模板、Jupyter 配置
121
+ - [数据发现/质量/清洗/EDA 示例](references/data-patterns.md)
122
+ - [数据写入/特征工程/模型推理示例](references/write-and-infer.md)
123
+ - [ZettaPark API](references/zettapark-api.md)
124
+ - [统计分析函数](references/stats-functions.md)
125
+ - [BITMAP 用户画像](references/bitmap-profile.md)
@@ -0,0 +1,12 @@
1
+ {"case_id":"001","type":"should_call","user_input":"怎么用 Jupyter Notebook 连接 Lakehouse","expected_skill":"clickzetta-data-science","expected_output_contains":["Jupyter"]}
2
+ {"case_id":"002","type":"should_call","user_input":"数据科学项目目录结构怎么组织","expected_skill":"clickzetta-data-science","expected_output_contains":["项目结构"]}
3
+ {"case_id":"003","type":"should_call","user_input":"怎么做 EDA 探索性数据分析","expected_skill":"clickzetta-data-science","expected_output_contains":["EDA"]}
4
+ {"case_id":"004","type":"should_call","user_input":"TABLESAMPLE 怎么做数据采样","expected_skill":"clickzetta-data-science","expected_output_contains":["TABLESAMPLE"]}
5
+ {"case_id":"005","type":"should_call","user_input":"怎么用 ZettaPark 做特征工程","expected_skill":"clickzetta-data-science","expected_output_contains":["特征工程"]}
6
+ {"case_id":"006","type":"should_call","user_input":"BITMAP 用户画像怎么做人群圈选","expected_skill":"clickzetta-data-science","expected_output_contains":["BITMAP"]}
7
+ {"case_id":"007","type":"should_call","user_input":"%%sql magic command 怎么在 notebook 里用","expected_skill":"clickzetta-data-science","expected_output_contains":["%%sql"]}
8
+ {"case_id":"008","type":"should_not_call","user_input":"TensorFlow 怎么训练模型","forbidden_skill":"clickzetta-data-science"}
9
+ {"case_id":"009","type":"should_not_call","user_input":"帮我写一个 ETL 管道","forbidden_skill":"clickzetta-data-science"}
10
+ {"case_id":"010","type":"should_not_call","user_input":"怎么创建 VCluster","forbidden_skill":"clickzetta-data-science"}
11
+ {"case_id":"011","type":"should_not_call","user_input":"怎么做数据分享","forbidden_skill":"clickzetta-data-science"}
12
+ {"case_id":"012","type":"should_not_call","user_input":"scikit-learn 怎么安装","forbidden_skill":"clickzetta-data-science"}
@@ -0,0 +1,146 @@
1
+ # BITMAP 用户画像参考
2
+
3
+ > 来源:https://www.yunqi.tech/documents/bitmap-type
4
+
5
+ BITMAP 是 ClickZetta 中用于高效存储和处理整数集合的数据类型,基于 Roaring Bitmap 压缩算法,特别适合用户画像、人群圈选、UV 统计等数据科学场景。
6
+
7
+ ---
8
+
9
+ ## 核心限制
10
+
11
+ - 支持 **64 位无符号整数**(0 到 2^64-1)
12
+ - **不支持**比较操作(<、>、=)
13
+ - **不支持** ORDER BY、GROUP BY、DISTINCT
14
+ - **不能**作为 PRIMARY KEY、PARTITION KEY、CLUSTER KEY
15
+
16
+ ---
17
+
18
+ ## 构建用户标签 BITMAP
19
+
20
+ ```sql
21
+ -- 方式 1:从行数据聚合构建(最常用)
22
+ CREATE TABLE ds_workspace.user_tags AS
23
+ SELECT
24
+ tag_name,
25
+ group_bitmap_state(user_id) AS user_bitmap
26
+ FROM (
27
+ -- 高消费用户
28
+ SELECT 'high_value' AS tag_name, user_id
29
+ FROM my_schema.orders
30
+ WHERE total_amount_30d > 1000
31
+ UNION ALL
32
+ -- 近30天活跃用户
33
+ SELECT 'active_30d' AS tag_name, user_id
34
+ FROM my_schema.events
35
+ WHERE event_date >= CURRENT_DATE - INTERVAL 30 DAY
36
+ UNION ALL
37
+ -- 已流失用户(90天未活跃)
38
+ SELECT 'churned' AS tag_name, user_id
39
+ FROM my_schema.users
40
+ WHERE last_active_date < CURRENT_DATE - INTERVAL 90 DAY
41
+ ) t
42
+ GROUP BY tag_name;
43
+
44
+ -- 方式 2:从数组构建
45
+ INSERT INTO ds_workspace.user_tags VALUES
46
+ ('vip', bitmap_build(ARRAY(1001, 1002, 1003, 1004)));
47
+ ```
48
+
49
+ ---
50
+
51
+ ## 人群圈选操作
52
+
53
+ ```sql
54
+ -- 交集:同时满足多个标签(AND)
55
+ SELECT bitmap_count(
56
+ bitmap_and(
57
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
58
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'active_30d')
59
+ )
60
+ ) AS target_count;
61
+
62
+ -- 并集:满足任一标签(OR)
63
+ SELECT bitmap_count(
64
+ bitmap_or(
65
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
66
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'active_30d')
67
+ )
68
+ ) AS reach_count;
69
+
70
+ -- 差集:排除某类用户(ANDNOT)
71
+ SELECT bitmap_count(
72
+ bitmap_andnot(
73
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
74
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'churned')
75
+ )
76
+ ) AS targetable_count;
77
+
78
+ -- 获取目标用户 ID 列表
79
+ SELECT bitmap_to_array(
80
+ bitmap_andnot(
81
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
82
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'churned')
83
+ )
84
+ ) AS target_user_ids;
85
+ ```
86
+
87
+ ---
88
+
89
+ ## UV 统计(去重计数)
90
+
91
+ ```sql
92
+ -- 日活跃用户数(DAU)
93
+ SELECT
94
+ event_date,
95
+ bitmap_count(group_bitmap_state(user_id)) AS dau
96
+ FROM my_schema.events
97
+ GROUP BY event_date
98
+ ORDER BY event_date;
99
+
100
+ -- 周活跃用户数(WAU)—— 跨天去重
101
+ SELECT
102
+ DATE_TRUNC('week', event_date) AS week_start,
103
+ bitmap_count(
104
+ bitmap_or_agg(daily_bitmap) -- 合并多天 bitmap
105
+ ) AS wau
106
+ FROM (
107
+ SELECT event_date,
108
+ group_bitmap_state(user_id) AS daily_bitmap
109
+ FROM my_schema.events
110
+ GROUP BY event_date
111
+ ) t
112
+ GROUP BY 1;
113
+
114
+ -- 用户留存分析(新用户 vs 回访用户)
115
+ SELECT
116
+ bitmap_count(
117
+ bitmap_and(new_users.user_bitmap, return_users.user_bitmap)
118
+ ) AS retained_users,
119
+ bitmap_count(
120
+ bitmap_andnot(new_users.user_bitmap, return_users.user_bitmap)
121
+ ) AS lost_users
122
+ FROM
123
+ (SELECT group_bitmap_state(user_id) AS user_bitmap
124
+ FROM my_schema.events WHERE event_date = '2024-01-01') AS new_users,
125
+ (SELECT group_bitmap_state(user_id) AS user_bitmap
126
+ FROM my_schema.events WHERE event_date = '2024-01-08') AS return_users;
127
+ ```
128
+
129
+ ---
130
+
131
+ ## 常用 BITMAP 函数速查
132
+
133
+ | 函数 | 说明 | 示例 |
134
+ |---|---|---|
135
+ | `group_bitmap_state(col)` | 聚合函数:将分组内的整数值聚合构建为 BITMAP | `SELECT tag_name, group_bitmap_state(user_id) ... GROUP BY tag_name` |
136
+ | `bitmap_count(bm)` | 计算元素个数(UV) | `bitmap_count(user_bm)` |
137
+ | `bitmap_and(a, b)` | 交集 | 同时满足 A 和 B |
138
+ | `bitmap_or(a, b)` | 并集 | 满足 A 或 B |
139
+ | `bitmap_andnot(a, b)` | 差集 | 在 A 中但不在 B 中 |
140
+ | `bitmap_xor(a, b)` | 异或(对称差集) | 只在 A 或只在 B 中出现的元素 |
141
+ | `bitmap_to_array(bm)` | 转为整数数组 | 获取用户 ID 列表 |
142
+ | `bitmap_build(arr)` | 从数组构建 | `bitmap_build(ARRAY(1,2,3))` |
143
+ | `bitmap_contains(bm, val)` | 检查是否包含某值 | `bitmap_contains(bm, user_id)` |
144
+ | `bitmap_min(bm)` | 最小元素 | — |
145
+ | `bitmap_max(bm)` | 最大元素 | — |
146
+ | `to_bitmap(val)` | 单值转 BITMAP | `to_bitmap(user_id)` |
@@ -0,0 +1,110 @@
1
+ # 数据发现、质量评估、清洗、EDA 示例
2
+
3
+ ## 数据发现
4
+
5
+ ```python
6
+ from src.config import get_session
7
+ session = get_session()
8
+
9
+ session.sql("SHOW SCHEMAS").show()
10
+ session.sql("SHOW TABLES IN my_schema").show()
11
+ session.sql("DESC EXTENDED my_schema.orders").show()
12
+ session.sql("""
13
+ SELECT table_name, row_count,
14
+ ROUND(bytes/1024.0/1024/1024, 2) AS size_gb
15
+ FROM information_schema.tables
16
+ WHERE table_schema = 'my_schema'
17
+ ORDER BY bytes DESC
18
+ """).show()
19
+ ```
20
+
21
+ ---
22
+
23
+ ## 数据质量评估
24
+
25
+ ```sql
26
+ -- 基础统计
27
+ SELECT
28
+ COUNT(*) AS total_rows,
29
+ COUNT(DISTINCT user_id) AS unique_users,
30
+ MIN(event_time) AS earliest, MAX(event_time) AS latest,
31
+ ROUND(100.0 * SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS user_id_null_pct,
32
+ ROUND(100.0 * SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS amount_null_pct
33
+ FROM my_schema.orders;
34
+
35
+ -- 主键重复检查
36
+ SELECT order_id, COUNT(*) AS cnt
37
+ FROM my_schema.orders GROUP BY order_id HAVING cnt > 1 LIMIT 10;
38
+
39
+ -- 数值分布(大表高效)
40
+ SELECT
41
+ approx_percentile(amount, 0.25) AS p25,
42
+ approx_percentile(amount, 0.50) AS median,
43
+ approx_percentile(amount, 0.75) AS p75,
44
+ approx_percentile(amount, 0.99) AS p99,
45
+ MIN(amount) AS min_val, MAX(amount) AS max_val
46
+ FROM my_schema.orders;
47
+
48
+ -- 高频值 TOP-K
49
+ SELECT approx_top_k(status, 10) AS top_statuses FROM my_schema.orders;
50
+
51
+ -- 近似 UV
52
+ SELECT approx_count_distinct(user_id) AS approx_uv FROM my_schema.events;
53
+ ```
54
+
55
+ ---
56
+
57
+ ## 数据清洗
58
+
59
+ ```sql
60
+ -- 去重(保留最新一条)
61
+ SELECT * FROM (
62
+ SELECT *, ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY update_time DESC) AS rn
63
+ FROM my_schema.orders_raw
64
+ ) WHERE rn = 1;
65
+
66
+ -- 缺失值处理 + 类型转换
67
+ SELECT
68
+ order_id, user_id,
69
+ COALESCE(amount, 0.0) AS amount,
70
+ COALESCE(status, 'UNKNOWN') AS status,
71
+ CAST(order_date AS DATE) AS order_date
72
+ FROM my_schema.orders_raw
73
+ WHERE user_id IS NOT NULL;
74
+
75
+ -- 多表整合
76
+ SELECT o.order_id, o.user_id, o.amount, o.order_date,
77
+ u.age_group, u.city, p.category, p.brand
78
+ FROM my_schema.orders o
79
+ LEFT JOIN my_schema.users u ON o.user_id = u.user_id
80
+ LEFT JOIN my_schema.products p ON o.product_id = p.product_id;
81
+ ```
82
+
83
+ ---
84
+
85
+ ## EDA
86
+
87
+ ```python
88
+ # 采样策略
89
+ df_quick = session.sql("""
90
+ SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000
91
+ """).to_pandas() # SYSTEM:文件级,极快,适合 >100万行预览
92
+
93
+ df_ml = session.sql("""
94
+ SELECT * FROM my_schema.events TABLESAMPLE ROW (10)
95
+ """).to_pandas() # ROW:行级精确,适合 ML 训练集
96
+
97
+ # 时序分析
98
+ session.sql("""
99
+ SELECT
100
+ DATE_TRUNC('day', order_time) AS dt,
101
+ COUNT(*) AS daily_orders,
102
+ SUM(amount) AS daily_revenue,
103
+ AVG(SUM(amount)) OVER (
104
+ ORDER BY DATE_TRUNC('day', order_time)
105
+ ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
106
+ ) AS revenue_7d_ma
107
+ FROM my_schema.orders
108
+ GROUP BY 1 ORDER BY 1
109
+ """).to_pandas().plot(x='dt', y=['daily_revenue', 'revenue_7d_ma'])
110
+ ```
@@ -0,0 +1,160 @@
1
+ # 环境搭建与项目配置
2
+
3
+ ## 环境搭建
4
+
5
+ ```bash
6
+ # 方式 1:venv(推荐)
7
+ python3.12 -m venv .venv
8
+ source .venv/bin/activate # macOS/Linux
9
+ pip install clickzetta_zettapark_python clickzetta-connector-python \
10
+ python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
11
+ -i https://pypi.tuna.tsinghua.edu.cn/simple
12
+
13
+ # 方式 2:pyenv(需要切换 Python 版本时)
14
+ pyenv install 3.12.9 && pyenv local 3.12.9
15
+ python -m venv .venv && source .venv/bin/activate
16
+ pip install clickzetta_zettapark_python clickzetta-connector-python \
17
+ python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
18
+ -i https://pypi.tuna.tsinghua.edu.cn/simple
19
+
20
+ # 方式 3:conda
21
+ conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
22
+ pip install clickzetta_zettapark_python clickzetta-connector-python \
23
+ python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
24
+ -i https://pypi.tuna.tsinghua.edu.cn/simple
25
+ ```
26
+
27
+ | 问题 | 修复 |
28
+ |------|------|
29
+ | Python 3.8/3.9 | `pyenv install 3.12.9` 或 `python3.12 -m venv .venv` |
30
+ | `pyarrow` 版本冲突 | `pip install pyarrow==14.0.0` |
31
+ | M1/M2 Mac 报错 | `pip install --no-binary :all: <报错的包名>`(强制从源码编译)或改用 conda |
32
+ | 连接超时 | VCluster 未启动,在 Studio 中手动启动 |
33
+
34
+ ---
35
+
36
+ ## Jupyter Kernel 配置
37
+
38
+ ```bash
39
+ # 注册 venv 为 Jupyter kernel(关键步骤,否则 notebook 用系统 Python)
40
+ source .venv/bin/activate
41
+ pip install ipykernel jupyterlab
42
+ python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
43
+
44
+ # 启动 JupyterLab
45
+ jupyter lab --port=8888
46
+ ```
47
+
48
+ VS Code / Cursor:打开 `.ipynb` → 右上角 "Select Kernel" → 选 "Python (lakehouse-ds)"
49
+
50
+ | 问题 | 修复 |
51
+ |------|------|
52
+ | `ModuleNotFoundError: No module named 'clickzetta'` | kernel 未选对,切换到已注册的 venv kernel(即 "Python (lakehouse-ds)") |
53
+ | `.env` 读不到 | `load_dotenv(dotenv_path='../.env')` 指定路径 |
54
+ | `to_pandas()` OOM | 加 `TABLESAMPLE ROW(1)` 或 `LIMIT` |
55
+ | 图表不显示 | notebook 开头加 `%matplotlib inline` |
56
+
57
+ ---
58
+
59
+ ## src/config.py 模板
60
+
61
+ ```python
62
+ import os, sys
63
+ from pathlib import Path
64
+ from dotenv import load_dotenv
65
+ from clickzetta.zettapark.session import Session
66
+ import clickzetta
67
+
68
+ # 多位置查找 .env
69
+ for _p in [
70
+ Path(__file__).parent.parent / ".env",
71
+ Path.home() / ".config" / "kilo" / ".env",
72
+ Path.home() / ".czcode" / ".env",
73
+ Path.home() / ".env",
74
+ ]:
75
+ if _p.exists():
76
+ load_dotenv(dotenv_path=_p)
77
+ break
78
+
79
+ def check_environment():
80
+ """在 00-env-check.ipynb 里调用,打印环境诊断。"""
81
+ ver = sys.version_info
82
+ if ver < (3, 10):
83
+ raise RuntimeError(
84
+ f"Python {ver.major}.{ver.minor} 不满足要求。ZettaPark 需要 Python 3.10+。\n"
85
+ "升级:brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
86
+ )
87
+ print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
88
+ for pkg, mod in [
89
+ ("clickzetta_zettapark_python", "clickzetta.zettapark"),
90
+ ("clickzetta-connector-python", "clickzetta"),
91
+ ("pandas", "pandas"), ("python-dotenv", "dotenv"),
92
+ ]:
93
+ try:
94
+ m = __import__(mod.split(".")[0])
95
+ print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
96
+ except ImportError:
97
+ print(f"❌ {pkg}: 未安装 → pip install {pkg}")
98
+ try:
99
+ s = get_session()
100
+ print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
101
+ except Exception as e:
102
+ print(f"❌ Lakehouse 连接失败: {e}")
103
+
104
+ def get_session() -> Session:
105
+ return Session.builder.configs({
106
+ "service": os.environ["CLICKZETTA_SERVICE"],
107
+ "instance": os.environ["CLICKZETTA_INSTANCE"],
108
+ "workspace": os.environ["CLICKZETTA_WORKSPACE"],
109
+ "username": os.environ["CLICKZETTA_USERNAME"],
110
+ "password": os.environ["CLICKZETTA_PASSWORD"],
111
+ "vcluster": os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
112
+ "schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
113
+ }).create()
114
+
115
+ def get_connector_connection():
116
+ """仅用于 pd.read_sql。禁止用于 df.to_sql()。"""
117
+ return clickzetta.connect(
118
+ service=os.environ["CLICKZETTA_SERVICE"],
119
+ instance=os.environ["CLICKZETTA_INSTANCE"],
120
+ workspace=os.environ["CLICKZETTA_WORKSPACE"],
121
+ username=os.environ["CLICKZETTA_USERNAME"],
122
+ password=os.environ["CLICKZETTA_PASSWORD"],
123
+ vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
124
+ schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
125
+ )
126
+ ```
127
+
128
+ ---
129
+
130
+ ## .env 模板
131
+
132
+ ```bash
133
+ CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com
134
+ CLICKZETTA_INSTANCE=<instance-id>
135
+ CLICKZETTA_WORKSPACE=<workspace>
136
+ CLICKZETTA_USERNAME=<username>
137
+ CLICKZETTA_PASSWORD=<password>
138
+ CLICKZETTA_VCLUSTER=default_ap
139
+ CLICKZETTA_SCHEMA=ds_workspace
140
+ ```
141
+
142
+ ## pyproject.toml
143
+
144
+ ```toml
145
+ [project]
146
+ name = "my-lakehouse-ds-project"
147
+ requires-python = ">=3.10"
148
+ dependencies = [
149
+ "clickzetta_zettapark_python>=0.1.2",
150
+ "clickzetta-connector-python>=1.0.0",
151
+ "python-dotenv>=1.0.0",
152
+ "pandas>=2.0.0",
153
+ "numpy>=1.24.0",
154
+ "scikit-learn>=1.3.0",
155
+ "pyarrow>=14.0.0",
156
+ "jupyterlab>=4.0.0",
157
+ "matplotlib>=3.7.0",
158
+ "seaborn>=0.12.0",
159
+ ]
160
+ ```