@clickzetta/cz-cli-darwin-x64 0.3.80 → 0.3.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/bin/cz-cli +0 -0
  2. package/package.json +1 -1
  3. package/bin/skills/clickzetta-access-control/LICENSE +0 -16
  4. package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
  5. package/bin/skills/clickzetta-access-control/eval_cases.jsonl +0 -3
  6. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
  7. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
  8. package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
  9. package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
  10. package/bin/skills/clickzetta-app-python-sdk/LICENSE +0 -16
  11. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
  12. package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +0 -12
  13. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
  14. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
  15. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
  16. package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +0 -16
  17. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -227
  18. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +0 -5
  19. package/bin/skills/clickzetta-bi-connect/LICENSE +0 -16
  20. package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
  21. package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +0 -5
  22. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
  23. package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +0 -16
  24. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -633
  25. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +0 -5
  26. package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +0 -16
  27. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -237
  28. package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +0 -5
  29. package/bin/skills/clickzetta-data-retention/LICENSE +0 -16
  30. package/bin/skills/clickzetta-data-retention/SKILL.md +0 -160
  31. package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +0 -5
  32. package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +0 -175
  33. package/bin/skills/clickzetta-data-science/LICENSE +0 -16
  34. package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
  35. package/bin/skills/clickzetta-data-science/eval_cases.jsonl +0 -12
  36. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
  37. package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
  38. package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
  39. package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
  40. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
  41. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
  42. package/bin/skills/clickzetta-data-sharing/LICENSE +0 -16
  43. package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
  44. package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +0 -3
  45. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
  46. package/bin/skills/clickzetta-dba-guide/LICENSE +0 -16
  47. package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -542
  48. package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +0 -3
  49. package/bin/skills/clickzetta-dw-modeling/LICENSE +0 -16
  50. package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -351
  51. package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +0 -4
  52. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
  53. package/bin/skills/clickzetta-dynamic-table/LICENSE +0 -16
  54. package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -230
  55. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -253
  56. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
  58. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
  59. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  60. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
  61. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  62. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -427
  63. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  64. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
  65. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
  66. package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +0 -5
  67. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +0 -27
  68. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +0 -118
  69. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +0 -225
  70. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +0 -182
  71. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +0 -98
  72. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +0 -76
  73. package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +0 -109
  74. package/bin/skills/clickzetta-external-catalog/LICENSE +0 -16
  75. package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -123
  76. package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +0 -5
  77. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
  78. package/bin/skills/clickzetta-external-function/LICENSE +0 -16
  79. package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
  80. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +0 -4
  81. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
  82. package/bin/skills/clickzetta-file-import-pipeline/LICENSE +0 -16
  83. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -190
  84. package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +0 -5
  85. package/bin/skills/clickzetta-index-manager/LICENSE +0 -16
  86. package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
  87. package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +0 -5
  88. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
  89. package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
  90. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
  91. package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
  92. package/bin/skills/clickzetta-java-sdk/LICENSE +0 -16
  93. package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
  94. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +0 -12
  95. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
  96. package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
  97. package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +0 -16
  98. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -769
  99. package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +0 -5
  100. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
  101. package/bin/skills/clickzetta-lakehouse-connect/LICENSE +0 -16
  102. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
  103. package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +0 -3
  104. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
  105. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
  106. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
  107. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
  108. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
  109. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
  110. package/bin/skills/clickzetta-manage-comments/LICENSE +0 -16
  111. package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
  112. package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +0 -3
  113. package/bin/skills/clickzetta-metadata/LICENSE +0 -16
  114. package/bin/skills/clickzetta-metadata/SKILL.md +0 -502
  115. package/bin/skills/clickzetta-metadata/eval_cases.jsonl +0 -5
  116. package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +0 -276
  117. package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +0 -137
  118. package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +0 -326
  119. package/bin/skills/clickzetta-metadata/references/views-reference.md +0 -271
  120. package/bin/skills/clickzetta-monitoring/LICENSE +0 -16
  121. package/bin/skills/clickzetta-monitoring/SKILL.md +0 -215
  122. package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +0 -5
  123. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
  124. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
  125. package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +0 -16
  126. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -562
  127. package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +0 -5
  128. package/bin/skills/clickzetta-overview/LICENSE +0 -16
  129. package/bin/skills/clickzetta-overview/SKILL.md +0 -102
  130. package/bin/skills/clickzetta-overview/eval_cases.jsonl +0 -5
  131. package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +0 -79
  132. package/bin/skills/clickzetta-overview/references/object-model.md +0 -311
  133. package/bin/skills/clickzetta-overview/references/studio-modules.md +0 -173
  134. package/bin/skills/clickzetta-pipeline-review/LICENSE +0 -16
  135. package/bin/skills/clickzetta-pipeline-review/SKILL.md +0 -377
  136. package/bin/skills/clickzetta-query-optimizer/LICENSE +0 -16
  137. package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
  138. package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +0 -5
  139. package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
  140. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
  141. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
  142. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
  143. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
  144. package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +0 -16
  145. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -323
  146. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +0 -5
  147. package/bin/skills/clickzetta-semantic-view/LICENSE +0 -16
  148. package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
  149. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +0 -12
  150. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
  151. package/bin/skills/clickzetta-spark-flink-connector/LICENSE +0 -16
  152. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
  153. package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +0 -5
  154. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
  155. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
  156. package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +0 -16
  157. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -485
  158. package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +0 -12
  159. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
  160. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
  161. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
  162. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
  163. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
  164. package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +0 -16
  165. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  166. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  167. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  168. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  169. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
  170. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  171. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  172. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
  173. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  174. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  175. package/bin/skills/clickzetta-studio-task-manager/LICENSE +0 -16
  176. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +0 -652
  177. package/bin/skills/clickzetta-table-lineage/LICENSE +0 -16
  178. package/bin/skills/clickzetta-table-lineage/SKILL.md +0 -90
  179. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +0 -1
  180. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +0 -14
  181. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +0 -38
  182. package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +0 -562
  183. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +0 -25
  184. package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +0 -16
  185. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
  186. package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +0 -5
  187. package/bin/skills/clickzetta-vcluster-manager/LICENSE +0 -16
  188. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
  189. package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +0 -5
  190. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
  191. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
  192. package/bin/skills/clickzetta-volume-manager/LICENSE +0 -16
  193. package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
  194. package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +0 -5
  195. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
  196. package/bin/skills/clickzetta-zettapark/LICENSE +0 -16
  197. package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
  198. package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +0 -12
  199. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
  200. package/bin/skills/cz-cli/SKILL.md +0 -311
  201. package/bin/skills/cz-cli/references/profile-setup.md +0 -120
@@ -1,248 +0,0 @@
1
- ---
2
- name: clickzetta-zettapark
3
- description: |
4
- 使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
5
- DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
6
- 覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
7
- 结果收集(collect/to_pandas/show)、写入表(save_as_table)、
8
- 文件操作(PUT/GET)、执行 SQL 等完整工作流。
9
- 当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
10
- "save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
11
- "Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
12
- "clickzetta_zettapark_python"时触发。
13
- Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
14
- ---
15
-
16
- # ClickZetta ZettaPark
17
-
18
- ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
19
-
20
- 阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
21
-
22
- ## 安装
23
-
24
- > ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
25
-
26
- ```bash
27
- # 方式 1:venv(Python 内置,推荐)
28
- python3.12 -m venv .venv
29
- source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
30
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
31
-
32
- # 方式 2:pyenv(需要切换 Python 版本时)
33
- pyenv install 3.12.9 && pyenv local 3.12.9
34
- python -m venv .venv && source .venv/bin/activate
35
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
36
-
37
- # 方式 3:conda(数据科学环境)
38
- conda create -n lakehouse python=3.12 -y && conda activate lakehouse
39
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
40
- ```
41
-
42
- ---
43
-
44
- ## 创建会话
45
-
46
- ```python
47
- from clickzetta.zettapark.session import Session
48
-
49
- connection_parameters = {
50
- "username": "your_username",
51
- "password": "your_password",
52
- "service": "cn-shanghai-alicloud.api.clickzetta.com",
53
- "instance": "your_instance_id",
54
- "workspace": "your_workspace",
55
- "schema": "public",
56
- "vcluster": "default_ap",
57
- }
58
-
59
- session = Session.builder.configs(connection_parameters).create()
60
-
61
- # 验证连接
62
- session.sql("SELECT current_user(), current_workspace()").show()
63
- ```
64
-
65
- ---
66
-
67
- ## 核心工作流
68
-
69
- ### 读取数据
70
-
71
- ```python
72
- from clickzetta.zettapark import functions as F
73
-
74
- # 从表读取
75
- df = session.table("orders")
76
- df = session.table("my_schema.orders")
77
-
78
- # 从 SQL 读取
79
- df = session.sql("SELECT * FROM orders WHERE year = 2024")
80
-
81
- # 从 Python 数据创建
82
- df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
83
- schema=["id", "name", "amount"])
84
- ```
85
-
86
- ### 转换数据
87
-
88
- ```python
89
- # 过滤、选择、新增列
90
- result = (
91
- session.table("orders")
92
- .filter(F.col("status") == "completed")
93
- .select("order_id", "customer_id", "amount")
94
- .with_column("tax", F.col("amount") * 0.1)
95
- .sort(F.col("amount").desc())
96
- .limit(100)
97
- )
98
- ```
99
-
100
- ### 聚合
101
-
102
- ```python
103
- summary = (
104
- session.table("orders")
105
- .group_by("category")
106
- .agg(
107
- F.sum("amount").as_("total"),
108
- F.count("*").as_("cnt"),
109
- F.avg("amount").as_("avg_amount"),
110
- )
111
- )
112
- summary.show()
113
- ```
114
-
115
- ### JOIN
116
-
117
- ```python
118
- orders = session.table("orders")
119
- customers = session.table("customers")
120
-
121
- result = orders.join(
122
- customers,
123
- orders["customer_id"] == customers["id"],
124
- "left"
125
- ).select(
126
- orders["order_id"],
127
- customers["name"],
128
- orders["amount"]
129
- )
130
- ```
131
-
132
- ### 写入数据
133
-
134
- ```python
135
- # 追加到已有表
136
- df.write.save_as_table("result_table", mode="append")
137
-
138
- # 覆盖写入(自动建表)
139
- df.write.save_as_table("result_table", mode="overwrite")
140
- ```
141
-
142
- ### 获取结果
143
-
144
- ```python
145
- # 打印预览
146
- df.show(20)
147
-
148
- # 收集为 Row 列表
149
- rows = df.collect()
150
- for row in rows:
151
- print(row["id"], row["name"])
152
-
153
- # 转为 Pandas DataFrame(小数据量)
154
- pandas_df = df.to_pandas()
155
-
156
- # 获取行数
157
- print(df.count())
158
- ```
159
-
160
- ---
161
-
162
- ## 典型场景
163
-
164
- ### 场景 1:ETL 数据处理
165
-
166
- ```python
167
- from clickzetta.zettapark.session import Session
168
- from clickzetta.zettapark import functions as F
169
-
170
- session = Session.builder.configs(config).create()  # config 即上文「创建会话」中的连接参数字典(connection_parameters)
171
-
172
- # 读取原始数据
173
- raw = session.table("bronze.raw_orders")
174
-
175
- # 清洗转换
176
- cleaned = (
177
- raw
178
- .filter(F.isnotnull(F.col("order_id")))
179
- .filter(F.col("amount") > 0)
180
- .with_column("order_date", F.col("created_at").cast("DATE"))
181
- .with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
182
- .select("order_id", "customer_id", "amount", "order_date", "year_month")
183
- )
184
-
185
- # 写入 Silver 层
186
- cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
187
-
188
- session.close()
189
- ```
190
-
191
- ### 场景 2:特征工程(机器学习)
192
-
193
- ```python
194
- from clickzetta.zettapark import functions as F
195
-
196
- customer = session.table("clickzetta_sample_data.tpch_100g.customer")
197
- orders = session.table("clickzetta_sample_data.tpch_100g.orders")
198
-
199
- # 构建客户消费特征
200
- customer_features = (
201
- orders
202
- .group_by("o_custkey")
203
- .agg(
204
- F.sum("o_totalprice").as_("total_spend"),
205
- F.count("*").as_("order_count"),
206
- F.avg("o_totalprice").as_("avg_order_value"),
207
- F.max("o_orderdate").as_("last_order_date"),
208
- )
209
- .join(customer, orders["o_custkey"] == customer["c_custkey"])
210
- .select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
211
- )
212
-
213
- customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
214
- ```
215
-
216
- ### 场景 3:从本地文件导入
217
-
218
- ```python
219
- import json
220
- import gzip
221
- from clickzetta.zettapark.session import Session
222
-
223
- session = Session.builder.configs(config).create()  # config 即上文「创建会话」中的连接参数字典(connection_parameters)
224
-
225
- # 读取本地 JSON 数据
226
- data = []
227
- with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
228
- for line in f:
229
- if line.strip():
230
- data.append(json.loads(line))
231
-
232
- # 创建 DataFrame 并写入
233
- df = session.create_dataframe(data)
234
- df.write.save_as_table("my_table", mode="overwrite")
235
-
236
- session.close()
237
- ```
238
-
239
- ---
240
-
241
- ## 常见问题
242
-
243
- | 问题 | 原因 | 解决方案 |
244
- |---|---|---|
245
- | `collect()` 超时 | 数据量过大或集群规格不足 | 增大 `sdk.job.timeout`,或先 `limit()` 测试 |
246
- | `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
247
- | 列名冲突(JOIN 后) | 两表有同名列 | 用 `df_left["col"]` 明确指定来源 |
248
- | `save_as_table` 报错 | 表已存在且 mode 不对 | 使用 `mode="overwrite"` 或 `mode="append"` |
@@ -1,12 +0,0 @@
1
- {"case_id":"001","type":"should_call","user_input":"用 ZettaPark 读取 orders 表并过滤 amount > 100","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.table","filter"]}
2
- {"case_id":"002","type":"should_call","user_input":"ZettaPark 怎么安装?需要什么 Python 版本?","expected_skill":"clickzetta-zettapark","expected_output_contains":["pip install","3.12"]}
3
- {"case_id":"003","type":"should_call","user_input":"怎么用 DataFrame API 做 group by 聚合","expected_skill":"clickzetta-zettapark","expected_output_contains":["group_by","agg"]}
4
- {"case_id":"004","type":"should_call","user_input":"save_as_table 怎么用?支持哪些写入模式?","expected_skill":"clickzetta-zettapark","expected_output_contains":["save_as_table","overwrite","append"]}
5
- {"case_id":"005","type":"should_call","user_input":"ZettaPark 怎么把结果转成 pandas DataFrame","expected_skill":"clickzetta-zettapark","expected_output_contains":["to_pandas"]}
6
- {"case_id":"006","type":"should_call","user_input":"用 session.sql 执行一段 SQL 查询","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.sql"]}
7
- {"case_id":"007","type":"should_call","user_input":"ZettaPark 怎么 join 两张表","expected_skill":"clickzetta-zettapark","expected_output_contains":["join"]}
8
- {"case_id":"008","type":"should_not_call","user_input":"帮我写一个 Flask Web 应用","forbidden_skill":"clickzetta-zettapark"}
9
- {"case_id":"009","type":"should_not_call","user_input":"pandas 怎么读取 CSV 文件","forbidden_skill":"clickzetta-zettapark"}
10
- {"case_id":"010","type":"should_not_call","user_input":"怎么用 JDBC 连接 Lakehouse","forbidden_skill":"clickzetta-zettapark"}
11
- {"case_id":"011","type":"should_not_call","user_input":"帮我创建一个 VCluster","forbidden_skill":"clickzetta-zettapark"}
12
- {"case_id":"012","type":"should_not_call","user_input":"Spark DataFrame 怎么用","forbidden_skill":"clickzetta-zettapark"}
@@ -1,283 +0,0 @@
1
- # ZettaPark 快速参考
2
-
3
- > 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
4
-
5
- ## 安装
6
-
7
- ```bash
8
- pip install clickzetta_zettapark_python -U -i https://pypi.tuna.tsinghua.edu.cn/simple
9
- ```
10
-
11
- ---
12
-
13
- ## 创建会话
14
-
15
- ```python
16
- from clickzetta.zettapark.session import Session
17
-
18
- connection_parameters = {
19
- "username": "your_username",
20
- "password": "your_password",
21
- "service": "cn-shanghai-alicloud.api.clickzetta.com",
22
- "instance": "your_instance_id",
23
- "workspace": "your_workspace",
24
- "schema": "public",
25
- "vcluster": "default_ap",
26
- }
27
-
28
- session = Session.builder.configs(connection_parameters).create()
29
- ```
30
-
31
- 带 hints(超时、query_tag 等):
32
-
33
- ```python
34
- connection_parameters = {
35
- "username": "your_username",
36
- "password": "your_password",
37
- "service": "cn-shanghai-alicloud.api.clickzetta.com",
38
- "instance": "your_instance_id",
39
- "workspace": "your_workspace",
40
- "schema": "public",
41
- "vcluster": "default_ap",
42
- "hints": {
43
- "sdk.job.timeout": 300,
44
- "query_tag": "my_zettapark_app",
45
- }
46
- }
47
-
48
- session = Session.builder.configs(connection_parameters).create()
49
- ```
50
-
51
- 从 JSON 配置文件读取:
52
-
53
- ```python
54
- import json
55
- with open('config.json', 'r') as f:
56
- config = json.load(f)
57
- session = Session.builder.configs(config).create()
58
- ```
59
-
60
- 验证连接:
61
-
62
- ```python
63
- session.sql("SELECT current_user(), current_workspace(), current_vcluster()").show()
64
- ```
65
-
66
- 关闭会话:
67
-
68
- ```python
69
- session.close()
70
- ```
71
-
72
- ---
73
-
74
- ## 构建 DataFrame
75
-
76
- ```python
77
- # 从表创建
78
- df = session.table("my_schema.my_table")
79
-
80
- # 从 SQL 创建
81
- df = session.sql("SELECT * FROM orders WHERE year = 2024")
82
-
83
- # 从 Python 数据创建
84
- df = session.create_dataframe([1, 2, 3, 4]).to_df("id")
85
- df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema=["id", "name"])
86
-
87
- # 从 Row 对象创建
88
- from clickzetta.zettapark import Row
89
- df = session.create_dataframe([Row(id=1, name="Alice"), Row(id=2, name="Bob")])
90
-
91
- # 带 Schema 创建
92
- from clickzetta.zettapark.types import IntegerType, StringType, StructType, StructField
93
- schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())])
94
- df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema)
95
-
96
- # 范围序列
97
- df = session.range(1, 10, 2).to_df("n") # 1,3,5,7,9
98
- ```
99
-
100
- ---
101
-
102
- ## DataFrame 转换操作
103
-
104
- ```python
105
- from clickzetta.zettapark import functions as F
106
-
107
- # 过滤行
108
- df.filter(F.col("age") > 18)
109
- df.filter(F.col("status") == "active")
110
- df.where(F.col("amount") > 1000)
111
-
112
- # 选择列
113
- df.select("id", "name", "amount")
114
- df.select(F.col("id"), F.col("name").as_("user_name"))
115
-
116
- # 新增/修改列
117
- df.with_column("total", F.col("price") * F.col("qty"))
118
- df.with_column("upper_name", F.upper(F.col("name")))
119
-
120
- # 重命名列
121
- df.rename(F.col("old_name"), "new_name")
122
-
123
- # 排序
124
- df.sort(F.col("amount").desc())
125
- df.order_by(F.col("created_at").asc())
126
-
127
- # 去重
128
- df.distinct()
129
- df.drop_duplicates(["user_id"])
130
-
131
- # 限制行数
132
- df.limit(100)
133
-
134
- # 删除列
135
- df.drop("unnecessary_col")
136
- ```
137
-
138
- ---
139
-
140
- ## 聚合操作
141
-
142
- ```python
143
- from clickzetta.zettapark import functions as F
144
-
145
- # 分组聚合
146
- df.group_by("category").agg(
147
- F.sum("amount").as_("total_amount"),
148
- F.count("*").as_("order_count"),
149
- F.avg("price").as_("avg_price"),
150
- F.max("amount").as_("max_amount"),
151
- F.min("amount").as_("min_amount"),
152
- )
153
-
154
- # 全局聚合
155
- df.agg(F.count("*"), F.sum("amount"))
156
- ```
157
-
158
- ---
159
-
160
- ## JOIN 操作
161
-
162
- ```python
163
- # 内连接
164
- df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"])
165
-
166
- # 左连接
167
- df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"], "left")
168
-
169
- # 选择连接后的列(避免列名冲突)
170
- result = df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"]) \
171
- .select(df_orders["order_id"], df_customers["name"], df_orders["amount"])
172
- ```
173
-
174
- ---
175
-
176
- ## 执行与结果获取
177
-
178
- ```python
179
- # 打印前 N 行(触发执行)
180
- df.show()
181
- df.show(20)
182
-
183
- # 收集所有结果为 Row 列表
184
- rows = df.collect()
185
- for row in rows:
186
- print(row["id"], row["name"])
187
-
188
- # 转换为 Pandas DataFrame
189
- pandas_df = df.to_pandas()
190
-
191
- # 获取行数
192
- count = df.count()
193
-
194
- # 获取列名
195
- print(df.columns)
196
-
197
- # 查看 Schema
198
- df.schema.print_tree()
199
- ```
200
-
201
- ---
202
-
203
- ## 写入数据
204
-
205
- ```python
206
- # 写入已有表(追加)
207
- df.write.save_as_table("my_table", mode="append")
208
-
209
- # 覆盖写入
210
- df.write.save_as_table("my_table", mode="overwrite")
211
-
212
- # 自动建表并写入(overwrite 会重建表)
213
- df.write.save_as_table("new_table", mode="overwrite")
214
-
215
- # 写入指定 Schema 下的表
216
- df.write.save_as_table("my_schema.my_table", mode="append")
217
- ```
218
-
219
- ---
220
-
221
- ## 执行 SQL
222
-
223
- ```python
224
- # 执行 DDL/DML
225
- session.sql("CREATE TABLE IF NOT EXISTS t (id INT, name STRING)").collect()
226
- session.sql("INSERT INTO t VALUES (1, 'Alice')").collect()
227
-
228
- # 执行查询并获取 DataFrame
229
- df = session.sql("SELECT * FROM orders WHERE amount > 1000")
230
- df.show()
231
-
232
- # 切换 Schema
233
- session.use_schema("my_schema")
234
- ```
235
-
236
- ---
237
-
238
- ## 文件操作(Volume)
239
-
240
- ```python
241
- # 上传文件到 User Volume
242
- session.file.put("/local/path/data.csv", "volume:user://~/data/")
243
-
244
- # 下载文件
245
- session.file.get("volume:user://~/data/data.csv", "/local/output/")
246
-
247
- # 列出 User Volume 文件
248
- session.sql("LIST USER VOLUME").show()
249
- session.sql("SHOW USER VOLUME DIRECTORY").show()
250
- ```
251
-
252
- ---
253
-
254
- ## 常用 functions 速查
255
-
256
- ```python
257
- from clickzetta.zettapark import functions as F
258
-
259
- # 字符串
260
- F.upper(col), F.lower(col), F.concat(col1, col2)
261
- F.substring(col, 1, 3), F.trim(col), F.length(col)
262
-
263
- # 数值
264
- F.abs(col), F.round(col, 2), F.floor(col), F.ceil(col)
265
- F.sqrt(col), F.pow(col, 2)
266
-
267
- # 日期时间
268
- F.current_date(), F.current_timestamp()
269
- F.year(col), F.month(col), F.day(col)
270
- F.date_add(col, 7), F.datediff(col1, col2)
271
-
272
- # 条件
273
- F.when(F.col("status") == "A", "Active").otherwise("Inactive")
274
- F.coalesce(col1, col2) # 第一个非 null 值
275
- F.isnull(col), F.isnotnull(col)
276
-
277
- # 聚合
278
- F.count("*"), F.sum(col), F.avg(col), F.max(col), F.min(col)
279
- F.count_distinct(col)
280
-
281
- # 类型转换(需先导入:from clickzetta.zettapark.types import IntegerType)
282
- F.col("amount").cast(IntegerType())
283
- ```