@clickzetta/cz-cli-darwin-x64 0.3.78 → 0.3.81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/LICENSE +0 -16
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-app-python-sdk/LICENSE +0 -16
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -227
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-bi-connect/LICENSE +0 -16
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -633
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-data-ingest-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -237
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-data-retention/LICENSE +0 -16
- package/bin/skills/clickzetta-data-retention/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-retention/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-data-retention/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-science/LICENSE +0 -16
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/LICENSE +0 -16
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/LICENSE +0 -16
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -542
- package/bin/skills/clickzetta-dba-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-dw-modeling/LICENSE +0 -16
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -351
- package/bin/skills/clickzetta-dw-modeling/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/LICENSE +0 -16
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -230
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -253
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -427
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-dynamic-table/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/SKILL.md +0 -27
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-column-validation-rules.md +0 -118
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-conversion-rules.md +0 -225
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-placeholder-rules.md +0 -182
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-refresh-rules.md +0 -98
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-self-reference-rules.md +0 -76
- package/bin/skills/clickzetta-dynamic-table/sql-to-dt/references/sql2dt-workflow.md +0 -109
- package/bin/skills/clickzetta-external-catalog/LICENSE +0 -16
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -123
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/LICENSE +0 -16
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -190
- package/bin/skills/clickzetta-file-import-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-index-manager/LICENSE +0 -16
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-java-sdk/LICENSE +0 -16
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -769
- package/bin/skills/clickzetta-kafka-ingest-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
- package/bin/skills/clickzetta-lakehouse-connect/LICENSE +0 -16
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/LICENSE +0 -16
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-manage-comments/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-metadata/LICENSE +0 -16
- package/bin/skills/clickzetta-metadata/SKILL.md +0 -502
- package/bin/skills/clickzetta-metadata/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-metadata/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-metadata/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-metadata/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-metadata/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-monitoring/LICENSE +0 -16
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -215
- package/bin/skills/clickzetta-monitoring/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -562
- package/bin/skills/clickzetta-oss-ingest-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-overview/LICENSE +0 -16
- package/bin/skills/clickzetta-overview/SKILL.md +0 -102
- package/bin/skills/clickzetta-overview/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-overview/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-overview/references/object-model.md +0 -311
- package/bin/skills/clickzetta-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-pipeline-review/LICENSE +0 -16
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +0 -377
- package/bin/skills/clickzetta-query-optimizer/LICENSE +0 -16
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -323
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-semantic-view/LICENSE +0 -16
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/LICENSE +0 -16
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/LICENSE +0 -16
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -485
- package/bin/skills/clickzetta-sql-pipeline-manager/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
- package/bin/skills/clickzetta-sql-syntax-guide/LICENSE +0 -16
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-task-manager/LICENSE +0 -16
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +0 -652
- package/bin/skills/clickzetta-table-lineage/LICENSE +0 -16
- package/bin/skills/clickzetta-table-lineage/SKILL.md +0 -90
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +0 -1
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +0 -14
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +0 -38
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +0 -562
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +0 -25
- package/bin/skills/clickzetta-table-stream-pipeline/LICENSE +0 -16
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
- package/bin/skills/clickzetta-table-stream-pipeline/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-vcluster-manager/LICENSE +0 -16
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/LICENSE +0 -16
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
- package/bin/skills/clickzetta-volume-manager/eval_cases.jsonl +0 -5
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
- package/bin/skills/clickzetta-zettapark/LICENSE +0 -16
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +0 -12
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
- package/bin/skills/cz-cli/SKILL.md +0 -311
- package/bin/skills/cz-cli/references/profile-setup.md +0 -120
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-zettapark
|
|
3
|
-
description: |
|
|
4
|
-
使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
|
|
5
|
-
DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
|
|
6
|
-
覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
|
|
7
|
-
结果收集(collect/to_pandas/show)、写入表(save_as_table)、
|
|
8
|
-
文件操作(PUT/GET)、执行 SQL 等完整工作流。
|
|
9
|
-
当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
|
|
10
|
-
"save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
|
|
11
|
-
"Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
|
|
12
|
-
"clickzetta_zettapark_python"时触发。
|
|
13
|
-
Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
|
|
14
|
-
---
|
|
15
|
-
|
|
16
|
-
# ClickZetta ZettaPark
|
|
17
|
-
|
|
18
|
-
ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
|
|
19
|
-
|
|
20
|
-
阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
|
|
21
|
-
|
|
22
|
-
## 安装
|
|
23
|
-
|
|
24
|
-
> ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
|
|
25
|
-
|
|
26
|
-
```bash
|
|
27
|
-
# 方式 1:venv(Python 内置,推荐)
|
|
28
|
-
python3.12 -m venv .venv
|
|
29
|
-
source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
|
|
30
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
31
|
-
|
|
32
|
-
# 方式 2:pyenv(需要切换 Python 版本时)
|
|
33
|
-
pyenv install 3.12.9 && pyenv local 3.12.9
|
|
34
|
-
python -m venv .venv && source .venv/bin/activate
|
|
35
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
36
|
-
|
|
37
|
-
# 方式 3:conda(数据科学环境)
|
|
38
|
-
conda create -n lakehouse python=3.12 -y && conda activate lakehouse
|
|
39
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
---
|
|
43
|
-
|
|
44
|
-
## 创建会话
|
|
45
|
-
|
|
46
|
-
```python
|
|
47
|
-
from clickzetta.zettapark.session import Session
|
|
48
|
-
|
|
49
|
-
connection_parameters = {
|
|
50
|
-
"username": "your_username",
|
|
51
|
-
"password": "your_password",
|
|
52
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
53
|
-
"instance": "your_instance_id",
|
|
54
|
-
"workspace": "your_workspace",
|
|
55
|
-
"schema": "public",
|
|
56
|
-
"vcluster": "default_ap",
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
60
|
-
|
|
61
|
-
# 验证连接
|
|
62
|
-
session.sql("SELECT current_user(), current_workspace()").show()
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
---
|
|
66
|
-
|
|
67
|
-
## 核心工作流
|
|
68
|
-
|
|
69
|
-
### 读取数据
|
|
70
|
-
|
|
71
|
-
```python
|
|
72
|
-
from clickzetta.zettapark import functions as F
|
|
73
|
-
|
|
74
|
-
# 从表读取
|
|
75
|
-
df = session.table("orders")
|
|
76
|
-
df = session.table("my_schema.orders")
|
|
77
|
-
|
|
78
|
-
# 从 SQL 读取
|
|
79
|
-
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
80
|
-
|
|
81
|
-
# 从 Python 数据创建
|
|
82
|
-
df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
|
|
83
|
-
schema=["id", "name", "amount"])
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
### 转换数据
|
|
87
|
-
|
|
88
|
-
```python
|
|
89
|
-
# 过滤、选择、新增列
|
|
90
|
-
result = (
|
|
91
|
-
session.table("orders")
|
|
92
|
-
.filter(F.col("status") == "completed")
|
|
93
|
-
.select("order_id", "customer_id", "amount")
|
|
94
|
-
.with_column("tax", F.col("amount") * 0.1)
|
|
95
|
-
.sort(F.col("amount").desc())
|
|
96
|
-
.limit(100)
|
|
97
|
-
)
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
### 聚合
|
|
101
|
-
|
|
102
|
-
```python
|
|
103
|
-
summary = (
|
|
104
|
-
session.table("orders")
|
|
105
|
-
.group_by("category")
|
|
106
|
-
.agg(
|
|
107
|
-
F.sum("amount").as_("total"),
|
|
108
|
-
F.count("*").as_("cnt"),
|
|
109
|
-
F.avg("amount").as_("avg_amount"),
|
|
110
|
-
)
|
|
111
|
-
)
|
|
112
|
-
summary.show()
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
### JOIN
|
|
116
|
-
|
|
117
|
-
```python
|
|
118
|
-
orders = session.table("orders")
|
|
119
|
-
customers = session.table("customers")
|
|
120
|
-
|
|
121
|
-
result = orders.join(
|
|
122
|
-
customers,
|
|
123
|
-
orders["customer_id"] == customers["id"],
|
|
124
|
-
"left"
|
|
125
|
-
).select(
|
|
126
|
-
orders["order_id"],
|
|
127
|
-
customers["name"],
|
|
128
|
-
orders["amount"]
|
|
129
|
-
)
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
### 写入数据
|
|
133
|
-
|
|
134
|
-
```python
|
|
135
|
-
# 追加到已有表
|
|
136
|
-
df.write.save_as_table("result_table", mode="append")
|
|
137
|
-
|
|
138
|
-
# 覆盖写入(自动建表)
|
|
139
|
-
df.write.save_as_table("result_table", mode="overwrite")
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
### 获取结果
|
|
143
|
-
|
|
144
|
-
```python
|
|
145
|
-
# 打印预览
|
|
146
|
-
df.show(20)
|
|
147
|
-
|
|
148
|
-
# 收集为 Row 列表
|
|
149
|
-
rows = df.collect()
|
|
150
|
-
for row in rows:
|
|
151
|
-
print(row["id"], row["name"])
|
|
152
|
-
|
|
153
|
-
# 转为 Pandas DataFrame(小数据量)
|
|
154
|
-
pandas_df = df.to_pandas()
|
|
155
|
-
|
|
156
|
-
# 获取行数
|
|
157
|
-
print(df.count())
|
|
158
|
-
```
|
|
159
|
-
|
|
160
|
-
---
|
|
161
|
-
|
|
162
|
-
## 典型场景
|
|
163
|
-
|
|
164
|
-
### 场景 1:ETL 数据处理
|
|
165
|
-
|
|
166
|
-
```python
|
|
167
|
-
from clickzetta.zettapark.session import Session
|
|
168
|
-
from clickzetta.zettapark import functions as F
|
|
169
|
-
|
|
170
|
-
session = Session.builder.configs(config).create()
|
|
171
|
-
|
|
172
|
-
# 读取原始数据
|
|
173
|
-
raw = session.table("bronze.raw_orders")
|
|
174
|
-
|
|
175
|
-
# 清洗转换
|
|
176
|
-
cleaned = (
|
|
177
|
-
raw
|
|
178
|
-
.filter(F.isnotnull(F.col("order_id")))
|
|
179
|
-
.filter(F.col("amount") > 0)
|
|
180
|
-
.with_column("order_date", F.col("created_at").cast("DATE"))
|
|
181
|
-
.with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
|
|
182
|
-
.select("order_id", "customer_id", "amount", "order_date", "year_month")
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
# 写入 Silver 层
|
|
186
|
-
cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
|
|
187
|
-
|
|
188
|
-
session.close()
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
### 场景 2:特征工程(机器学习)
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
from clickzetta.zettapark import functions as F
|
|
195
|
-
|
|
196
|
-
customer = session.table("clickzetta_sample_data.tpch_100g.customer")
|
|
197
|
-
orders = session.table("clickzetta_sample_data.tpch_100g.orders")
|
|
198
|
-
|
|
199
|
-
# 构建客户消费特征
|
|
200
|
-
customer_features = (
|
|
201
|
-
orders
|
|
202
|
-
.group_by("o_custkey")
|
|
203
|
-
.agg(
|
|
204
|
-
F.sum("o_totalprice").as_("total_spend"),
|
|
205
|
-
F.count("*").as_("order_count"),
|
|
206
|
-
F.avg("o_totalprice").as_("avg_order_value"),
|
|
207
|
-
F.max("o_orderdate").as_("last_order_date"),
|
|
208
|
-
)
|
|
209
|
-
.join(customer, orders["o_custkey"] == customer["c_custkey"])
|
|
210
|
-
.select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
### 场景 3:从本地文件导入
|
|
217
|
-
|
|
218
|
-
```python
|
|
219
|
-
import json
|
|
220
|
-
import gzip
|
|
221
|
-
from clickzetta.zettapark.session import Session
|
|
222
|
-
|
|
223
|
-
session = Session.builder.configs(config).create()
|
|
224
|
-
|
|
225
|
-
# 读取本地 JSON 数据
|
|
226
|
-
data = []
|
|
227
|
-
with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
|
|
228
|
-
for line in f:
|
|
229
|
-
if line.strip():
|
|
230
|
-
data.append(json.loads(line))
|
|
231
|
-
|
|
232
|
-
# 创建 DataFrame 并写入
|
|
233
|
-
df = session.create_dataframe(data)
|
|
234
|
-
df.write.save_as_table("my_table", mode="overwrite")
|
|
235
|
-
|
|
236
|
-
session.close()
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
---
|
|
240
|
-
|
|
241
|
-
## 常见问题
|
|
242
|
-
|
|
243
|
-
| 问题 | 原因 | 解决方案 |
|
|
244
|
-
|---|---|---|
|
|
245
|
-
| `collect()` 超时 | 数据量过大或集群规格不足 | 在连接参数的 `hints` 中增大 `sdk.job.timeout`,或先用 `limit()` 缩小数据量测试 |
|
|
246
|
-
| `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
|
|
247
|
-
| 列名冲突(JOIN 后) | 两表有同名列 | 在 `select` 中用 `df_left["col"]` / `df_right["col"]` 明确指定列的来源 |
|
|
248
|
-
| `save_as_table` 报错 | 表已存在且 mode 不对 | 追加到已有表用 `mode="append"`,覆盖写入(重建表)用 `mode="overwrite"` |
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
{"case_id":"001","type":"should_call","user_input":"用 ZettaPark 读取 orders 表并过滤 amount > 100","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.table","filter"]}
|
|
2
|
-
{"case_id":"002","type":"should_call","user_input":"ZettaPark 怎么安装?需要什么 Python 版本?","expected_skill":"clickzetta-zettapark","expected_output_contains":["pip install","3.12"]}
|
|
3
|
-
{"case_id":"003","type":"should_call","user_input":"怎么用 DataFrame API 做 group by 聚合","expected_skill":"clickzetta-zettapark","expected_output_contains":["group_by","agg"]}
|
|
4
|
-
{"case_id":"004","type":"should_call","user_input":"save_as_table 怎么用?支持哪些写入模式?","expected_skill":"clickzetta-zettapark","expected_output_contains":["save_as_table","overwrite","append"]}
|
|
5
|
-
{"case_id":"005","type":"should_call","user_input":"ZettaPark 怎么把结果转成 pandas DataFrame","expected_skill":"clickzetta-zettapark","expected_output_contains":["to_pandas"]}
|
|
6
|
-
{"case_id":"006","type":"should_call","user_input":"用 session.sql 执行一段 SQL 查询","expected_skill":"clickzetta-zettapark","expected_output_contains":["session.sql"]}
|
|
7
|
-
{"case_id":"007","type":"should_call","user_input":"ZettaPark 怎么 join 两张表","expected_skill":"clickzetta-zettapark","expected_output_contains":["join"]}
|
|
8
|
-
{"case_id":"008","type":"should_not_call","user_input":"帮我写一个 Flask Web 应用","forbidden_skill":"clickzetta-zettapark"}
|
|
9
|
-
{"case_id":"009","type":"should_not_call","user_input":"pandas 怎么读取 CSV 文件","forbidden_skill":"clickzetta-zettapark"}
|
|
10
|
-
{"case_id":"010","type":"should_not_call","user_input":"怎么用 JDBC 连接 Lakehouse","forbidden_skill":"clickzetta-zettapark"}
|
|
11
|
-
{"case_id":"011","type":"should_not_call","user_input":"帮我创建一个 VCluster","forbidden_skill":"clickzetta-zettapark"}
|
|
12
|
-
{"case_id":"012","type":"should_not_call","user_input":"Spark DataFrame 怎么用","forbidden_skill":"clickzetta-zettapark"}
|
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
# ZettaPark 快速参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
|
|
4
|
-
|
|
5
|
-
## 安装
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
pip install clickzetta_zettapark_python -U -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
---
|
|
12
|
-
|
|
13
|
-
## 创建会话
|
|
14
|
-
|
|
15
|
-
```python
|
|
16
|
-
from clickzetta.zettapark.session import Session
|
|
17
|
-
|
|
18
|
-
connection_parameters = {
|
|
19
|
-
"username": "your_username",
|
|
20
|
-
"password": "your_password",
|
|
21
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
22
|
-
"instance": "your_instance_id",
|
|
23
|
-
"workspace": "your_workspace",
|
|
24
|
-
"schema": "public",
|
|
25
|
-
"vcluster": "default_ap",
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
带 hints(超时、query_tag 等):
|
|
32
|
-
|
|
33
|
-
```python
|
|
34
|
-
connection_parameters = {
|
|
35
|
-
"username": "your_username",
|
|
36
|
-
"password": "your_password",
|
|
37
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
38
|
-
"instance": "your_instance_id",
|
|
39
|
-
"workspace": "your_workspace",
|
|
40
|
-
"schema": "public",
|
|
41
|
-
"vcluster": "default_ap",
|
|
42
|
-
"hints": {
|
|
43
|
-
"sdk.job.timeout": 300,
|
|
44
|
-
"query_tag": "my_zettapark_app",
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
从 JSON 配置文件读取:
|
|
52
|
-
|
|
53
|
-
```python
|
|
54
|
-
import json
|
|
55
|
-
with open('config.json', 'r') as f:
|
|
56
|
-
config = json.load(f)
|
|
57
|
-
session = Session.builder.configs(config).create()
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
验证连接:
|
|
61
|
-
|
|
62
|
-
```python
|
|
63
|
-
session.sql("SELECT current_user(), current_workspace(), current_vcluster()").show()
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
关闭会话:
|
|
67
|
-
|
|
68
|
-
```python
|
|
69
|
-
session.close()
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
---
|
|
73
|
-
|
|
74
|
-
## 构建 DataFrame
|
|
75
|
-
|
|
76
|
-
```python
|
|
77
|
-
# 从表创建
|
|
78
|
-
df = session.table("my_schema.my_table")
|
|
79
|
-
|
|
80
|
-
# 从 SQL 创建
|
|
81
|
-
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
82
|
-
|
|
83
|
-
# 从 Python 数据创建
|
|
84
|
-
df = session.create_dataframe([1, 2, 3, 4]).to_df("id")
|
|
85
|
-
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema=["id", "name"])
|
|
86
|
-
|
|
87
|
-
# 从 Row 对象创建
|
|
88
|
-
from clickzetta.zettapark import Row
|
|
89
|
-
df = session.create_dataframe([Row(id=1, name="Alice"), Row(id=2, name="Bob")])
|
|
90
|
-
|
|
91
|
-
# 带 Schema 创建
|
|
92
|
-
from clickzetta.zettapark.types import IntegerType, StringType, StructType, StructField
|
|
93
|
-
schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())])
|
|
94
|
-
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema)
|
|
95
|
-
|
|
96
|
-
# 范围序列
|
|
97
|
-
df = session.range(1, 10, 2).to_df("n") # 1,3,5,7,9
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
---
|
|
101
|
-
|
|
102
|
-
## DataFrame 转换操作
|
|
103
|
-
|
|
104
|
-
```python
|
|
105
|
-
from clickzetta.zettapark import functions as F
|
|
106
|
-
|
|
107
|
-
# 过滤行
|
|
108
|
-
df.filter(F.col("age") > 18)
|
|
109
|
-
df.filter(F.col("status") == "active")
|
|
110
|
-
df.where(F.col("amount") > 1000)
|
|
111
|
-
|
|
112
|
-
# 选择列
|
|
113
|
-
df.select("id", "name", "amount")
|
|
114
|
-
df.select(F.col("id"), F.col("name").as_("user_name"))
|
|
115
|
-
|
|
116
|
-
# 新增/修改列
|
|
117
|
-
df.with_column("total", F.col("price") * F.col("qty"))
|
|
118
|
-
df.with_column("upper_name", F.upper(F.col("name")))
|
|
119
|
-
|
|
120
|
-
# 重命名列
|
|
121
|
-
df.rename(F.col("old_name"), "new_name")
|
|
122
|
-
|
|
123
|
-
# 排序
|
|
124
|
-
df.sort(F.col("amount").desc())
|
|
125
|
-
df.order_by(F.col("created_at").asc())
|
|
126
|
-
|
|
127
|
-
# 去重
|
|
128
|
-
df.distinct()
|
|
129
|
-
df.drop_duplicates(["user_id"])
|
|
130
|
-
|
|
131
|
-
# 限制行数
|
|
132
|
-
df.limit(100)
|
|
133
|
-
|
|
134
|
-
# 删除列
|
|
135
|
-
df.drop("unnecessary_col")
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
---
|
|
139
|
-
|
|
140
|
-
## 聚合操作
|
|
141
|
-
|
|
142
|
-
```python
|
|
143
|
-
from clickzetta.zettapark import functions as F
|
|
144
|
-
|
|
145
|
-
# 分组聚合
|
|
146
|
-
df.group_by("category").agg(
|
|
147
|
-
F.sum("amount").as_("total_amount"),
|
|
148
|
-
F.count("*").as_("order_count"),
|
|
149
|
-
F.avg("price").as_("avg_price"),
|
|
150
|
-
F.max("amount").as_("max_amount"),
|
|
151
|
-
F.min("amount").as_("min_amount"),
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
# 全局聚合
|
|
155
|
-
df.agg(F.count("*"), F.sum("amount"))
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
---
|
|
159
|
-
|
|
160
|
-
## JOIN 操作
|
|
161
|
-
|
|
162
|
-
```python
|
|
163
|
-
# 内连接
|
|
164
|
-
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"])
|
|
165
|
-
|
|
166
|
-
# 左连接
|
|
167
|
-
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"], "left")
|
|
168
|
-
|
|
169
|
-
# 选择连接后的列(避免列名冲突)
|
|
170
|
-
result = df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"]) \
|
|
171
|
-
.select(df_orders["order_id"], df_customers["name"], df_orders["amount"])
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
---
|
|
175
|
-
|
|
176
|
-
## 执行与结果获取
|
|
177
|
-
|
|
178
|
-
```python
|
|
179
|
-
# 打印前 N 行(触发执行)
|
|
180
|
-
df.show()
|
|
181
|
-
df.show(20)
|
|
182
|
-
|
|
183
|
-
# 收集所有结果为 Row 列表
|
|
184
|
-
rows = df.collect()
|
|
185
|
-
for row in rows:
|
|
186
|
-
print(row["id"], row["name"])
|
|
187
|
-
|
|
188
|
-
# 转换为 Pandas DataFrame
|
|
189
|
-
pandas_df = df.to_pandas()
|
|
190
|
-
|
|
191
|
-
# 获取行数
|
|
192
|
-
count = df.count()
|
|
193
|
-
|
|
194
|
-
# 获取列名
|
|
195
|
-
print(df.columns)
|
|
196
|
-
|
|
197
|
-
# 查看 Schema
|
|
198
|
-
df.schema.print_tree()
|
|
199
|
-
```
|
|
200
|
-
|
|
201
|
-
---
|
|
202
|
-
|
|
203
|
-
## 写入数据
|
|
204
|
-
|
|
205
|
-
```python
|
|
206
|
-
# 写入已有表(追加)
|
|
207
|
-
df.write.save_as_table("my_table", mode="append")
|
|
208
|
-
|
|
209
|
-
# 覆盖写入
|
|
210
|
-
df.write.save_as_table("my_table", mode="overwrite")
|
|
211
|
-
|
|
212
|
-
# 自动建表并写入(overwrite 会重建表)
|
|
213
|
-
df.write.save_as_table("new_table", mode="overwrite")
|
|
214
|
-
|
|
215
|
-
# 写入指定 Schema 下的表
|
|
216
|
-
df.write.save_as_table("my_schema.my_table", mode="append")
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
---
|
|
220
|
-
|
|
221
|
-
## 执行 SQL
|
|
222
|
-
|
|
223
|
-
```python
|
|
224
|
-
# 执行 DDL/DML
|
|
225
|
-
session.sql("CREATE TABLE IF NOT EXISTS t (id INT, name STRING)").collect()
|
|
226
|
-
session.sql("INSERT INTO t VALUES (1, 'Alice')").collect()
|
|
227
|
-
|
|
228
|
-
# 执行查询并获取 DataFrame
|
|
229
|
-
df = session.sql("SELECT * FROM orders WHERE amount > 1000")
|
|
230
|
-
df.show()
|
|
231
|
-
|
|
232
|
-
# 切换 Schema
|
|
233
|
-
session.use_schema("my_schema")
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
---
|
|
237
|
-
|
|
238
|
-
## 文件操作(Volume)
|
|
239
|
-
|
|
240
|
-
```python
|
|
241
|
-
# 上传文件到 User Volume
|
|
242
|
-
session.file.put("/local/path/data.csv", "volume:user://~/data/")
|
|
243
|
-
|
|
244
|
-
# 下载文件
|
|
245
|
-
session.file.get("volume:user://~/data/data.csv", "/local/output/")
|
|
246
|
-
|
|
247
|
-
# 列出 User Volume 文件
|
|
248
|
-
session.sql("LIST USER VOLUME").show()
|
|
249
|
-
session.sql("SHOW USER VOLUME DIRECTORY").show()
|
|
250
|
-
```
|
|
251
|
-
|
|
252
|
-
---
|
|
253
|
-
|
|
254
|
-
## 常用 functions 速查
|
|
255
|
-
|
|
256
|
-
```python
|
|
257
|
-
from clickzetta.zettapark import functions as F
|
|
258
|
-
|
|
259
|
-
# 字符串
|
|
260
|
-
F.upper(col), F.lower(col), F.concat(col1, col2)
|
|
261
|
-
F.substring(col, 1, 3), F.trim(col), F.length(col)
|
|
262
|
-
|
|
263
|
-
# 数值
|
|
264
|
-
F.abs(col), F.round(col, 2), F.floor(col), F.ceil(col)
|
|
265
|
-
F.sqrt(col), F.pow(col, 2)
|
|
266
|
-
|
|
267
|
-
# 日期时间
|
|
268
|
-
F.current_date(), F.current_timestamp()
|
|
269
|
-
F.year(col), F.month(col), F.day(col)
|
|
270
|
-
F.date_add(col, 7), F.datediff(col1, col2)
|
|
271
|
-
|
|
272
|
-
# 条件
|
|
273
|
-
F.when(F.col("status") == "A", "Active").otherwise("Inactive")
|
|
274
|
-
F.coalesce(col1, col2) # 第一个非 null 值
|
|
275
|
-
F.isnull(col), F.isnotnull(col)
|
|
276
|
-
|
|
277
|
-
# 聚合
|
|
278
|
-
F.count("*"), F.sum(col), F.avg(col), F.max(col), F.min(col)
|
|
279
|
-
F.count_distinct(col)
|
|
280
|
-
|
|
281
|
-
# 类型转换
|
|
282
|
-
F.col("amount").cast(IntegerType())  # requires: from clickzetta.zettapark.types import IntegerType
|
|
283
|
-
```
|