@clickzetta/cz-cli-linux-x64 0.3.2 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -450
- package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
- package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
- package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
- package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
- package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -86
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -117
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
- package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -531
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -186
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
- package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -402
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -353
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -173
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -160
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -123
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
- package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -155
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -249
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -194
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
# Volume 管理参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/datalake_volume_object 等
|
|
4
|
-
|
|
5
|
-
## Volume 类型
|
|
6
|
-
|
|
7
|
-
| 类型 | 说明 |
|
|
8
|
-
|---|---|
|
|
9
|
-
| 外部 Volume(External Volume) | 挂载 OSS/COS/S3 等对象存储路径 |
|
|
10
|
-
| 内部 Volume(Internal Volume) | 系统托管存储,含 User Volume、Table Volume、命名 Volume |
|
|
11
|
-
|
|
12
|
-
---
|
|
13
|
-
|
|
14
|
-
## CREATE EXTERNAL VOLUME
|
|
15
|
-
|
|
16
|
-
```sql
|
|
17
|
-
-- OSS
|
|
18
|
-
CREATE EXTERNAL VOLUME my_oss_volume
|
|
19
|
-
LOCATION 'oss://<bucket>/<path>'
|
|
20
|
-
USING CONNECTION my_oss_conn
|
|
21
|
-
DIRECTORY = (ENABLE = TRUE, AUTO_REFRESH = TRUE)
|
|
22
|
-
RECURSIVE = TRUE;
|
|
23
|
-
|
|
24
|
-
-- COS
|
|
25
|
-
CREATE EXTERNAL VOLUME my_cos_volume
|
|
26
|
-
LOCATION 'cos://<bucket>/<path>'
|
|
27
|
-
USING CONNECTION my_cos_conn
|
|
28
|
-
DIRECTORY = (ENABLE = TRUE)
|
|
29
|
-
RECURSIVE = TRUE;
|
|
30
|
-
|
|
31
|
-
-- S3
|
|
32
|
-
CREATE EXTERNAL VOLUME my_s3_volume
|
|
33
|
-
LOCATION 's3://<bucket>/<path>'
|
|
34
|
-
USING CONNECTION my_s3_conn
|
|
35
|
-
DIRECTORY = (ENABLE = TRUE)
|
|
36
|
-
RECURSIVE = TRUE;
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
参数说明:
|
|
40
|
-
- `LOCATION`:对象存储路径
|
|
41
|
-
- `USING CONNECTION`:已创建的 STORAGE CONNECTION 名称
|
|
42
|
-
- `DIRECTORY`:目录功能配置,`ENABLE=TRUE` 开启目录索引,`AUTO_REFRESH=TRUE` 自动刷新目录元数据(未开启时可用 `ALTER VOLUME ... REFRESH` 手动刷新)
|
|
43
|
-
- `RECURSIVE`:是否递归扫描子目录
|
|
44
|
-
|
|
45
|
-
---
|
|
46
|
-
|
|
47
|
-
## ALTER VOLUME
|
|
48
|
-
|
|
49
|
-
```sql
|
|
50
|
-
-- 刷新目录元数据
|
|
51
|
-
ALTER VOLUME my_oss_volume REFRESH;
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
---
|
|
55
|
-
|
|
56
|
-
## DROP VOLUME
|
|
57
|
-
|
|
58
|
-
```sql
|
|
59
|
-
DROP VOLUME IF EXISTS my_oss_volume;
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
---
|
|
63
|
-
|
|
64
|
-
## SHOW / DESC VOLUME
|
|
65
|
-
|
|
66
|
-
```sql
|
|
67
|
-
-- 列出所有 Volume
|
|
68
|
-
SHOW VOLUMES;
|
|
69
|
-
|
|
70
|
-
-- 按条件过滤(SHOW VOLUMES 不支持 WHERE,使用 information_schema)
|
|
71
|
-
SELECT volume_name, volume_type, volume_region, volume_creator
|
|
72
|
-
FROM information_schema.volumes
|
|
73
|
-
WHERE volume_type = 'EXTERNAL';
|
|
74
|
-
|
|
75
|
-
-- 按名称查找
|
|
76
|
-
SELECT * FROM information_schema.volumes
|
|
77
|
-
WHERE volume_name = 'my_oss_volume';
|
|
78
|
-
|
|
79
|
-
-- 查看 Volume 详情
|
|
80
|
-
DESC VOLUME my_oss_volume;
|
|
81
|
-
|
|
82
|
-
-- 查看 Volume 目录下的文件
|
|
83
|
-
SHOW VOLUME DIRECTORY my_oss_volume;
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
---
|
|
87
|
-
|
|
88
|
-
## 查看目录元数据(DIRECTORY 函数)
|
|
89
|
-
|
|
90
|
-
```sql
|
|
91
|
-
-- 查看 Volume 目录元数据(需先 ALTER VOLUME REFRESH)
|
|
92
|
-
SELECT * FROM DIRECTORY(VOLUME my_oss_volume);
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
---
|
|
96
|
-
|
|
97
|
-
## User Volume 操作
|
|
98
|
-
|
|
99
|
-
```sql
|
|
100
|
-
-- 查看 User Volume 文件列表
|
|
101
|
-
SHOW USER VOLUME DIRECTORY;
|
|
102
|
-
|
|
103
|
-
-- 上传文件到 User Volume 根目录
|
|
104
|
-
PUT '/local/path/file.csv' TO USER VOLUME;
|
|
105
|
-
|
|
106
|
-
-- 上传并指定目标路径
|
|
107
|
-
PUT '/local/path/file.csv' TO USER VOLUME FILE 'subdir/file.csv';
|
|
108
|
-
|
|
109
|
-
-- 通配符上传多个文件
|
|
110
|
-
PUT '/local/path/images/*' TO USER VOLUME SUBDIRECTORY 'images/';
|
|
111
|
-
|
|
112
|
-
-- 下载文件
|
|
113
|
-
GET USER VOLUME FILE 'subdir/file.csv' TO '/local/output/';
|
|
114
|
-
|
|
115
|
-
-- 删除文件
|
|
116
|
-
REMOVE USER VOLUME FILE 'subdir/file.csv';
|
|
117
|
-
|
|
118
|
-
-- 删除目录下所有文件
|
|
119
|
-
REMOVE USER VOLUME SUBDIRECTORY '/';
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
---
|
|
123
|
-
|
|
124
|
-
## 从 Volume 查询数据(SELECT FROM VOLUME)
|
|
125
|
-
|
|
126
|
-
```sql
|
|
127
|
-
-- 查询 CSV 文件
|
|
128
|
-
SELECT * FROM VOLUME my_oss_volume
|
|
129
|
-
USING CSV
|
|
130
|
-
OPTIONS('header' = 'true', 'sep' = ',')
|
|
131
|
-
SUBDIRECTORY 'data/'
|
|
132
|
-
LIMIT 100;
|
|
133
|
-
|
|
134
|
-
-- 查询 Parquet 文件
|
|
135
|
-
SELECT * FROM VOLUME my_oss_volume
|
|
136
|
-
USING PARQUET
|
|
137
|
-
FILES('part-00001.parquet', 'part-00002.parquet');
|
|
138
|
-
|
|
139
|
-
-- 正则匹配文件
|
|
140
|
-
SELECT * FROM VOLUME my_oss_volume
|
|
141
|
-
USING PARQUET
|
|
142
|
-
REGEXP '.*2024-0[1-3].parquet';
|
|
143
|
-
|
|
144
|
-
-- 查询 User Volume 文件
|
|
145
|
-
SELECT * FROM USER VOLUME
|
|
146
|
-
USING CSV
|
|
147
|
-
OPTIONS('header' = 'true')
|
|
148
|
-
FILES('data.csv')
|
|
149
|
-
LIMIT 10;
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
支持格式:`CSV`、`PARQUET`、`ORC`、`JSON`、`BSON`
|
|
153
|
-
|
|
154
|
-
CSV OPTIONS 常用参数:
|
|
155
|
-
- `header`:是否有表头,默认 `false`
|
|
156
|
-
- `sep`:列分隔符,默认 `,`
|
|
157
|
-
- `compression`:压缩格式(gzip/zstd/zlib)
|
|
158
|
-
- `multiLine`:是否支持多行字段,默认 `false`
|
|
159
|
-
|
|
160
|
-
---
|
|
161
|
-
|
|
162
|
-
## COPY INTO TABLE(从 Volume 导入)
|
|
163
|
-
|
|
164
|
-
```sql
|
|
165
|
-
COPY INTO my_table
|
|
166
|
-
FROM VOLUME my_oss_volume
|
|
167
|
-
USING CSV
|
|
168
|
-
OPTIONS('header' = 'true')
|
|
169
|
-
SUBDIRECTORY 'data/';
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
## COPY INTO VOLUME(导出到 Volume)
|
|
173
|
-
|
|
174
|
-
```sql
|
|
175
|
-
-- 导出表到 External Volume
|
|
176
|
-
COPY INTO VOLUME my_oss_volume
|
|
177
|
-
SUBDIRECTORY 'export/'
|
|
178
|
-
FROM TABLE my_table
|
|
179
|
-
FILE_FORMAT = (TYPE = CSV);
|
|
180
|
-
|
|
181
|
-
-- 导出查询结果
|
|
182
|
-
COPY INTO VOLUME my_oss_volume
|
|
183
|
-
SUBDIRECTORY 'export/'
|
|
184
|
-
FROM (SELECT * FROM orders WHERE year = 2024)
|
|
185
|
-
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = 'GZIP');
|
|
186
|
-
|
|
187
|
-
-- 导出到 User Volume
|
|
188
|
-
COPY INTO USER VOLUME
|
|
189
|
-
SUBDIRECTORY 'export/'
|
|
190
|
-
FROM TABLE my_table
|
|
191
|
-
FILE_FORMAT = (TYPE = CSV);
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
> ⚠️ 导出(`COPY INTO VOLUME`)用 `FILE_FORMAT = (TYPE = ...)` 指定格式,不是 `USING`。`USING` 只用于从 Volume 读取数据的语句,即 `SELECT FROM VOLUME` 和 `COPY INTO <表> FROM VOLUME`。
|
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: clickzetta-zettapark
|
|
3
|
-
description: |
|
|
4
|
-
使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
|
|
5
|
-
DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
|
|
6
|
-
覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
|
|
7
|
-
结果收集(collect/to_pandas/show)、写入表(save_as_table)、
|
|
8
|
-
文件操作(PUT/GET)、执行 SQL 等完整工作流。
|
|
9
|
-
当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
|
|
10
|
-
"save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
|
|
11
|
-
"Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
|
|
12
|
-
"clickzetta_zettapark_python"时触发。
|
|
13
|
-
Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
|
|
14
|
-
---
|
|
15
|
-
|
|
16
|
-
# ClickZetta ZettaPark
|
|
17
|
-
|
|
18
|
-
ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
|
|
19
|
-
|
|
20
|
-
阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
|
|
21
|
-
|
|
22
|
-
## 安装
|
|
23
|
-
|
|
24
|
-
> ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
|
|
25
|
-
|
|
26
|
-
```bash
|
|
27
|
-
# 方式 1:venv(Python 内置,推荐)
|
|
28
|
-
python3.12 -m venv .venv
|
|
29
|
-
source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
|
|
30
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
31
|
-
|
|
32
|
-
# 方式 2:pyenv(需要切换 Python 版本时)
|
|
33
|
-
pyenv install 3.12.9 && pyenv local 3.12.9
|
|
34
|
-
python -m venv .venv && source .venv/bin/activate
|
|
35
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
36
|
-
|
|
37
|
-
# 方式 3:conda(数据科学环境)
|
|
38
|
-
conda create -n lakehouse python=3.12 -y && conda activate lakehouse
|
|
39
|
-
pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
---
|
|
43
|
-
|
|
44
|
-
## 创建会话
|
|
45
|
-
|
|
46
|
-
```python
|
|
47
|
-
from clickzetta.zettapark.session import Session
|
|
48
|
-
|
|
49
|
-
connection_parameters = {
|
|
50
|
-
"username": "your_username",
|
|
51
|
-
"password": "your_password",
|
|
52
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
53
|
-
"instance": "your_instance_id",
|
|
54
|
-
"workspace": "your_workspace",
|
|
55
|
-
"schema": "public",
|
|
56
|
-
"vcluster": "default_ap",
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
60
|
-
|
|
61
|
-
# 验证连接
|
|
62
|
-
session.sql("SELECT current_user(), current_workspace()").show()
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
---
|
|
66
|
-
|
|
67
|
-
## 核心工作流
|
|
68
|
-
|
|
69
|
-
### 读取数据
|
|
70
|
-
|
|
71
|
-
```python
|
|
72
|
-
from clickzetta.zettapark import functions as F
|
|
73
|
-
|
|
74
|
-
# 从表读取
|
|
75
|
-
df = session.table("orders")
|
|
76
|
-
df = session.table("my_schema.orders")
|
|
77
|
-
|
|
78
|
-
# 从 SQL 读取
|
|
79
|
-
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
80
|
-
|
|
81
|
-
# 从 Python 数据创建
|
|
82
|
-
df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
|
|
83
|
-
schema=["id", "name", "amount"])
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
### 转换数据
|
|
87
|
-
|
|
88
|
-
```python
|
|
89
|
-
# 过滤、选择、新增列
|
|
90
|
-
result = (
|
|
91
|
-
session.table("orders")
|
|
92
|
-
.filter(F.col("status") == "completed")
|
|
93
|
-
.select("order_id", "customer_id", "amount")
|
|
94
|
-
.with_column("tax", F.col("amount") * 0.1)
|
|
95
|
-
.sort(F.col("amount").desc())
|
|
96
|
-
.limit(100)
|
|
97
|
-
)
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
### 聚合
|
|
101
|
-
|
|
102
|
-
```python
|
|
103
|
-
summary = (
|
|
104
|
-
session.table("orders")
|
|
105
|
-
.group_by("category")
|
|
106
|
-
.agg(
|
|
107
|
-
F.sum("amount").as_("total"),
|
|
108
|
-
F.count("*").as_("cnt"),
|
|
109
|
-
F.avg("amount").as_("avg_amount"),
|
|
110
|
-
)
|
|
111
|
-
)
|
|
112
|
-
summary.show()
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
### JOIN
|
|
116
|
-
|
|
117
|
-
```python
|
|
118
|
-
orders = session.table("orders")
|
|
119
|
-
customers = session.table("customers")
|
|
120
|
-
|
|
121
|
-
result = orders.join(
|
|
122
|
-
customers,
|
|
123
|
-
orders["customer_id"] == customers["id"],
|
|
124
|
-
"left"
|
|
125
|
-
).select(
|
|
126
|
-
orders["order_id"],
|
|
127
|
-
customers["name"],
|
|
128
|
-
orders["amount"]
|
|
129
|
-
)
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
### 写入数据
|
|
133
|
-
|
|
134
|
-
```python
|
|
135
|
-
# 追加到已有表
|
|
136
|
-
df.write.save_as_table("result_table", mode="append")
|
|
137
|
-
|
|
138
|
-
# 覆盖写入(自动建表)
|
|
139
|
-
df.write.save_as_table("result_table", mode="overwrite")
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
### 获取结果
|
|
143
|
-
|
|
144
|
-
```python
|
|
145
|
-
# 打印预览
|
|
146
|
-
df.show(20)
|
|
147
|
-
|
|
148
|
-
# 收集为 Row 列表
|
|
149
|
-
rows = df.collect()
|
|
150
|
-
for row in rows:
|
|
151
|
-
print(row["id"], row["name"])
|
|
152
|
-
|
|
153
|
-
# 转为 Pandas DataFrame(小数据量)
|
|
154
|
-
pandas_df = df.to_pandas()
|
|
155
|
-
|
|
156
|
-
# 获取行数
|
|
157
|
-
print(df.count())
|
|
158
|
-
```
|
|
159
|
-
|
|
160
|
-
---
|
|
161
|
-
|
|
162
|
-
## 典型场景
|
|
163
|
-
|
|
164
|
-
### 场景 1:ETL 数据处理
|
|
165
|
-
|
|
166
|
-
```python
|
|
167
|
-
from clickzetta.zettapark.session import Session
|
|
168
|
-
from clickzetta.zettapark import functions as F
|
|
169
|
-
|
|
170
|
-
session = Session.builder.configs(config).create()
|
|
171
|
-
|
|
172
|
-
# 读取原始数据
|
|
173
|
-
raw = session.table("bronze.raw_orders")
|
|
174
|
-
|
|
175
|
-
# 清洗转换
|
|
176
|
-
cleaned = (
|
|
177
|
-
raw
|
|
178
|
-
.filter(F.isnotnull(F.col("order_id")))
|
|
179
|
-
.filter(F.col("amount") > 0)
|
|
180
|
-
.with_column("order_date", F.col("created_at").cast("DATE"))
|
|
181
|
-
.with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
|
|
182
|
-
.select("order_id", "customer_id", "amount", "order_date", "year_month")
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
# 写入 Silver 层
|
|
186
|
-
cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
|
|
187
|
-
|
|
188
|
-
session.close()
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
### 场景 2:特征工程(机器学习)
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
from clickzetta.zettapark import functions as F
|
|
195
|
-
|
|
196
|
-
customer = session.table("clickzetta_sample_data.tpch_100g.customer")
|
|
197
|
-
orders = session.table("clickzetta_sample_data.tpch_100g.orders")
|
|
198
|
-
|
|
199
|
-
# 构建客户消费特征
|
|
200
|
-
customer_features = (
|
|
201
|
-
orders
|
|
202
|
-
.group_by("o_custkey")
|
|
203
|
-
.agg(
|
|
204
|
-
F.sum("o_totalprice").as_("total_spend"),
|
|
205
|
-
F.count("*").as_("order_count"),
|
|
206
|
-
F.avg("o_totalprice").as_("avg_order_value"),
|
|
207
|
-
F.max("o_orderdate").as_("last_order_date"),
|
|
208
|
-
)
|
|
209
|
-
.join(customer, orders["o_custkey"] == customer["c_custkey"])
|
|
210
|
-
.select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
### 场景 3:从本地文件导入
|
|
217
|
-
|
|
218
|
-
```python
|
|
219
|
-
import json
|
|
220
|
-
import gzip
|
|
221
|
-
from clickzetta.zettapark.session import Session
|
|
222
|
-
|
|
223
|
-
session = Session.builder.configs(config).create()
|
|
224
|
-
|
|
225
|
-
# 读取本地 JSON 数据
|
|
226
|
-
data = []
|
|
227
|
-
with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
|
|
228
|
-
for line in f:
|
|
229
|
-
if line.strip():
|
|
230
|
-
data.append(json.loads(line))
|
|
231
|
-
|
|
232
|
-
# 创建 DataFrame 并写入
|
|
233
|
-
df = session.create_dataframe(data)
|
|
234
|
-
df.write.save_as_table("my_table", mode="overwrite")
|
|
235
|
-
|
|
236
|
-
session.close()
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
---
|
|
240
|
-
|
|
241
|
-
## 常见问题
|
|
242
|
-
|
|
243
|
-
| 问题 | 原因 | 解决方案 |
|
|
244
|
-
|---|---|---|
|
|
245
|
-
| `collect()` 超时 | 数据量过大或集群规格不足 | 在连接参数的 `hints` 中增大 `sdk.job.timeout`,或先用 `limit()` 缩小数据量测试 |
|
|
246
|
-
| `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
|
|
247
|
-
| 列名冲突(JOIN 后) | 两表有同名列 | 用 `df_left["col"]` 明确指定来源 |
|
|
248
|
-
| `save_as_table` 报错 | 表已存在且 mode 不对 | 使用 `mode="overwrite"` 或 `mode="append"` |
|
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
# ZettaPark 快速参考
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
|
|
4
|
-
|
|
5
|
-
## 安装
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
pip install clickzetta_zettapark_python -U -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
---
|
|
12
|
-
|
|
13
|
-
## 创建会话
|
|
14
|
-
|
|
15
|
-
```python
|
|
16
|
-
from clickzetta.zettapark.session import Session
|
|
17
|
-
|
|
18
|
-
connection_parameters = {
|
|
19
|
-
"username": "your_username",
|
|
20
|
-
"password": "your_password",
|
|
21
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
22
|
-
"instance": "your_instance_id",
|
|
23
|
-
"workspace": "your_workspace",
|
|
24
|
-
"schema": "public",
|
|
25
|
-
"vcluster": "default_ap",
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
带 hints(超时、query_tag 等):
|
|
32
|
-
|
|
33
|
-
```python
|
|
34
|
-
connection_parameters = {
|
|
35
|
-
"username": "your_username",
|
|
36
|
-
"password": "your_password",
|
|
37
|
-
"service": "cn-shanghai-alicloud.api.clickzetta.com",
|
|
38
|
-
"instance": "your_instance_id",
|
|
39
|
-
"workspace": "your_workspace",
|
|
40
|
-
"schema": "public",
|
|
41
|
-
"vcluster": "default_ap",
|
|
42
|
-
"hints": {
|
|
43
|
-
"sdk.job.timeout": 300,
|
|
44
|
-
"query_tag": "my_zettapark_app",
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
session = Session.builder.configs(connection_parameters).create()
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
从 JSON 配置文件读取:
|
|
52
|
-
|
|
53
|
-
```python
|
|
54
|
-
import json
|
|
55
|
-
with open('config.json', 'r') as f:
|
|
56
|
-
config = json.load(f)
|
|
57
|
-
session = Session.builder.configs(config).create()
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
验证连接:
|
|
61
|
-
|
|
62
|
-
```python
|
|
63
|
-
session.sql("SELECT current_user(), current_workspace(), current_vcluster()").show()
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
关闭会话:
|
|
67
|
-
|
|
68
|
-
```python
|
|
69
|
-
session.close()
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
---
|
|
73
|
-
|
|
74
|
-
## 构建 DataFrame
|
|
75
|
-
|
|
76
|
-
```python
|
|
77
|
-
# 从表创建
|
|
78
|
-
df = session.table("my_schema.my_table")
|
|
79
|
-
|
|
80
|
-
# 从 SQL 创建
|
|
81
|
-
df = session.sql("SELECT * FROM orders WHERE year = 2024")
|
|
82
|
-
|
|
83
|
-
# 从 Python 数据创建
|
|
84
|
-
df = session.create_dataframe([1, 2, 3, 4]).to_df("id")
|
|
85
|
-
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema=["id", "name"])
|
|
86
|
-
|
|
87
|
-
# 从 Row 对象创建
|
|
88
|
-
from clickzetta.zettapark import Row
|
|
89
|
-
df = session.create_dataframe([Row(id=1, name="Alice"), Row(id=2, name="Bob")])
|
|
90
|
-
|
|
91
|
-
# 带 Schema 创建
|
|
92
|
-
from clickzetta.zettapark.types import IntegerType, StringType, StructType, StructField
|
|
93
|
-
schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())])
|
|
94
|
-
df = session.create_dataframe([[1, "Alice"], [2, "Bob"]], schema)
|
|
95
|
-
|
|
96
|
-
# 范围序列
|
|
97
|
-
df = session.range(1, 10, 2).to_df("n") # 1,3,5,7,9
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
---
|
|
101
|
-
|
|
102
|
-
## DataFrame 转换操作
|
|
103
|
-
|
|
104
|
-
```python
|
|
105
|
-
from clickzetta.zettapark import functions as F
|
|
106
|
-
|
|
107
|
-
# 过滤行
|
|
108
|
-
df.filter(F.col("age") > 18)
|
|
109
|
-
df.filter(F.col("status") == "active")
|
|
110
|
-
df.where(F.col("amount") > 1000)
|
|
111
|
-
|
|
112
|
-
# 选择列
|
|
113
|
-
df.select("id", "name", "amount")
|
|
114
|
-
df.select(F.col("id"), F.col("name").as_("user_name"))
|
|
115
|
-
|
|
116
|
-
# 新增/修改列
|
|
117
|
-
df.with_column("total", F.col("price") * F.col("qty"))
|
|
118
|
-
df.with_column("upper_name", F.upper(F.col("name")))
|
|
119
|
-
|
|
120
|
-
# 重命名列
|
|
121
|
-
df.rename(F.col("old_name"), "new_name")
|
|
122
|
-
|
|
123
|
-
# 排序
|
|
124
|
-
df.sort(F.col("amount").desc())
|
|
125
|
-
df.order_by(F.col("created_at").asc())
|
|
126
|
-
|
|
127
|
-
# 去重
|
|
128
|
-
df.distinct()
|
|
129
|
-
df.drop_duplicates(["user_id"])
|
|
130
|
-
|
|
131
|
-
# 限制行数
|
|
132
|
-
df.limit(100)
|
|
133
|
-
|
|
134
|
-
# 删除列
|
|
135
|
-
df.drop("unnecessary_col")
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
---
|
|
139
|
-
|
|
140
|
-
## 聚合操作
|
|
141
|
-
|
|
142
|
-
```python
|
|
143
|
-
from clickzetta.zettapark import functions as F
|
|
144
|
-
|
|
145
|
-
# 分组聚合
|
|
146
|
-
df.group_by("category").agg(
|
|
147
|
-
F.sum("amount").as_("total_amount"),
|
|
148
|
-
F.count("*").as_("order_count"),
|
|
149
|
-
F.avg("price").as_("avg_price"),
|
|
150
|
-
F.max("amount").as_("max_amount"),
|
|
151
|
-
F.min("amount").as_("min_amount"),
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
# 全局聚合
|
|
155
|
-
df.agg(F.count("*"), F.sum("amount"))
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
---
|
|
159
|
-
|
|
160
|
-
## JOIN 操作
|
|
161
|
-
|
|
162
|
-
```python
|
|
163
|
-
# 内连接
|
|
164
|
-
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"])
|
|
165
|
-
|
|
166
|
-
# 左连接
|
|
167
|
-
df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"], "left")
|
|
168
|
-
|
|
169
|
-
# 选择连接后的列(避免列名冲突)
|
|
170
|
-
result = df_orders.join(df_customers, df_orders["customer_id"] == df_customers["id"]) \
|
|
171
|
-
.select(df_orders["order_id"], df_customers["name"], df_orders["amount"])
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
---
|
|
175
|
-
|
|
176
|
-
## 执行与结果获取
|
|
177
|
-
|
|
178
|
-
```python
|
|
179
|
-
# 打印前 N 行(触发执行)
|
|
180
|
-
df.show()
|
|
181
|
-
df.show(20)
|
|
182
|
-
|
|
183
|
-
# 收集所有结果为 Row 列表
|
|
184
|
-
rows = df.collect()
|
|
185
|
-
for row in rows:
|
|
186
|
-
print(row["id"], row["name"])
|
|
187
|
-
|
|
188
|
-
# 转换为 Pandas DataFrame
|
|
189
|
-
pandas_df = df.to_pandas()
|
|
190
|
-
|
|
191
|
-
# 获取行数
|
|
192
|
-
count = df.count()
|
|
193
|
-
|
|
194
|
-
# 获取列名
|
|
195
|
-
print(df.columns)
|
|
196
|
-
|
|
197
|
-
# 查看 Schema
|
|
198
|
-
df.schema.print_tree()
|
|
199
|
-
```
|
|
200
|
-
|
|
201
|
-
---
|
|
202
|
-
|
|
203
|
-
## 写入数据
|
|
204
|
-
|
|
205
|
-
```python
|
|
206
|
-
# 写入已有表(追加)
|
|
207
|
-
df.write.save_as_table("my_table", mode="append")
|
|
208
|
-
|
|
209
|
-
# 覆盖写入
|
|
210
|
-
df.write.save_as_table("my_table", mode="overwrite")
|
|
211
|
-
|
|
212
|
-
# 自动建表并写入(overwrite 会重建表)
|
|
213
|
-
df.write.save_as_table("new_table", mode="overwrite")
|
|
214
|
-
|
|
215
|
-
# 写入指定 Schema 下的表
|
|
216
|
-
df.write.save_as_table("my_schema.my_table", mode="append")
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
---
|
|
220
|
-
|
|
221
|
-
## 执行 SQL
|
|
222
|
-
|
|
223
|
-
```python
|
|
224
|
-
# 执行 DDL/DML
|
|
225
|
-
session.sql("CREATE TABLE IF NOT EXISTS t (id INT, name STRING)").collect()
|
|
226
|
-
session.sql("INSERT INTO t VALUES (1, 'Alice')").collect()
|
|
227
|
-
|
|
228
|
-
# 执行查询并获取 DataFrame
|
|
229
|
-
df = session.sql("SELECT * FROM orders WHERE amount > 1000")
|
|
230
|
-
df.show()
|
|
231
|
-
|
|
232
|
-
# 切换 Schema
|
|
233
|
-
session.use_schema("my_schema")
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
---
|
|
237
|
-
|
|
238
|
-
## 文件操作(Volume)
|
|
239
|
-
|
|
240
|
-
```python
|
|
241
|
-
# 上传文件到 User Volume
|
|
242
|
-
session.file.put("/local/path/data.csv", "volume:user://~/data/")
|
|
243
|
-
|
|
244
|
-
# 下载文件
|
|
245
|
-
session.file.get("volume:user://~/data/data.csv", "/local/output/")
|
|
246
|
-
|
|
247
|
-
# 列出 User Volume 文件
|
|
248
|
-
session.sql("LIST USER VOLUME").show()
|
|
249
|
-
session.sql("SHOW USER VOLUME DIRECTORY").show()
|
|
250
|
-
```
|
|
251
|
-
|
|
252
|
-
---
|
|
253
|
-
|
|
254
|
-
## 常用 functions 速查
|
|
255
|
-
|
|
256
|
-
```python
|
|
257
|
-
from clickzetta.zettapark import functions as F
|
|
258
|
-
|
|
259
|
-
# 字符串
|
|
260
|
-
F.upper(col), F.lower(col), F.concat(col1, col2)
|
|
261
|
-
F.substring(col, 1, 3), F.trim(col), F.length(col)
|
|
262
|
-
|
|
263
|
-
# 数值
|
|
264
|
-
F.abs(col), F.round(col, 2), F.floor(col), F.ceil(col)
|
|
265
|
-
F.sqrt(col), F.pow(col, 2)
|
|
266
|
-
|
|
267
|
-
# 日期时间
|
|
268
|
-
F.current_date(), F.current_timestamp()
|
|
269
|
-
F.year(col), F.month(col), F.day(col)
|
|
270
|
-
F.date_add(col, 7), F.datediff(col1, col2)
|
|
271
|
-
|
|
272
|
-
# 条件
|
|
273
|
-
F.when(F.col("status") == "A", "Active").otherwise("Inactive")
|
|
274
|
-
F.coalesce(col1, col2) # 第一个非 null 值
|
|
275
|
-
F.isnull(col), F.isnotnull(col)
|
|
276
|
-
|
|
277
|
-
# 聚合
|
|
278
|
-
F.count("*"), F.sum(col), F.avg(col), F.max(col), F.min(col)
|
|
279
|
-
F.count_distinct(col)
|
|
280
|
-
|
|
281
|
-
# 类型转换
|
|
282
|
-
F.col("amount").cast(IntegerType())
|
|
283
|
-
```
|