@clickzetta/cz-cli-linux-x64 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/bin/cz-cli +0 -0
  2. package/package.json +1 -1
  3. package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
  4. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
  5. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
  6. package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
  7. package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
  8. package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
  9. package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
  11. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
  12. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
  13. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
  14. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
  15. package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
  16. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
  17. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -457
  18. package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
  19. package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
  20. package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
  21. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
  22. package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
  23. package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
  24. package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
  25. package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
  26. package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
  27. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
  28. package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
  29. package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
  30. package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
  31. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
  32. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
  33. package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
  34. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
  35. package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
  36. package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
  37. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
  38. package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -112
  39. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
  40. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
  41. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
  42. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
  43. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
  44. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  45. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
  46. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
  47. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
  48. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
  49. package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
  50. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
  51. package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
  52. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
  53. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -156
  54. package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
  55. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
  56. package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
  57. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
  58. package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
  59. package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
  60. package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
  61. package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
  62. package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
  63. package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
  64. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
  65. package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
  66. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -639
  67. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
  68. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
  69. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
  70. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
  71. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
  72. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
  73. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
  74. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
  75. package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
  76. package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
  77. package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
  78. package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
  79. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
  80. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
  81. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -427
  82. package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
  83. package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
  84. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
  85. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
  86. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
  87. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
  88. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
  89. package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
  90. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
  91. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
  92. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
  93. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
  94. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -379
  95. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
  96. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
  97. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
  98. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
  99. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
  100. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
  101. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  102. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  103. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
  104. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  105. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  106. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
  107. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  108. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  109. package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
  110. package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
  111. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
  112. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
  113. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
  114. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
  115. package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
  116. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
  117. package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
  118. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
@@ -1,199 +0,0 @@
1
- # Volume 管理参考
2
-
3
- > 来源:https://www.yunqi.tech/documents/datalake_volume_object 等
4
-
5
- ## Volume 类型
6
-
7
- | 类型 | 说明 |
8
- |---|---|
9
- | 外部 Volume(External Volume) | 挂载 OSS/COS/S3 等对象存储路径 |
10
- | 内部 Volume(Internal Volume) | 系统托管存储,含 User Volume、Table Volume、命名 Volume |
11
-
12
- ---
13
-
14
- ## CREATE EXTERNAL VOLUME
15
-
16
- ```sql
17
- -- OSS(Connection 必须使用小写 access_id/access_key)
18
- CREATE EXTERNAL VOLUME my_oss_volume
19
- LOCATION 'oss://<bucket>/<path>'
20
- USING CONNECTION my_oss_conn
21
- DIRECTORY = (ENABLE = TRUE, AUTO_REFRESH = TRUE)
22
- RECURSIVE = TRUE;
23
-
24
- -- COS
25
- CREATE EXTERNAL VOLUME my_cos_volume
26
- LOCATION 'cos://<bucket>/<path>'
27
- USING CONNECTION my_cos_conn
28
- DIRECTORY = (ENABLE = TRUE)
29
- RECURSIVE = TRUE;
30
-
31
- -- S3
32
- CREATE EXTERNAL VOLUME my_s3_volume
33
- LOCATION 's3://<bucket>/<path>'
34
- USING CONNECTION my_s3_conn
35
- DIRECTORY = (ENABLE = TRUE)
36
- RECURSIVE = TRUE;
37
- ```
38
-
39
- 参数说明:
40
- - `LOCATION`:对象存储路径
41
- - `USING CONNECTION`:已创建的 STORAGE CONNECTION 名称
42
- - `DIRECTORY`:目录功能配置,`ENABLE=TRUE` 开启目录索引,`AUTO_REFRESH=TRUE` 自动刷新
43
- - `RECURSIVE`:是否递归扫描子目录
44
-
45
- > ⚠️ 上传新文件后,如果 `SHOW VOLUME DIRECTORY` 的结果中未显示该文件,请执行 `ALTER VOLUME <volume_name> REFRESH` 手动刷新目录元数据。
46
-
47
- ---
48
-
49
- ## ALTER VOLUME
50
-
51
- ```sql
52
- -- 刷新目录元数据
53
- ALTER VOLUME my_oss_volume REFRESH;
54
- ```
55
-
56
- ---
57
-
58
- ## DROP VOLUME
59
-
60
- ```sql
61
- DROP VOLUME IF EXISTS my_oss_volume;
62
- ```
63
-
64
- ---
65
-
66
- ## SHOW / DESC VOLUME
67
-
68
- ```sql
69
- -- 列出所有 Volume
70
- SHOW VOLUMES;
71
-
72
- -- 按条件过滤(SHOW VOLUMES 不支持 WHERE,使用 information_schema)
73
- SELECT volume_name, volume_type, volume_region, volume_creator
74
- FROM information_schema.volumes
75
- WHERE volume_type = 'EXTERNAL';
76
-
77
- -- 按名称查找
78
- SELECT * FROM information_schema.volumes
79
- WHERE volume_name = 'my_oss_volume';
80
-
81
- -- 查看 Volume 详情
82
- DESC VOLUME my_oss_volume;
83
-
84
- -- 查看 Volume 目录下的文件
85
- SHOW VOLUME DIRECTORY my_oss_volume;
86
- ```
87
-
88
- ---
89
-
90
- ## 查看目录元数据(DIRECTORY 函数)
91
-
92
- ```sql
93
- -- 查看 Volume 目录元数据(需先 ALTER VOLUME REFRESH)
94
- SELECT * FROM DIRECTORY(VOLUME my_oss_volume);
95
- ```
96
-
97
- ---
98
-
99
- ## User Volume 操作
100
-
101
- ```sql
102
- -- 查看 User Volume 文件列表
103
- SHOW USER VOLUME DIRECTORY;
104
-
105
- -- 上传文件到 User Volume 根目录
106
- PUT '/local/path/file.csv' TO USER VOLUME;
107
-
108
- -- 上传并指定目标路径
109
- PUT '/local/path/file.csv' TO USER VOLUME FILE 'subdir/file.csv';
110
-
111
- -- 通配符上传多个文件
112
- PUT '/local/path/images/*' TO USER VOLUME SUBDIRECTORY 'images/';
113
-
114
- -- 下载文件
115
- GET USER VOLUME FILE 'subdir/file.csv' TO '/local/output/';
116
-
117
- -- 删除文件
118
- REMOVE USER VOLUME FILE 'subdir/file.csv';
119
-
120
- -- 删除目录下所有文件
121
- REMOVE USER VOLUME SUBDIRECTORY '/';
122
- ```
123
-
124
- ---
125
-
126
- ## 从 Volume 查询数据(SELECT FROM VOLUME)
127
-
128
- ```sql
129
- -- 查询 CSV 文件
130
- SELECT * FROM VOLUME my_oss_volume
131
- USING CSV
132
- OPTIONS('header' = 'true', 'sep' = ',')
133
- SUBDIRECTORY 'data/'
134
- LIMIT 100;
135
-
136
- -- 查询 Parquet 文件
137
- SELECT * FROM VOLUME my_oss_volume
138
- USING PARQUET
139
- FILES('part-00001.parquet', 'part-00002.parquet');
140
-
141
- -- 正则匹配文件
142
- SELECT * FROM VOLUME my_oss_volume
143
- USING PARQUET
144
- REGEXP '.*2024-0[1-3].parquet';
145
-
146
- -- 查询 User Volume 文件
147
- SELECT * FROM USER VOLUME
148
- USING CSV
149
- OPTIONS('header' = 'true')
150
- FILES('data.csv')
151
- LIMIT 10;
152
- ```
153
-
154
- 支持格式:`CSV`、`PARQUET`、`ORC`、`JSON`、`BSON`
155
-
156
- CSV OPTIONS 常用参数:
157
- - `header`:是否有表头,默认 `false`
158
- - `sep`:列分隔符,默认 `,`
159
- - `compression`:压缩格式(gzip/zstd/zlib)
160
- - `multiLine`:是否支持多行字段,默认 `false`
161
-
162
- ---
163
-
164
- ## COPY INTO TABLE(从 Volume 导入)
165
-
166
- ```sql
167
- COPY INTO my_table
168
- FROM VOLUME my_oss_volume
169
- USING CSV
170
- OPTIONS('header' = 'true')
171
- SUBDIRECTORY 'data/';
172
- ```
173
-
174
- ## COPY INTO VOLUME(导出到 Volume)
175
-
176
- ```sql
177
- -- 导出表到 External Volume
178
- COPY INTO VOLUME my_oss_volume
179
- SUBDIRECTORY 'export/'
180
- FROM TABLE my_table
181
- FILE_FORMAT = (TYPE = CSV);
182
-
183
- -- 导出查询结果
184
- COPY INTO VOLUME my_oss_volume
185
- SUBDIRECTORY 'export/'
186
- FROM (SELECT * FROM orders WHERE year = 2024)
187
- FILE_FORMAT = (TYPE = PARQUET COMPRESSION = 'GZIP');
188
-
189
- -- 导出到 User Volume
190
- COPY INTO USER VOLUME
191
- SUBDIRECTORY 'export/'
192
- FROM TABLE my_table
193
- FILE_FORMAT = (TYPE = CSV);
194
- ```
195
-
196
- > ⚠️ **关键区分**:
197
- > - **导入**(COPY INTO TABLE / SELECT FROM VOLUME):用 `USING CSV/PARQUET/JSON` + `OPTIONS(...)`
198
- > - **导出**(COPY INTO VOLUME):用 `FILE_FORMAT = (TYPE = CSV/PARQUET/JSON)`
199
- > - 两者语法不可混用!
@@ -1,248 +0,0 @@
1
- ---
2
- name: clickzetta-zettapark
3
- description: |
4
- 使用 ZettaPark Python 库操作 ClickZetta Lakehouse 数据。ZettaPark 提供类 pandas 的
5
- DataFrame API,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行。
6
- 覆盖 Session 创建、DataFrame 构建与转换(filter/select/join/groupBy)、
7
- 结果收集(collect/to_pandas/show)、写入表(save_as_table)、
8
- 文件操作(PUT/GET)、执行 SQL 等完整工作流。
9
- 当用户说"ZettaPark"、"zettapark"、"DataFrame API"、"Python 操作 Lakehouse"、
10
- "save_as_table"、"session.table"、"session.sql"、"collect()"、"to_pandas"、
11
- "Python 数据工程"、"Python 写入 Lakehouse"、"Python 读取 Lakehouse"、
12
- "clickzetta_zettapark_python"时触发。
13
- Keywords: ZettaPark, DataFrame, pandas-like, Python, SQL translation, distributed compute
14
- ---
15
-
16
- # ClickZetta ZettaPark
17
-
18
- ZettaPark 是 ClickZetta Lakehouse 的 Python DataFrame 框架,将 Python 操作翻译为 SQL 在 Lakehouse 中分布式执行,提供类 pandas 的开发体验。
19
-
20
- 阅读 [references/zettapark-api.md](references/zettapark-api.md) 了解完整 API。
21
-
22
- ## 安装
23
-
24
- > ⚠️ **Python 版本要求**:推荐 **Python 3.12**(最低 3.10,不支持 3.9 及以下)
25
-
26
- ```bash
27
- # 方式 1:venv(Python 内置,推荐)
28
- python3.12 -m venv .venv
29
- source .venv/bin/activate # macOS/Linux | .venv\Scripts\activate (Windows)
30
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
31
-
32
- # 方式 2:pyenv(需要切换 Python 版本时)
33
- pyenv install 3.12.9 && pyenv local 3.12.9
34
- python -m venv .venv && source .venv/bin/activate
35
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
36
-
37
- # 方式 3:conda(数据科学环境)
38
- conda create -n lakehouse python=3.12 -y && conda activate lakehouse
39
- pip install clickzetta_zettapark_python -i https://pypi.tuna.tsinghua.edu.cn/simple
40
- ```
41
-
42
- ---
43
-
44
- ## 创建会话
45
-
46
- ```python
47
- from clickzetta.zettapark.session import Session
48
-
49
- connection_parameters = {
50
- "username": "your_username",
51
- "password": "your_password",
52
- "service": "cn-shanghai-alicloud.api.clickzetta.com",
53
- "instance": "your_instance_id",
54
- "workspace": "your_workspace",
55
- "schema": "public",
56
- "vcluster": "default_ap",
57
- }
58
-
59
- session = Session.builder.configs(connection_parameters).create()
60
-
61
- # 验证连接
62
- session.sql("SELECT current_user(), current_workspace()").show()
63
- ```
64
-
65
- ---
66
-
67
- ## 核心工作流
68
-
69
- ### 读取数据
70
-
71
- ```python
72
- from clickzetta.zettapark import functions as F
73
-
74
- # 从表读取
75
- df = session.table("orders")
76
- df = session.table("my_schema.orders")
77
-
78
- # 从 SQL 读取
79
- df = session.sql("SELECT * FROM orders WHERE year = 2024")
80
-
81
- # 从 Python 数据创建
82
- df = session.create_dataframe([[1, "Alice", 100.0], [2, "Bob", 200.0]],
83
- schema=["id", "name", "amount"])
84
- ```
85
-
86
- ### 转换数据
87
-
88
- ```python
89
- # 过滤、选择、新增列
90
- result = (
91
- session.table("orders")
92
- .filter(F.col("status") == "completed")
93
- .select("order_id", "customer_id", "amount")
94
- .with_column("tax", F.col("amount") * 0.1)
95
- .sort(F.col("amount").desc())
96
- .limit(100)
97
- )
98
- ```
99
-
100
- ### 聚合
101
-
102
- ```python
103
- summary = (
104
- session.table("orders")
105
- .group_by("category")
106
- .agg(
107
- F.sum("amount").as_("total"),
108
- F.count("*").as_("cnt"),
109
- F.avg("amount").as_("avg_amount"),
110
- )
111
- )
112
- summary.show()
113
- ```
114
-
115
- ### JOIN
116
-
117
- ```python
118
- orders = session.table("orders")
119
- customers = session.table("customers")
120
-
121
- result = orders.join(
122
- customers,
123
- orders["customer_id"] == customers["id"],
124
- "left"
125
- ).select(
126
- orders["order_id"],
127
- customers["name"],
128
- orders["amount"]
129
- )
130
- ```
131
-
132
- ### 写入数据
133
-
134
- ```python
135
- # 追加到已有表
136
- df.write.save_as_table("result_table", mode="append")
137
-
138
- # 覆盖写入(自动建表)
139
- df.write.save_as_table("result_table", mode="overwrite")
140
- ```
141
-
142
- ### 获取结果
143
-
144
- ```python
145
- # 打印预览
146
- df.show(20)
147
-
148
- # 收集为 Row 列表
149
- rows = df.collect()
150
- for row in rows:
151
- print(row["id"], row["name"])
152
-
153
- # 转为 Pandas DataFrame(小数据量)
154
- pandas_df = df.to_pandas()
155
-
156
- # 获取行数
157
- print(df.count())
158
- ```
159
-
160
- ---
161
-
162
- ## 典型场景
163
-
164
- ### 场景 1:ETL 数据处理
165
-
166
- ```python
167
- from clickzetta.zettapark.session import Session
168
- from clickzetta.zettapark import functions as F
169
-
170
- session = Session.builder.configs(config).create()
171
-
172
- # 读取原始数据
173
- raw = session.table("bronze.raw_orders")
174
-
175
- # 清洗转换
176
- cleaned = (
177
- raw
178
- .filter(F.isnotnull(F.col("order_id")))
179
- .filter(F.col("amount") > 0)
180
- .with_column("order_date", F.col("created_at").cast("DATE"))
181
- .with_column("year_month", F.date_format(F.col("order_date"), "yyyy-MM"))
182
- .select("order_id", "customer_id", "amount", "order_date", "year_month")
183
- )
184
-
185
- # 写入 Silver 层
186
- cleaned.write.save_as_table("silver.orders_cleaned", mode="overwrite")
187
-
188
- session.close()
189
- ```
190
-
191
- ### 场景 2:特征工程(机器学习)
192
-
193
- ```python
194
- from clickzetta.zettapark import functions as F
195
-
196
- customer = session.table("clickzetta_sample_data.tpch_100g.customer")
197
- orders = session.table("clickzetta_sample_data.tpch_100g.orders")
198
-
199
- # 构建客户消费特征
200
- customer_features = (
201
- orders
202
- .group_by("o_custkey")
203
- .agg(
204
- F.sum("o_totalprice").as_("total_spend"),
205
- F.count("*").as_("order_count"),
206
- F.avg("o_totalprice").as_("avg_order_value"),
207
- F.max("o_orderdate").as_("last_order_date"),
208
- )
209
- .join(customer, orders["o_custkey"] == customer["c_custkey"])
210
- .select("c_custkey", "c_name", "total_spend", "order_count", "avg_order_value")
211
- )
212
-
213
- customer_features.write.save_as_table("ml_features.customer_features", mode="overwrite")
214
- ```
215
-
216
- ### 场景 3:从本地文件导入
217
-
218
- ```python
219
- import json
220
- import gzip
221
- from clickzetta.zettapark.session import Session
222
-
223
- session = Session.builder.configs(config).create()
224
-
225
- # 读取本地 JSON 数据
226
- data = []
227
- with gzip.open('data.json.gz', 'rt', encoding='utf-8') as f:
228
- for line in f:
229
- if line.strip():
230
- data.append(json.loads(line))
231
-
232
- # 创建 DataFrame 并写入
233
- df = session.create_dataframe(data)
234
- df.write.save_as_table("my_table", mode="overwrite")
235
-
236
- session.close()
237
- ```
238
-
239
- ---
240
-
241
- ## 常见问题
242
-
243
- | 问题 | 原因 | 解决方案 |
244
- |---|---|---|
245
- | `collect()` 超时 | 数据量过大或集群规格不足 | 增大 `sdk.job.timeout`,或先 `limit()` 测试 |
246
- | `to_pandas()` 内存溢出 | 结果集过大 | 先聚合/过滤再转 pandas,或分批处理 |
247
- | 列名冲突(JOIN 后) | 两表有同名列 | 用 `df_left["col"]` 明确指定来源 |
248
- | `save_as_table` 报错 | 表已存在且 `mode` 参数不匹配 | 追加到已有表用 `mode="append"`,覆盖写入用 `mode="overwrite"` |