@clickzetta/cz-cli-linux-x64 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
- package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
- package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
- package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
- package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
- package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -457
- package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
- package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
- package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
- package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
- package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
- package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
- package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
- package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
- package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
- package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
- package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
- package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -112
- package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
- package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
- package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
- package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
- package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
- package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
- package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -156
- package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
- package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
- package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
- package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
- package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
- package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
- package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
- package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
- package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
- package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -639
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
- package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
- package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
- package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
- package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
- package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
- package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
- package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
- package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
- package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
- package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
- package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
- package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
- package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -427
- package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
- package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
- package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
- package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
- package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
- package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
- package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -379
- package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
- package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
- package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
- package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
- package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
- package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
- package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
- package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
- package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
- package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
- package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
- package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
# 环境搭建与项目配置
|
|
2
|
-
|
|
3
|
-
## 环境搭建
|
|
4
|
-
|
|
5
|
-
```bash
|
|
6
|
-
# 方式 1:venv(推荐)
|
|
7
|
-
python3.12 -m venv .venv
|
|
8
|
-
source .venv/bin/activate # macOS/Linux
|
|
9
|
-
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
10
|
-
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
11
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
12
|
-
|
|
13
|
-
# 方式 2:pyenv(需要切换 Python 版本时)
|
|
14
|
-
pyenv install 3.12.9 && pyenv local 3.12.9
|
|
15
|
-
python -m venv .venv && source .venv/bin/activate
|
|
16
|
-
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
17
|
-
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
18
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
19
|
-
|
|
20
|
-
# 方式 3:conda
|
|
21
|
-
conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
|
|
22
|
-
pip install clickzetta_zettapark_python clickzetta-connector-python \
|
|
23
|
-
python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
|
|
24
|
-
-i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
| 问题 | 修复 |
|
|
28
|
-
|------|------|
|
|
29
|
-
| Python 版本过低(3.8/3.9,ZettaPark 需要 3.10+) | `pyenv install 3.12.9` 或 `python3.12 -m venv .venv` |
|
|
30
|
-
| `pyarrow` 版本冲突 | `pip install pyarrow==14.0.0` |
|
|
31
|
-
| M1/M2 Mac 报错 | `pip install --no-binary :all: <报错的包名>`(从源码编译该包)或改用 conda |
|
|
32
|
-
| 连接超时 | VCluster 未启动,在 Studio 中手动启动 |
|
|
33
|
-
|
|
34
|
-
---
|
|
35
|
-
|
|
36
|
-
## Jupyter Kernel 配置
|
|
37
|
-
|
|
38
|
-
```bash
|
|
39
|
-
# 注册 venv 为 Jupyter kernel(关键步骤,否则 notebook 用系统 Python)
|
|
40
|
-
source .venv/bin/activate
|
|
41
|
-
pip install ipykernel jupyterlab
|
|
42
|
-
python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
|
|
43
|
-
|
|
44
|
-
# 启动 JupyterLab
|
|
45
|
-
jupyter lab --port=8888
|
|
46
|
-
```
|
|
47
|
-
|
|
48
|
-
VS Code / Cursor:打开 `.ipynb` → 右上角 "Select Kernel" → 选 "Python (lakehouse-ds)"
|
|
49
|
-
|
|
50
|
-
| 问题 | 修复 |
|
|
51
|
-
|------|------|
|
|
52
|
-
| `ModuleNotFoundError: clickzetta` | kernel 未选对,切换到注册的 venv kernel |
|
|
53
|
-
| `.env` 读不到 | `load_dotenv(dotenv_path='../.env')` 指定路径 |
|
|
54
|
-
| `to_pandas()` OOM | 加 `TABLESAMPLE ROW(1)` 或 `LIMIT` |
|
|
55
|
-
| 图表不显示 | notebook 开头加 `%matplotlib inline` |
|
|
56
|
-
|
|
57
|
-
---
|
|
58
|
-
|
|
59
|
-
## src/config.py 模板
|
|
60
|
-
|
|
61
|
-
```python
|
|
62
|
-
import os, sys
|
|
63
|
-
from pathlib import Path
|
|
64
|
-
from dotenv import load_dotenv
|
|
65
|
-
from clickzetta.zettapark.session import Session
|
|
66
|
-
import clickzetta
|
|
67
|
-
|
|
68
|
-
# 多位置查找 .env
|
|
69
|
-
for _p in [
|
|
70
|
-
Path(__file__).parent.parent / ".env",
|
|
71
|
-
Path.home() / ".config" / "kilo" / ".env",
|
|
72
|
-
Path.home() / ".czcode" / ".env",
|
|
73
|
-
Path.home() / ".env",
|
|
74
|
-
]:
|
|
75
|
-
if _p.exists():
|
|
76
|
-
load_dotenv(dotenv_path=_p)
|
|
77
|
-
break
|
|
78
|
-
|
|
79
|
-
def check_environment():
|
|
80
|
-
"""在 00-env-check.ipynb 里调用,打印环境诊断。"""
|
|
81
|
-
ver = sys.version_info
|
|
82
|
-
if ver < (3, 10):
|
|
83
|
-
raise RuntimeError(
|
|
84
|
-
f"Python {ver.major}.{ver.minor} 不满足要求。ZettaPark 需要 Python 3.10+。\n"
|
|
85
|
-
"升级:brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
|
|
86
|
-
)
|
|
87
|
-
print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
|
|
88
|
-
for pkg, mod in [
|
|
89
|
-
("clickzetta_zettapark_python", "clickzetta.zettapark"),
|
|
90
|
-
("clickzetta-connector-python", "clickzetta"),
|
|
91
|
-
("pandas", "pandas"), ("python-dotenv", "dotenv"),
|
|
92
|
-
]:
|
|
93
|
-
try:
|
|
94
|
-
m = __import__(mod.split(".")[0])
|
|
95
|
-
print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
|
|
96
|
-
except ImportError:
|
|
97
|
-
print(f"❌ {pkg}: 未安装 → pip install {pkg}")
|
|
98
|
-
try:
|
|
99
|
-
s = get_session()
|
|
100
|
-
print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
|
|
101
|
-
except Exception as e:
|
|
102
|
-
print(f"❌ Lakehouse 连接失败: {e}")
|
|
103
|
-
|
|
104
|
-
def get_session() -> Session:
|
|
105
|
-
return Session.builder.configs({
|
|
106
|
-
"service": os.environ["CLICKZETTA_SERVICE"],
|
|
107
|
-
"instance": os.environ["CLICKZETTA_INSTANCE"],
|
|
108
|
-
"workspace": os.environ["CLICKZETTA_WORKSPACE"],
|
|
109
|
-
"username": os.environ["CLICKZETTA_USERNAME"],
|
|
110
|
-
"password": os.environ["CLICKZETTA_PASSWORD"],
|
|
111
|
-
"vcluster": os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
|
|
112
|
-
"schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
113
|
-
}).create()
|
|
114
|
-
|
|
115
|
-
def get_connector_connection():
|
|
116
|
-
"""仅用于 pd.read_sql。禁止用于 df.to_sql()。"""
|
|
117
|
-
return clickzetta.connect(
|
|
118
|
-
service=os.environ["CLICKZETTA_SERVICE"],
|
|
119
|
-
instance=os.environ["CLICKZETTA_INSTANCE"],
|
|
120
|
-
workspace=os.environ["CLICKZETTA_WORKSPACE"],
|
|
121
|
-
username=os.environ["CLICKZETTA_USERNAME"],
|
|
122
|
-
password=os.environ["CLICKZETTA_PASSWORD"],
|
|
123
|
-
vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
|
|
124
|
-
schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
125
|
-
)
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
---
|
|
129
|
-
|
|
130
|
-
## .env 模板
|
|
131
|
-
|
|
132
|
-
```bash
|
|
133
|
-
CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com
|
|
134
|
-
CLICKZETTA_INSTANCE=<instance-id>
|
|
135
|
-
CLICKZETTA_WORKSPACE=<workspace>
|
|
136
|
-
CLICKZETTA_USERNAME=<username>
|
|
137
|
-
CLICKZETTA_PASSWORD=<password>
|
|
138
|
-
CLICKZETTA_VCLUSTER=default_ap
|
|
139
|
-
CLICKZETTA_SCHEMA=ds_workspace
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
## pyproject.toml
|
|
143
|
-
|
|
144
|
-
```toml
|
|
145
|
-
[project]
|
|
146
|
-
name = "my-lakehouse-ds-project"
|
|
147
|
-
requires-python = ">=3.10"
|
|
148
|
-
dependencies = [
|
|
149
|
-
"clickzetta_zettapark_python>=0.1.2",
|
|
150
|
-
"clickzetta-connector-python>=1.0.0",
|
|
151
|
-
"python-dotenv>=1.0.0",
|
|
152
|
-
"pandas>=2.0.0",
|
|
153
|
-
"numpy>=1.24.0",
|
|
154
|
-
"scikit-learn>=1.3.0",
|
|
155
|
-
"pyarrow>=14.0.0",
|
|
156
|
-
"jupyterlab>=4.0.0",
|
|
157
|
-
"matplotlib>=3.7.0",
|
|
158
|
-
"seaborn>=0.12.0",
|
|
159
|
-
]
|
|
160
|
-
```
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
# 数据科学统计分析函数参考
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## 近似聚合函数(大表高效统计)
|
|
6
|
-
|
|
7
|
-
### approx_count_distinct — 近似 UV
|
|
8
|
-
|
|
9
|
-
```sql
|
|
10
|
-
-- 使用 HyperLogLog 算法,误差约 2%,比 COUNT(DISTINCT) 快 10x+
|
|
11
|
-
SELECT approx_count_distinct(user_id) AS approx_uv
|
|
12
|
-
FROM my_schema.events;
|
|
13
|
-
|
|
14
|
-
-- 按天统计 DAU
|
|
15
|
-
SELECT
|
|
16
|
-
DATE(event_time) AS dt,
|
|
17
|
-
approx_count_distinct(user_id) AS dau
|
|
18
|
-
FROM my_schema.events
|
|
19
|
-
GROUP BY 1
|
|
20
|
-
ORDER BY 1;
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
### approx_percentile — 近似分位数
|
|
24
|
-
|
|
25
|
-
```sql
|
|
26
|
-
-- 中位数、四分位数、P95、P99
|
|
27
|
-
SELECT
|
|
28
|
-
approx_percentile(amount, 0.25) AS p25,
|
|
29
|
-
approx_percentile(amount, 0.50) AS median,
|
|
30
|
-
approx_percentile(amount, 0.75) AS p75,
|
|
31
|
-
approx_percentile(amount, 0.95) AS p95,
|
|
32
|
-
approx_percentile(amount, 0.99) AS p99
|
|
33
|
-
FROM my_schema.orders;
|
|
34
|
-
|
|
35
|
-
-- 分组分位数
|
|
36
|
-
SELECT
|
|
37
|
-
category,
|
|
38
|
-
approx_percentile(price, 0.5) AS median_price
|
|
39
|
-
FROM my_schema.products
|
|
40
|
-
GROUP BY category;
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
### approx_histogram — 近似直方图
|
|
44
|
-
|
|
45
|
-
```sql
|
|
46
|
-
-- 返回结构体数组:[{min, max, count}, ...]
|
|
47
|
-
SELECT approx_histogram(amount, 10) AS hist
|
|
48
|
-
FROM my_schema.orders;
|
|
49
|
-
|
|
50
|
-
-- 解析直方图(展开为行)
|
|
51
|
-
SELECT
|
|
52
|
-
bucket.min AS bucket_min,
|
|
53
|
-
bucket.max AS bucket_max,
|
|
54
|
-
bucket.count AS bucket_count
|
|
55
|
-
FROM (
|
|
56
|
-
SELECT EXPLODE(approx_histogram(amount, 10)) AS bucket
|
|
57
|
-
FROM my_schema.orders
|
|
58
|
-
);
|
|
59
|
-
```
|
|
60
|
-
|
|
61
|
-
### approx_top_k — 近似 TOP-K 高频值
|
|
62
|
-
|
|
63
|
-
```sql
|
|
64
|
-
-- 找出出现最多的前 10 个城市
|
|
65
|
-
SELECT approx_top_k(city, 10) AS top_cities
|
|
66
|
-
FROM my_schema.orders;
|
|
67
|
-
|
|
68
|
-
-- 返回结构体数组:[{value, count}, ...]
|
|
69
|
-
-- 解析展开(字段名是 value 和 count)
|
|
70
|
-
SELECT item.value AS city, item.count AS cnt
|
|
71
|
-
FROM (
|
|
72
|
-
SELECT EXPLODE(approx_top_k(city, 10)) AS item
|
|
73
|
-
FROM my_schema.orders
|
|
74
|
-
)
|
|
75
|
-
ORDER BY cnt DESC;
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
---
|
|
79
|
-
|
|
80
|
-
## 精确统计函数
|
|
81
|
-
|
|
82
|
-
### percentile / median
|
|
83
|
-
|
|
84
|
-
```sql
|
|
85
|
-
-- 精确中位数(小表用,大表用 approx_percentile)
|
|
86
|
-
SELECT
|
|
87
|
-
percentile(amount, 0.5) AS exact_median,
|
|
88
|
-
median(amount) AS median_alias -- 等价写法
|
|
89
|
-
FROM my_schema.orders;
|
|
90
|
-
|
|
91
|
-
-- 多分位数
|
|
92
|
-
SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
|
|
93
|
-
FROM my_schema.orders;
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
---
|
|
97
|
-
|
|
98
|
-
## TABLESAMPLE 采样
|
|
99
|
-
|
|
100
|
-
```sql
|
|
101
|
-
-- ROW 模式:精确行级采样(适合 ML 训练集,< 1000万行)
|
|
102
|
-
SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- 精确 10%
|
|
103
|
-
SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- 精确 5 行
|
|
104
|
-
|
|
105
|
-
-- SYSTEM 模式:文件级采样(适合大表快速预览,> 1000万行)
|
|
106
|
-
SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- 约 0.1%
|
|
107
|
-
|
|
108
|
-
-- 分层采样(按类别等比例采样)
|
|
109
|
-
SELECT * FROM (
|
|
110
|
-
SELECT *,
|
|
111
|
-
ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
|
|
112
|
-
COUNT(*) OVER (PARTITION BY category) AS cat_total
|
|
113
|
-
FROM my_schema.products
|
|
114
|
-
)
|
|
115
|
-
WHERE rn <= CEIL(cat_total * 0.1); -- 每类取 10%
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
| 场景 | 推荐模式 | 说明 |
|
|
119
|
-
|---|---|---|
|
|
120
|
-
| 快速数据预览 | SYSTEM | 极快,适合 > 1000万行 |
|
|
121
|
-
| ML 训练集构建 | ROW | 精确随机,保证代表性 |
|
|
122
|
-
| 数据质量抽检 | SYSTEM | 快速抽样验证 |
|
|
123
|
-
| 统计分析 | ROW | 精确概率采样 |
|
|
124
|
-
|
|
125
|
-
> ⚠️ **注意**:TABLESAMPLE 在小表(< 数万行)上可能返回全部数据,百分比采样不精确。小表直接用 `LIMIT` 即可。
|
|
126
|
-
|
|
127
|
-
---
|
|
128
|
-
|
|
129
|
-
## 窗口函数(时序/排名特征)
|
|
130
|
-
|
|
131
|
-
```sql
|
|
132
|
-
-- 移动平均(7日)
|
|
133
|
-
SELECT
|
|
134
|
-
dt,
|
|
135
|
-
revenue,
|
|
136
|
-
AVG(revenue) OVER (
|
|
137
|
-
ORDER BY dt
|
|
138
|
-
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
|
|
139
|
-
) AS revenue_7d_ma
|
|
140
|
-
FROM daily_stats;
|
|
141
|
-
|
|
142
|
-
-- 环比增长率
|
|
143
|
-
SELECT
|
|
144
|
-
dt,
|
|
145
|
-
revenue,
|
|
146
|
-
LAG(revenue, 1) OVER (ORDER BY dt) AS prev_revenue,
|
|
147
|
-
ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY dt))
|
|
148
|
-
/ NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
|
|
149
|
-
FROM daily_stats;
|
|
150
|
-
|
|
151
|
-
-- 用户行为排名(RFM 分析)
|
|
152
|
-
SELECT
|
|
153
|
-
user_id,
|
|
154
|
-
total_amount,
|
|
155
|
-
NTILE(5) OVER (ORDER BY total_amount DESC) AS monetary_quintile,
|
|
156
|
-
NTILE(5) OVER (ORDER BY order_cnt DESC) AS frequency_quintile,
|
|
157
|
-
NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
|
|
158
|
-
FROM user_rfm;
|
|
159
|
-
|
|
160
|
-
-- 去重保留最新(数据清洗常用)
|
|
161
|
-
SELECT * FROM (
|
|
162
|
-
SELECT *,
|
|
163
|
-
ROW_NUMBER() OVER (
|
|
164
|
-
PARTITION BY user_id
|
|
165
|
-
ORDER BY update_time DESC
|
|
166
|
-
) AS rn
|
|
167
|
-
FROM my_schema.users_raw
|
|
168
|
-
) WHERE rn = 1;
|
|
169
|
-
```
|
|
170
|
-
|
|
171
|
-
---
|
|
172
|
-
|
|
173
|
-
## 数据质量检查模板
|
|
174
|
-
|
|
175
|
-
```sql
|
|
176
|
-
-- 一次性输出所有关键质量指标
|
|
177
|
-
SELECT
|
|
178
|
-
COUNT(*) AS total_rows,
|
|
179
|
-
COUNT(DISTINCT user_id) AS unique_users,
|
|
180
|
-
-- 缺失率
|
|
181
|
-
ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
|
|
182
|
-
/ COUNT(*), 2) AS user_id_null_pct,
|
|
183
|
-
ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
|
|
184
|
-
/ COUNT(*), 2) AS amount_null_pct,
|
|
185
|
-
-- 异常值
|
|
186
|
-
SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
|
|
187
|
-
SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
|
|
188
|
-
-- 时间范围
|
|
189
|
-
MIN(order_date) AS earliest_date,
|
|
190
|
-
MAX(order_date) AS latest_date,
|
|
191
|
-
-- 分布
|
|
192
|
-
approx_percentile(amount, 0.5) AS median_amount,
|
|
193
|
-
approx_percentile(amount, 0.99) AS p99_amount
|
|
194
|
-
FROM my_schema.orders;
|
|
195
|
-
```
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
# 数据写入、特征工程、模型推理示例
|
|
2
|
-
|
|
3
|
-
## 数据写入
|
|
4
|
-
|
|
5
|
-
| 场景 | 方式 |
|
|
6
|
-
|------|------|
|
|
7
|
-
| ZettaPark 可用(Python 3.10+) | `save_as_table()` 或 `create_dataframe().write` |
|
|
8
|
-
| 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
|
|
9
|
-
| Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT(见下方) |
|
|
10
|
-
| **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
|
|
11
|
-
|
|
12
|
-
```python
|
|
13
|
-
# 方式 A:ZettaPark(推荐)
|
|
14
|
-
session.sql("""
|
|
15
|
-
SELECT o.*, u.age_group FROM my_schema.orders_raw o
|
|
16
|
-
LEFT JOIN my_schema.users u ON o.user_id = u.user_id
|
|
17
|
-
WHERE o.amount > 0
|
|
18
|
-
""").write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
|
|
19
|
-
|
|
20
|
-
# 方式 B:pandas → Lakehouse
|
|
21
|
-
session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
|
|
22
|
-
|
|
23
|
-
# 方式 C:cursor 批量 INSERT(fallback)
|
|
24
|
-
import clickzetta, os
|
|
25
|
-
conn = clickzetta.connect(
|
|
26
|
-
service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
|
|
27
|
-
workspace=os.environ["CLICKZETTA_WORKSPACE"], username=os.environ["CLICKZETTA_USERNAME"],
|
|
28
|
-
password=os.environ["CLICKZETTA_PASSWORD"],
|
|
29
|
-
vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
|
|
30
|
-
schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
31
|
-
)
|
|
32
|
-
cursor = conn.cursor()
|
|
33
|
-
cursor.execute("CREATE TABLE IF NOT EXISTS ds_workspace.my_table (col1 STRING, col2 BIGINT, col3 DOUBLE)")
|
|
34
|
-
rows = local_df.values.tolist()
|
|
35
|
-
for i in range(0, len(rows), 500):
|
|
36
|
-
batch = rows[i:i+500]
|
|
37
|
-
vals = ",".join(f"({','.join(repr(v) for v in row)})" for row in batch)
|
|
38
|
-
cursor.execute(f"INSERT INTO ds_workspace.my_table VALUES {vals}")
|
|
39
|
-
conn.close()
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
```sql
|
|
43
|
-
-- 设置中间表生命周期(30 天自动清理)
|
|
44
|
-
ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
---
|
|
48
|
-
|
|
49
|
-
## 特征工程
|
|
50
|
-
|
|
51
|
-
```sql
|
|
52
|
-
-- SQL 侧(利用 Lakehouse 算力,推荐)
|
|
53
|
-
SELECT
|
|
54
|
-
user_id,
|
|
55
|
-
COUNT(*) AS order_cnt_30d,
|
|
56
|
-
SUM(amount) AS total_amount_30d,
|
|
57
|
-
AVG(amount) AS avg_amount_30d,
|
|
58
|
-
STDDEV(amount) AS std_amount_30d,
|
|
59
|
-
DATEDIFF('day', MIN(order_date), MAX(order_date)) AS active_days,
|
|
60
|
-
COUNT(DISTINCT DATE(order_date)) AS active_day_cnt,
|
|
61
|
-
NTILE(10) OVER (ORDER BY SUM(amount) DESC) AS revenue_decile
|
|
62
|
-
FROM my_schema.orders
|
|
63
|
-
WHERE order_date >= CURRENT_DATE - INTERVAL 30 DAY
|
|
64
|
-
GROUP BY user_id;
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
```python
|
|
68
|
-
# ZettaPark 侧(Python 逻辑)
|
|
69
|
-
from clickzetta.zettapark.functions import col, when
|
|
70
|
-
|
|
71
|
-
features = session.table("ds_workspace.orders_clean") \
|
|
72
|
-
.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
|
|
73
|
-
|
|
74
|
-
df = features.to_pandas()
|
|
75
|
-
|
|
76
|
-
from sklearn.preprocessing import StandardScaler
|
|
77
|
-
df[['amount_scaled']] = StandardScaler().fit_transform(df[['amount']])
|
|
78
|
-
|
|
79
|
-
session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace.features_final")
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
---
|
|
83
|
-
|
|
84
|
-
## 模型推理上线
|
|
85
|
-
|
|
86
|
-
### BITMAP 用户画像
|
|
87
|
-
|
|
88
|
-
```sql
|
|
89
|
-
CREATE TABLE ds_workspace.user_tags AS
|
|
90
|
-
SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
|
|
91
|
-
FROM my_schema.user_behavior GROUP BY tag_name;
|
|
92
|
-
|
|
93
|
-
-- 人群交集
|
|
94
|
-
SELECT bitmap_count(bitmap_and(
|
|
95
|
-
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
|
|
96
|
-
(SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
|
|
97
|
-
)) AS target_user_count;
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
### SQL UDF 批量推理
|
|
101
|
-
|
|
102
|
-
```sql
|
|
103
|
-
-- 调用已部署的模型 UDF(必须用完整 schema 路径)
|
|
104
|
-
INSERT INTO ds_workspace.predictions
|
|
105
|
-
SELECT user_id,
|
|
106
|
-
ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
|
|
107
|
-
CURRENT_TIMESTAMP() AS predict_time
|
|
108
|
-
FROM ds_workspace.features_final;
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
### 向量检索
|
|
112
|
-
|
|
113
|
-
```sql
|
|
114
|
-
SELECT candidate_id,
|
|
115
|
-
cosine_distance(
|
|
116
|
-
(SELECT embedding FROM ds_workspace.user_embeddings WHERE user_id = 'target'),
|
|
117
|
-
embedding
|
|
118
|
-
) AS similarity
|
|
119
|
-
FROM ds_workspace.user_embeddings
|
|
120
|
-
WHERE user_id != 'target'
|
|
121
|
-
ORDER BY similarity LIMIT 10;
|
|
122
|
-
```
|
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
# ZettaPark API 数据科学常用操作
|
|
2
|
-
|
|
3
|
-
> 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
|
|
4
|
-
> **Python 版本**:推荐 3.12(最低 3.10)。安装:`python3.12 -m venv .venv && source .venv/bin/activate && pip install clickzetta_zettapark_python`
|
|
5
|
-
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
## Session 创建
|
|
9
|
-
|
|
10
|
-
```python
|
|
11
|
-
from clickzetta.zettapark.session import Session
|
|
12
|
-
import os
|
|
13
|
-
from dotenv import load_dotenv
|
|
14
|
-
|
|
15
|
-
load_dotenv()
|
|
16
|
-
|
|
17
|
-
session = Session.builder.configs({
|
|
18
|
-
"service": os.environ["CLICKZETTA_SERVICE"],
|
|
19
|
-
"instance": os.environ["CLICKZETTA_INSTANCE"],
|
|
20
|
-
"workspace": os.environ["CLICKZETTA_WORKSPACE"],
|
|
21
|
-
"username": os.environ["CLICKZETTA_USERNAME"],
|
|
22
|
-
"password": os.environ["CLICKZETTA_PASSWORD"],
|
|
23
|
-
"vcluster": os.environ["CLICKZETTA_VCLUSTER"],
|
|
24
|
-
"schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
|
|
25
|
-
"hints": {
|
|
26
|
-
"sdk.job.timeout": 300,
|
|
27
|
-
"query_tag": "ds_notebook"
|
|
28
|
-
}
|
|
29
|
-
}).create()
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
---
|
|
33
|
-
|
|
34
|
-
## 数据读取
|
|
35
|
-
|
|
36
|
-
```python
|
|
37
|
-
# 读取整张表
|
|
38
|
-
df = session.table("my_schema.orders")
|
|
39
|
-
|
|
40
|
-
# 执行 SQL 查询
|
|
41
|
-
df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
|
|
42
|
-
|
|
43
|
-
# 转为 pandas(小数据集)
|
|
44
|
-
pandas_df = df.to_pandas()
|
|
45
|
-
|
|
46
|
-
# 采样读取大表(只拉取约 1% 的行,避免 OOM)
|
|
47
|
-
pandas_df = session.sql("""
|
|
48
|
-
SELECT * FROM my_schema.events
|
|
49
|
-
TABLESAMPLE ROW (1) -- 1% 精确采样
|
|
50
|
-
""").to_pandas()
|
|
51
|
-
|
|
52
|
-
# 只获取前 N 行
|
|
53
|
-
pandas_df = df.limit(10000).to_pandas()
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
---
|
|
57
|
-
|
|
58
|
-
## DataFrame 变换
|
|
59
|
-
|
|
60
|
-
```python
|
|
61
|
-
from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
|
|
62
|
-
|
|
63
|
-
# 过滤
|
|
64
|
-
df_filtered = df.filter(col("amount") > 0)
|
|
65
|
-
df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
|
|
66
|
-
|
|
67
|
-
# 选择列
|
|
68
|
-
df_selected = df.select("user_id", "amount", "order_date")
|
|
69
|
-
|
|
70
|
-
# 新增列
|
|
71
|
-
df = df.with_column("log_amount", col("amount").cast("double"))
|
|
72
|
-
df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
|
|
73
|
-
|
|
74
|
-
# 聚合
|
|
75
|
-
agg_df = df.group_by("user_id").agg(
|
|
76
|
-
F_sum("amount").as_("total_amount"),
|
|
77
|
-
F_count("order_id").as_("order_cnt"),
|
|
78
|
-
F_avg("amount").as_("avg_amount")
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
# JOIN
|
|
82
|
-
result = orders.join(users, orders["user_id"] == users["user_id"], "left")
|
|
83
|
-
|
|
84
|
-
# 排序
|
|
85
|
-
df_sorted = df.sort(col("amount").desc())
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
---
|
|
89
|
-
|
|
90
|
-
## 数据写回
|
|
91
|
-
|
|
92
|
-
```python
|
|
93
|
-
# 覆盖写入(常用于特征表更新)
|
|
94
|
-
df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
|
|
95
|
-
|
|
96
|
-
# 追加写入(常用于预测结果)
|
|
97
|
-
df.write.mode("append").save_as_table("ds_workspace.predictions")
|
|
98
|
-
|
|
99
|
-
# pandas DataFrame 写回
|
|
100
|
-
import pandas as pd
|
|
101
|
-
local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
|
|
102
|
-
session.create_dataframe(local_df).write.mode("overwrite") \
|
|
103
|
-
.save_as_table("ds_workspace.model_scores")
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
---
|
|
107
|
-
|
|
108
|
-
## 与 pandas/scikit-learn 集成
|
|
109
|
-
|
|
110
|
-
```python
|
|
111
|
-
import pandas as pd
|
|
112
|
-
import numpy as np
|
|
113
|
-
from sklearn.preprocessing import StandardScaler
|
|
114
|
-
from sklearn.model_selection import train_test_split
|
|
115
|
-
from sklearn.ensemble import GradientBoostingClassifier
|
|
116
|
-
|
|
117
|
-
# 1. 从 Lakehouse 拉特征
|
|
118
|
-
features_df = session.sql("""
|
|
119
|
-
SELECT user_id, total_amount_30d, order_cnt_30d,
|
|
120
|
-
active_days, avg_amount_30d, label
|
|
121
|
-
FROM ds_workspace.features_final
|
|
122
|
-
""").to_pandas()
|
|
123
|
-
|
|
124
|
-
# 2. 本地处理
|
|
125
|
-
X = features_df.drop(["user_id", "label"], axis=1)
|
|
126
|
-
y = features_df["label"]
|
|
127
|
-
|
|
128
|
-
scaler = StandardScaler()
|
|
129
|
-
X_scaled = scaler.fit_transform(X)
|
|
130
|
-
|
|
131
|
-
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
|
|
132
|
-
|
|
133
|
-
# 3. 训练模型
|
|
134
|
-
model = GradientBoostingClassifier(n_estimators=100)
|
|
135
|
-
model.fit(X_train, y_train)
|
|
136
|
-
|
|
137
|
-
# 4. 预测并写回
|
|
138
|
-
features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
|
|
139
|
-
session.create_dataframe(
|
|
140
|
-
features_df[["user_id", "predicted_score"]]
|
|
141
|
-
).write.mode("overwrite").save_as_table("ds_workspace.predictions")
|
|
142
|
-
|
|
143
|
-
# 5. 保存模型
|
|
144
|
-
import joblib
|
|
145
|
-
joblib.dump(model, "models/gbm_model.pkl")
|
|
146
|
-
joblib.dump(scaler, "models/scaler.pkl")
|
|
147
|
-
```
|
|
148
|
-
|
|
149
|
-
---
|
|
150
|
-
|
|
151
|
-
## 注意事项
|
|
152
|
-
|
|
153
|
-
- `to_pandas()` 会把数据全部拉到本地内存,大表必须先 `TABLESAMPLE` 或 `LIMIT`
|
|
154
|
-
- `collect()` 返回 Row 对象列表,`to_pandas()` 返回 DataFrame,数据科学场景用后者
|
|
155
|
-
- ZettaPark 的 DataFrame 操作是懒执行,只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
|
|
156
|
-
- 写回时推荐用 `ds_workspace` 这样的专属 Schema,与生产数据隔离
|