@clickzetta/cz-cli-linux-x64 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/bin/cz-cli +0 -0
  2. package/package.json +1 -1
  3. package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
  4. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
  5. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
  6. package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
  7. package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
  8. package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
  9. package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
  11. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
  12. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
  13. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
  14. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
  15. package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
  16. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
  17. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -457
  18. package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
  19. package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
  20. package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
  21. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
  22. package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
  23. package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
  24. package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
  25. package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
  26. package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
  27. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
  28. package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
  29. package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
  30. package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
  31. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
  32. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
  33. package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
  34. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
  35. package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
  36. package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
  37. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
  38. package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -112
  39. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
  40. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
  41. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
  42. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
  43. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
  44. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  45. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
  46. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
  47. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
  48. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
  49. package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
  50. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
  51. package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
  52. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
  53. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -156
  54. package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
  55. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
  56. package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
  57. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
  58. package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
  59. package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
  60. package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
  61. package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
  62. package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
  63. package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
  64. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
  65. package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
  66. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -639
  67. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -324
  68. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
  69. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
  70. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
  71. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
  72. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
  73. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
  74. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
  75. package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
  76. package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
  77. package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
  78. package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
  79. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
  80. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
  81. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -427
  82. package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
  83. package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
  84. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
  85. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
  86. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
  87. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
  88. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
  89. package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
  90. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
  91. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
  92. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
  93. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
  94. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -379
  95. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
  96. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -185
  97. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
  98. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -222
  99. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -125
  100. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
  101. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  102. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  103. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
  104. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  105. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  106. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
  107. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  108. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  109. package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
  110. package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
  111. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -206
  112. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
  113. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
  114. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
  115. package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -292
  116. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -199
  117. package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
  118. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
@@ -1,160 +0,0 @@
1
- # 环境搭建与项目配置
2
-
3
- ## 环境搭建
4
-
5
- ```bash
6
- # 方式 1:venv(推荐)
7
- python3.12 -m venv .venv
8
- source .venv/bin/activate # macOS/Linux
9
- pip install clickzetta_zettapark_python clickzetta-connector-python \
10
- python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
11
- -i https://pypi.tuna.tsinghua.edu.cn/simple
12
-
13
- # 方式 2:pyenv(需要切换 Python 版本时)
14
- pyenv install 3.12.9 && pyenv local 3.12.9
15
- python -m venv .venv && source .venv/bin/activate
16
- pip install clickzetta_zettapark_python clickzetta-connector-python \
17
- python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
18
- -i https://pypi.tuna.tsinghua.edu.cn/simple
19
-
20
- # 方式 3:conda
21
- conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
22
- pip install clickzetta_zettapark_python clickzetta-connector-python \
23
- python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
24
- -i https://pypi.tuna.tsinghua.edu.cn/simple
25
- ```
26
-
27
- | 问题 | 修复 |
28
- |------|------|
29
- | Python 3.8/3.9 | `pyenv install 3.12.9` 或 `python3.12 -m venv .venv` |
30
- | `pyarrow` 版本冲突 | `pip install pyarrow==14.0.0` |
31
- | M1/M2 Mac 报错 | `pip install --no-binary :all: <包名>` 或改用 conda |
32
- | 连接超时 | VCluster 未启动,在 Studio 中手动启动 |
33
-
34
- ---
35
-
36
- ## Jupyter Kernel 配置
37
-
38
- ```bash
39
- # 注册 venv 为 Jupyter kernel(关键步骤,否则 notebook 用系统 Python)
40
- source .venv/bin/activate
41
- pip install ipykernel jupyterlab
42
- python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
43
-
44
- # 启动 JupyterLab
45
- jupyter lab --port=8888
46
- ```
47
-
48
- VS Code / Cursor:打开 `.ipynb` → 右上角 "Select Kernel" → 选 "Python (lakehouse-ds)"
49
-
50
- | 问题 | 修复 |
51
- |------|------|
52
- | `ModuleNotFoundError: clickzetta` | kernel 未选对,切换到注册的 venv kernel |
53
- | `.env` 读不到 | `load_dotenv(dotenv_path='../.env')` 指定路径 |
54
- | `to_pandas()` OOM | 加 `TABLESAMPLE ROW(1)` 或 `LIMIT` |
55
- | 图表不显示 | notebook 开头加 `%matplotlib inline` |
56
-
57
- ---
58
-
59
- ## src/config.py 模板
60
-
61
- ```python
62
- import os, sys
63
- from pathlib import Path
64
- from dotenv import load_dotenv
65
- from clickzetta.zettapark.session import Session
66
- import clickzetta
67
-
68
- # 多位置查找 .env
69
- for _p in [
70
- Path(__file__).parent.parent / ".env",
71
- Path.home() / ".config" / "kilo" / ".env",
72
- Path.home() / ".czcode" / ".env",
73
- Path.home() / ".env",
74
- ]:
75
- if _p.exists():
76
- load_dotenv(dotenv_path=_p)
77
- break
78
-
79
- def check_environment():
80
- """在 00-env-check.ipynb 里调用,打印环境诊断。"""
81
- ver = sys.version_info
82
- if ver < (3, 10):
83
- raise RuntimeError(
84
- f"Python {ver.major}.{ver.minor} 不满足要求。ZettaPark 需要 Python 3.10+。\n"
85
- "升级:brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
86
- )
87
- print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
88
- for pkg, mod in [
89
- ("clickzetta_zettapark_python", "clickzetta.zettapark"),
90
- ("clickzetta-connector-python", "clickzetta"),
91
- ("pandas", "pandas"), ("python-dotenv", "dotenv"),
92
- ]:
93
- try:
94
- m = __import__(mod.split(".")[0])
95
- print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
96
- except ImportError:
97
- print(f"❌ {pkg}: 未安装 → pip install {pkg}")
98
- try:
99
- s = get_session()
100
- print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
101
- except Exception as e:
102
- print(f"❌ Lakehouse 连接失败: {e}")
103
-
104
- def get_session() -> Session:
105
- return Session.builder.configs({
106
- "service": os.environ["CLICKZETTA_SERVICE"],
107
- "instance": os.environ["CLICKZETTA_INSTANCE"],
108
- "workspace": os.environ["CLICKZETTA_WORKSPACE"],
109
- "username": os.environ["CLICKZETTA_USERNAME"],
110
- "password": os.environ["CLICKZETTA_PASSWORD"],
111
- "vcluster": os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
112
- "schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
113
- }).create()
114
-
115
- def get_connector_connection():
116
- """仅用于 pd.read_sql。禁止用于 df.to_sql()。"""
117
- return clickzetta.connect(
118
- service=os.environ["CLICKZETTA_SERVICE"],
119
- instance=os.environ["CLICKZETTA_INSTANCE"],
120
- workspace=os.environ["CLICKZETTA_WORKSPACE"],
121
- username=os.environ["CLICKZETTA_USERNAME"],
122
- password=os.environ["CLICKZETTA_PASSWORD"],
123
- vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
124
- schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
125
- )
126
- ```
127
-
128
- ---
129
-
130
- ## .env 模板
131
-
132
- ```bash
133
- CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com
134
- CLICKZETTA_INSTANCE=<instance-id>
135
- CLICKZETTA_WORKSPACE=<workspace>
136
- CLICKZETTA_USERNAME=<username>
137
- CLICKZETTA_PASSWORD=<password>
138
- CLICKZETTA_VCLUSTER=default_ap
139
- CLICKZETTA_SCHEMA=ds_workspace
140
- ```
141
-
142
- ## pyproject.toml
143
-
144
- ```toml
145
- [project]
146
- name = "my-lakehouse-ds-project"
147
- requires-python = ">=3.10"
148
- dependencies = [
149
- "clickzetta_zettapark_python>=0.1.2",
150
- "clickzetta-connector-python>=1.0.0",
151
- "python-dotenv>=1.0.0",
152
- "pandas>=2.0.0",
153
- "numpy>=1.24.0",
154
- "scikit-learn>=1.3.0",
155
- "pyarrow>=14.0.0",
156
- "jupyterlab>=4.0.0",
157
- "matplotlib>=3.7.0",
158
- "seaborn>=0.12.0",
159
- ]
160
- ```
@@ -1,195 +0,0 @@
1
- # 数据科学统计分析函数参考
2
-
3
- ---
4
-
5
- ## 近似聚合函数(大表高效统计)
6
-
7
- ### approx_count_distinct — 近似 UV
8
-
9
- ```sql
10
- -- 使用 HyperLogLog 算法,误差约 2%,比 COUNT(DISTINCT) 快 10x+
11
- SELECT approx_count_distinct(user_id) AS approx_uv
12
- FROM my_schema.events;
13
-
14
- -- 按天统计 DAU
15
- SELECT
16
- DATE(event_time) AS dt,
17
- approx_count_distinct(user_id) AS dau
18
- FROM my_schema.events
19
- GROUP BY 1
20
- ORDER BY 1;
21
- ```
22
-
23
- ### approx_percentile — 近似分位数
24
-
25
- ```sql
26
- -- 中位数、四分位数、P95、P99
27
- SELECT
28
- approx_percentile(amount, 0.25) AS p25,
29
- approx_percentile(amount, 0.50) AS median,
30
- approx_percentile(amount, 0.75) AS p75,
31
- approx_percentile(amount, 0.95) AS p95,
32
- approx_percentile(amount, 0.99) AS p99
33
- FROM my_schema.orders;
34
-
35
- -- 分组分位数
36
- SELECT
37
- category,
38
- approx_percentile(price, 0.5) AS median_price
39
- FROM my_schema.products
40
- GROUP BY category;
41
- ```
42
-
43
- ### approx_histogram — 近似直方图
44
-
45
- ```sql
46
- -- 返回结构体数组:[{min, max, count}, ...]
47
- SELECT approx_histogram(amount, 10) AS hist
48
- FROM my_schema.orders;
49
-
50
- -- 解析直方图(展开为行)
51
- SELECT
52
- bucket.min AS bucket_min,
53
- bucket.max AS bucket_max,
54
- bucket.count AS bucket_count
55
- FROM (
56
- SELECT EXPLODE(approx_histogram(amount, 10)) AS bucket
57
- FROM my_schema.orders
58
- );
59
- ```
60
-
61
- ### approx_top_k — 近似 TOP-K 高频值
62
-
63
- ```sql
64
- -- 找出出现最多的前 10 个城市
65
- SELECT approx_top_k(city, 10) AS top_cities
66
- FROM my_schema.orders;
67
-
68
- -- 返回结构体数组:[{value, count}, ...]
69
- -- 解析展开(字段名是 value 和 count)
70
- SELECT item.value AS city, item.count AS cnt
71
- FROM (
72
- SELECT EXPLODE(approx_top_k(city, 10)) AS item
73
- FROM my_schema.orders
74
- )
75
- ORDER BY cnt DESC;
76
- ```
77
-
78
- ---
79
-
80
- ## 精确统计函数
81
-
82
- ### percentile / median
83
-
84
- ```sql
85
- -- 精确中位数(小表用,大表用 approx_percentile)
86
- SELECT
87
- percentile(amount, 0.5) AS exact_median,
88
- median(amount) AS median_alias -- 等价写法
89
- FROM my_schema.orders;
90
-
91
- -- 多分位数
92
- SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
93
- FROM my_schema.orders;
94
- ```
95
-
96
- ---
97
-
98
- ## TABLESAMPLE 采样
99
-
100
- ```sql
101
- -- ROW 模式:精确行级采样(适合 ML 训练集,< 1000万行)
102
- SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- 精确 10%
103
- SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- 精确 5 行
104
-
105
- -- SYSTEM 模式:文件级采样(适合大表快速预览,> 1000万行)
106
- SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- 约 0.1%
107
-
108
- -- 分层采样(按类别等比例采样)
109
- SELECT * FROM (
110
- SELECT *,
111
- ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
112
- COUNT(*) OVER (PARTITION BY category) AS cat_total
113
- FROM my_schema.products
114
- )
115
- WHERE rn <= CEIL(cat_total * 0.1); -- 每类取 10%
116
- ```
117
-
118
- | 场景 | 推荐模式 | 说明 |
119
- |---|---|---|
120
- | 快速数据预览 | SYSTEM | 极快,适合 > 100万行 |
121
- | ML 训练集构建 | ROW | 精确随机,保证代表性 |
122
- | 数据质量抽检 | SYSTEM | 快速抽样验证 |
123
- | 统计分析 | ROW | 精确概率采样 |
124
-
125
- > ⚠️ **注意**:TABLESAMPLE 在小表(< 数万行)上可能返回全部数据,百分比采样不精确。小表直接用 `LIMIT` 即可。
126
-
127
- ---
128
-
129
- ## 窗口函数(时序/排名特征)
130
-
131
- ```sql
132
- -- 移动平均(7日)
133
- SELECT
134
- dt,
135
- revenue,
136
- AVG(revenue) OVER (
137
- ORDER BY dt
138
- ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
139
- ) AS revenue_7d_ma
140
- FROM daily_stats;
141
-
142
- -- 环比增长率
143
- SELECT
144
- dt,
145
- revenue,
146
- LAG(revenue, 1) OVER (ORDER BY dt) AS prev_revenue,
147
- ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY dt))
148
- / NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
149
- FROM daily_stats;
150
-
151
- -- 用户行为排名(RFM 分析)
152
- SELECT
153
- user_id,
154
- total_amount,
155
- NTILE(5) OVER (ORDER BY total_amount DESC) AS monetary_quintile,
156
- NTILE(5) OVER (ORDER BY order_cnt DESC) AS frequency_quintile,
157
- NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
158
- FROM user_rfm;
159
-
160
- -- 去重保留最新(数据清洗常用)
161
- SELECT * FROM (
162
- SELECT *,
163
- ROW_NUMBER() OVER (
164
- PARTITION BY user_id
165
- ORDER BY update_time DESC
166
- ) AS rn
167
- FROM my_schema.users_raw
168
- ) WHERE rn = 1;
169
- ```
170
-
171
- ---
172
-
173
- ## 数据质量检查模板
174
-
175
- ```sql
176
- -- 一次性输出所有关键质量指标
177
- SELECT
178
- COUNT(*) AS total_rows,
179
- COUNT(DISTINCT user_id) AS unique_users,
180
- -- 缺失率
181
- ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
182
- / COUNT(*), 2) AS user_id_null_pct,
183
- ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
184
- / COUNT(*), 2) AS amount_null_pct,
185
- -- 异常值
186
- SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
187
- SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
188
- -- 时间范围
189
- MIN(order_date) AS earliest_date,
190
- MAX(order_date) AS latest_date,
191
- -- 分布
192
- approx_percentile(amount, 0.5) AS median_amount,
193
- approx_percentile(amount, 0.99) AS p99_amount
194
- FROM my_schema.orders;
195
- ```
@@ -1,122 +0,0 @@
1
- # 数据写入、特征工程、模型推理示例
2
-
3
- ## 数据写入
4
-
5
- | 场景 | 方式 |
6
- |------|------|
7
- | ZettaPark 可用(Python 3.10+) | `save_as_table()` 或 `create_dataframe().write` |
8
- | 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
9
- | Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT(见下方) |
10
- | **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
11
-
12
- ```python
13
- # 方式 A:ZettaPark(推荐)
14
- session.sql("""
15
- SELECT o.*, u.age_group FROM my_schema.orders_raw o
16
- LEFT JOIN my_schema.users u ON o.user_id = u.user_id
17
- WHERE o.amount > 0
18
- """).write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
19
-
20
- # 方式 B:pandas → Lakehouse
21
- session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
22
-
23
- # 方式 C:cursor 批量 INSERT(fallback)
24
- import clickzetta, os
25
- conn = clickzetta.connect(
26
- service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
27
- workspace=os.environ["CLICKZETTA_WORKSPACE"], username=os.environ["CLICKZETTA_USERNAME"],
28
- password=os.environ["CLICKZETTA_PASSWORD"],
29
- vcluster=os.environ.get("CLICKZETTA_VCLUSTER", "default_ap"),
30
- schema=os.environ.get("CLICKZETTA_SCHEMA", "public"),
31
- )
32
- cursor = conn.cursor()
33
- cursor.execute("CREATE TABLE IF NOT EXISTS ds_workspace.my_table (col1 STRING, col2 BIGINT, col3 DOUBLE)")
34
- rows = local_df.values.tolist()
35
- for i in range(0, len(rows), 500):
36
- batch = rows[i:i+500]
37
- vals = ",".join(f"({','.join(repr(v) for v in row)})" for row in batch)
38
- cursor.execute(f"INSERT INTO ds_workspace.my_table VALUES {vals}")
39
- conn.close()
40
- ```
41
-
42
- ```sql
43
- -- 设置中间表生命周期(30 天自动清理)
44
- ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
45
- ```
46
-
47
- ---
48
-
49
- ## 特征工程
50
-
51
- ```sql
52
- -- SQL 侧(利用 Lakehouse 算力,推荐)
53
- SELECT
54
- user_id,
55
- COUNT(*) AS order_cnt_30d,
56
- SUM(amount) AS total_amount_30d,
57
- AVG(amount) AS avg_amount_30d,
58
- STDDEV(amount) AS std_amount_30d,
59
- DATEDIFF('day', MIN(order_date), MAX(order_date)) AS active_days,
60
- COUNT(DISTINCT DATE(order_date)) AS active_day_cnt,
61
- NTILE(10) OVER (ORDER BY SUM(amount) DESC) AS revenue_decile
62
- FROM my_schema.orders
63
- WHERE order_date >= CURRENT_DATE - INTERVAL 30 DAY
64
- GROUP BY user_id;
65
- ```
66
-
67
- ```python
68
- # ZettaPark 侧(Python 逻辑)
69
- from clickzetta.zettapark.functions import col, when
70
-
71
- features = session.table("ds_workspace.orders_clean") \
72
- .with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
73
-
74
- df = features.to_pandas()
75
-
76
- from sklearn.preprocessing import StandardScaler
77
- df[['amount_scaled']] = StandardScaler().fit_transform(df[['amount']])
78
-
79
- session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace.features_final")
80
- ```
81
-
82
- ---
83
-
84
- ## 模型推理上线
85
-
86
- ### BITMAP 用户画像
87
-
88
- ```sql
89
- CREATE TABLE ds_workspace.user_tags AS
90
- SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
91
- FROM my_schema.user_behavior GROUP BY tag_name;
92
-
93
- -- 人群交集
94
- SELECT bitmap_count(bitmap_and(
95
- (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
96
- (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
97
- )) AS target_user_count;
98
- ```
99
-
100
- ### SQL UDF 批量推理
101
-
102
- ```sql
103
- -- 调用已部署的模型 UDF(必须用完整 schema 路径)
104
- INSERT INTO ds_workspace.predictions
105
- SELECT user_id,
106
- ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
107
- CURRENT_TIMESTAMP() AS predict_time
108
- FROM ds_workspace.features_final;
109
- ```
110
-
111
- ### 向量检索
112
-
113
- ```sql
114
- SELECT candidate_id,
115
- cosine_distance(
116
- (SELECT embedding FROM ds_workspace.user_embeddings WHERE user_id = 'target'),
117
- embedding
118
- ) AS similarity
119
- FROM ds_workspace.user_embeddings
120
- WHERE user_id != 'target'
121
- ORDER BY similarity LIMIT 10;
122
- ```
@@ -1,156 +0,0 @@
1
- # ZettaPark API 数据科学常用操作
2
-
3
- > 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
4
- > **Python 版本**:推荐 3.12(最低 3.10)。安装:`python3.12 -m venv .venv && source .venv/bin/activate && pip install clickzetta_zettapark_python`
5
-
6
- ---
7
-
8
- ## Session 创建
9
-
10
- ```python
11
- from clickzetta.zettapark.session import Session
12
- import os
13
- from dotenv import load_dotenv
14
-
15
- load_dotenv()
16
-
17
- session = Session.builder.configs({
18
- "service": os.environ["CLICKZETTA_SERVICE"],
19
- "instance": os.environ["CLICKZETTA_INSTANCE"],
20
- "workspace": os.environ["CLICKZETTA_WORKSPACE"],
21
- "username": os.environ["CLICKZETTA_USERNAME"],
22
- "password": os.environ["CLICKZETTA_PASSWORD"],
23
- "vcluster": os.environ["CLICKZETTA_VCLUSTER"],
24
- "schema": os.environ.get("CLICKZETTA_SCHEMA", "public"),
25
- "hints": {
26
- "sdk.job.timeout": 300,
27
- "query_tag": "ds_notebook"
28
- }
29
- }).create()
30
- ```
31
-
32
- ---
33
-
34
- ## 数据读取
35
-
36
- ```python
37
- # 读取整张表
38
- df = session.table("my_schema.orders")
39
-
40
- # 执行 SQL 查询
41
- df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
42
-
43
- # 转为 pandas(小数据集)
44
- pandas_df = df.to_pandas()
45
-
46
- # 采样读取大表(避免 OOM)
47
- pandas_df = session.sql("""
48
- SELECT * FROM my_schema.events
49
- TABLESAMPLE ROW (1) -- 1% 精确采样
50
- """).to_pandas()
51
-
52
- # 只获取前 N 行
53
- pandas_df = df.limit(10000).to_pandas()
54
- ```
55
-
56
- ---
57
-
58
- ## DataFrame 变换
59
-
60
- ```python
61
- from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
62
-
63
- # 过滤
64
- df_filtered = df.filter(col("amount") > 0)
65
- df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
66
-
67
- # 选择列
68
- df_selected = df.select("user_id", "amount", "order_date")
69
-
70
- # 新增列
71
- df = df.with_column("log_amount", col("amount").cast("double"))
72
- df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
73
-
74
- # 聚合
75
- agg_df = df.group_by("user_id").agg(
76
- F_sum("amount").as_("total_amount"),
77
- F_count("order_id").as_("order_cnt"),
78
- F_avg("amount").as_("avg_amount")
79
- )
80
-
81
- # JOIN
82
- result = orders.join(users, orders["user_id"] == users["user_id"], "left")
83
-
84
- # 排序
85
- df_sorted = df.sort(col("amount").desc())
86
- ```
87
-
88
- ---
89
-
90
- ## 数据写回
91
-
92
- ```python
93
- # 覆盖写入(常用于特征表更新)
94
- df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
95
-
96
- # 追加写入(常用于预测结果)
97
- df.write.mode("append").save_as_table("ds_workspace.predictions")
98
-
99
- # pandas DataFrame 写回
100
- import pandas as pd
101
- local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
102
- session.create_dataframe(local_df).write.mode("overwrite") \
103
- .save_as_table("ds_workspace.model_scores")
104
- ```
105
-
106
- ---
107
-
108
- ## 与 pandas/scikit-learn 集成
109
-
110
- ```python
111
- import pandas as pd
112
- import numpy as np
113
- from sklearn.preprocessing import StandardScaler
114
- from sklearn.model_selection import train_test_split
115
- from sklearn.ensemble import GradientBoostingClassifier
116
-
117
- # 1. 从 Lakehouse 拉特征
118
- features_df = session.sql("""
119
- SELECT user_id, total_amount_30d, order_cnt_30d,
120
- active_days, avg_amount_30d, label
121
- FROM ds_workspace.features_final
122
- """).to_pandas()
123
-
124
- # 2. 本地处理
125
- X = features_df.drop(["user_id", "label"], axis=1)
126
- y = features_df["label"]
127
-
128
- scaler = StandardScaler()
129
- X_scaled = scaler.fit_transform(X)
130
-
131
- X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
132
-
133
- # 3. 训练模型
134
- model = GradientBoostingClassifier(n_estimators=100)
135
- model.fit(X_train, y_train)
136
-
137
- # 4. 预测并写回
138
- features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
139
- session.create_dataframe(
140
- features_df[["user_id", "predicted_score"]]
141
- ).write.mode("overwrite").save_as_table("ds_workspace.predictions")
142
-
143
- # 5. 保存模型
144
- import joblib
145
- joblib.dump(model, "models/gbm_model.pkl")
146
- joblib.dump(scaler, "models/scaler.pkl")
147
- ```
148
-
149
- ---
150
-
151
- ## 注意事项
152
-
153
- - `to_pandas()` 会把数据全部拉到本地内存,大表必须先 `TABLESAMPLE` 或 `LIMIT`
154
- - `collect()` 返回 Row 对象列表,`to_pandas()` 返回 DataFrame,数据科学场景用后者
155
- - ZettaPark 的 DataFrame 操作是懒执行,只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
156
- - 写回时推荐用 `ds_workspace` 这样的专属 Schema,与生产数据隔离