@clickzetta/cz-cli-darwin-x64 0.3.91 → 0.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,57 +1,57 @@
1
- # BITMAP 用户画像参考
1
+ # BITMAP User Profiling Reference
2
2
 
3
- > 来源:https://www.yunqi.tech/documents/bitmap-type
3
+ > Source: https://www.yunqi.tech/documents/bitmap-type
4
4
 
5
- BITMAP ClickZetta 中用于高效存储和处理整数集合的数据类型,基于 Roaring Bitmap 压缩算法,特别适合用户画像、人群圈选、UV 统计等数据科学场景。
5
+ BITMAP is a data type in ClickZetta for efficiently storing and processing integer sets, based on the Roaring Bitmap compression algorithm. It is particularly suited for data science use cases such as user profiling, audience segmentation, and UV counting.
6
6
 
7
7
  ---
8
8
 
9
- ## 核心限制
9
+ ## Core Limitations
10
10
 
11
- - 支持 **64 位无符号整数**(0 2^64-1
12
- - **不支持**比较操作(<、>、=)
13
- - **不支持** ORDER BYGROUP BYDISTINCT
14
- - **不能**作为 PRIMARY KEYPARTITION KEYCLUSTER KEY
11
+ - Supports **64-bit unsigned integers** (0 to 2^64-1)
12
+ - **Does not support** comparison operators (<, >, =)
13
+ - **Does not support** ORDER BY, GROUP BY, DISTINCT
14
+ - **Cannot** be used as PRIMARY KEY, PARTITION KEY, or CLUSTER KEY
15
15
 
16
16
  ---
17
17
 
18
- ## 构建用户标签 BITMAP
18
+ ## Building User Tag BITMAPs
19
19
 
20
20
  ```sql
21
- -- 方式 1:从行数据聚合构建(最常用)
21
+ -- Option 1: Aggregate from row data (most common)
22
22
  CREATE TABLE ds_workspace.user_tags AS
23
23
  SELECT
24
24
  tag_name,
25
25
  group_bitmap_state(user_id) AS user_bitmap
26
26
  FROM (
27
- -- 高消费用户
27
+ -- High-value users
28
28
  SELECT 'high_value' AS tag_name, user_id
29
29
  FROM my_schema.orders
30
30
  WHERE total_amount_30d > 1000
31
31
  UNION ALL
32
- -- 30天活跃用户
32
+ -- Active in last 30 days
33
33
  SELECT 'active_30d' AS tag_name, user_id
34
34
  FROM my_schema.events
35
35
  WHERE event_date >= CURRENT_DATE - INTERVAL 30 DAY
36
36
  UNION ALL
37
- -- 已流失用户(90天未活跃)
37
+ -- Churned users (inactive for 90 days)
38
38
  SELECT 'churned' AS tag_name, user_id
39
39
  FROM my_schema.users
40
40
  WHERE last_active_date < CURRENT_DATE - INTERVAL 90 DAY
41
41
  ) t
42
42
  GROUP BY tag_name;
43
43
 
44
- -- 方式 2:从数组构建
44
+ -- Option 2: Build from an array
45
45
  INSERT INTO ds_workspace.user_tags VALUES
46
46
  ('vip', bitmap_build(ARRAY(1001, 1002, 1003, 1004)));
47
47
  ```
48
48
 
49
49
  ---
50
50
 
51
- ## 人群圈选操作
51
+ ## Audience Segmentation Operations
52
52
 
53
53
  ```sql
54
- -- 交集:同时满足多个标签(AND
54
+ -- Intersection: users matching all tags (AND)
55
55
  SELECT bitmap_count(
56
56
  bitmap_and(
57
57
  (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -59,7 +59,7 @@ SELECT bitmap_count(
59
59
  )
60
60
  ) AS target_count;
61
61
 
62
- -- 并集:满足任一标签(OR
62
+ -- Union: users matching any tag (OR)
63
63
  SELECT bitmap_count(
64
64
  bitmap_or(
65
65
  (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -67,7 +67,7 @@ SELECT bitmap_count(
67
67
  )
68
68
  ) AS reach_count;
69
69
 
70
- -- 差集:排除某类用户(ANDNOT
70
+ -- Difference: exclude a group (ANDNOT)
71
71
  SELECT bitmap_count(
72
72
  bitmap_andnot(
73
73
  (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -75,7 +75,7 @@ SELECT bitmap_count(
75
75
  )
76
76
  ) AS targetable_count;
77
77
 
78
- -- 获取目标用户 ID 列表
78
+ -- Get target user ID list
79
79
  SELECT bitmap_to_array(
80
80
  bitmap_andnot(
81
81
  (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -86,10 +86,10 @@ SELECT bitmap_to_array(
86
86
 
87
87
  ---
88
88
 
89
- ## UV 统计(去重计数)
89
+ ## UV Counting (Distinct Count)
90
90
 
91
91
  ```sql
92
- -- 日活跃用户数(DAU
92
+ -- Daily active users (DAU)
93
93
  SELECT
94
94
  event_date,
95
95
  bitmap_count(group_bitmap_state(user_id)) AS dau
@@ -97,11 +97,11 @@ FROM my_schema.events
97
97
  GROUP BY event_date
98
98
  ORDER BY event_date;
99
99
 
100
- -- 周活跃用户数(WAU)—— 跨天去重
100
+ -- Weekly active users (WAU) — deduplicated across days
101
101
  SELECT
102
102
  DATE_TRUNC('week', event_date) AS week_start,
103
103
  bitmap_count(
104
- bitmap_or_agg(daily_bitmap) -- 合并多天 bitmap
104
+ bitmap_or_agg(daily_bitmap) -- merge multiple days' bitmaps
105
105
  ) AS wau
106
106
  FROM (
107
107
  SELECT event_date,
@@ -111,7 +111,7 @@ FROM (
111
111
  ) t
112
112
  GROUP BY 1;
113
113
 
114
- -- 用户留存分析(新用户 vs 回访用户)
114
+ -- User retention analysis (new vs. returning users)
115
115
  SELECT
116
116
  bitmap_count(
117
117
  bitmap_and(new_users.user_bitmap, return_users.user_bitmap)
@@ -128,19 +128,19 @@ FROM
128
128
 
129
129
  ---
130
130
 
131
- ## 常用 BITMAP 函数速查
131
+ ## BITMAP Function Quick Reference
132
132
 
133
- | 函数 | 说明 | 示例 |
133
+ | Function | Description | Example |
134
134
  |---|---|---|
135
- | `group_bitmap_state(col)` | 聚合构建 BITMAP | `GROUP BY tag` |
136
- | `bitmap_count(bm)` | 计算元素个数(UV | `bitmap_count(user_bm)` |
137
- | `bitmap_and(a, b)` | 交集 | 同时满足 A B |
138
- | `bitmap_or(a, b)` | 并集 | 满足 A B |
139
- | `bitmap_andnot(a, b)` | 差集 | A 中但不在 B |
140
- | `bitmap_xor(a, b)` | 异或(只在一个中) | AB 各自独有的 |
141
- | `bitmap_to_array(bm)` | 转为整数数组 | 获取用户 ID 列表 |
142
- | `bitmap_build(arr)` | 从数组构建 | `bitmap_build(ARRAY(1,2,3))` |
143
- | `bitmap_contains(bm, val)` | 检查是否包含某值 | `bitmap_contains(bm, user_id)` |
144
- | `bitmap_min(bm)` | 最小元素 | — |
145
- | `bitmap_max(bm)` | 最大元素 | — |
146
- | `to_bitmap(val)` | 单值转 BITMAP | `to_bitmap(user_id)` |
135
+ | `group_bitmap_state(col)` | Aggregate to build a BITMAP | `GROUP BY tag` |
136
+ | `bitmap_count(bm)` | Count elements (UV) | `bitmap_count(user_bm)` |
137
+ | `bitmap_and(a, b)` | Intersection | Users in both A and B |
138
+ | `bitmap_or(a, b)` | Union | Users in A or B |
139
+ | `bitmap_andnot(a, b)` | Difference | In A but not in B |
140
+ | `bitmap_xor(a, b)` | Symmetric difference | In exactly one of A or B (not both) |
141
+ | `bitmap_to_array(bm)` | Convert to integer array | Get user ID list |
142
+ | `bitmap_build(arr)` | Build from array | `bitmap_build(ARRAY(1,2,3))` |
143
+ | `bitmap_contains(bm, val)` | Check if value is present | `bitmap_contains(bm, user_id)` |
144
+ | `bitmap_min(bm)` | Minimum element | — |
145
+ | `bitmap_max(bm)` | Maximum element | — |
146
+ | `to_bitmap(val)` | Convert single value to BITMAP | `to_bitmap(user_id)` |
@@ -1,6 +1,6 @@
1
- # 数据发现、质量评估、清洗、EDA 示例
1
+ # Data Discovery, Quality Assessment, Cleaning, and EDA Examples
2
2
 
3
- ## 数据发现
3
+ ## Data Discovery
4
4
 
5
5
  ```python
6
6
  from src.config import get_session
@@ -20,10 +20,10 @@ session.sql("""
20
20
 
21
21
  ---
22
22
 
23
- ## 数据质量评估
23
+ ## Data Quality Assessment
24
24
 
25
25
  ```sql
26
- -- 基础统计
26
+ -- Basic statistics
27
27
  SELECT
28
28
  COUNT(*) AS total_rows,
29
29
  COUNT(DISTINCT user_id) AS unique_users,
@@ -32,11 +32,11 @@ SELECT
32
32
  ROUND(100.0 * SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS amount_null_pct
33
33
  FROM my_schema.orders;
34
34
 
35
- -- 主键重复检查
35
+ -- Duplicate primary key check
36
36
  SELECT order_id, COUNT(*) AS cnt
37
37
  FROM my_schema.orders GROUP BY order_id HAVING cnt > 1 LIMIT 10;
38
38
 
39
- -- 数值分布(大表高效)
39
+ -- Numeric distribution (efficient for large tables)
40
40
  SELECT
41
41
  approx_percentile(amount, 0.25) AS p25,
42
42
  approx_percentile(amount, 0.50) AS median,
@@ -45,25 +45,25 @@ SELECT
45
45
  MIN(amount) AS min_val, MAX(amount) AS max_val
46
46
  FROM my_schema.orders;
47
47
 
48
- -- 高频值 TOP-K
48
+ -- Top-K high-frequency values
49
49
  SELECT approx_top_k(status, 10) AS top_statuses FROM my_schema.orders;
50
50
 
51
- -- 近似 UV
51
+ -- Approximate distinct count
52
52
  SELECT approx_count_distinct(user_id) AS approx_uv FROM my_schema.events;
53
53
  ```
54
54
 
55
55
  ---
56
56
 
57
- ## 数据清洗
57
+ ## Data Cleaning
58
58
 
59
59
  ```sql
60
- -- 去重(保留最新一条)
60
+ -- Deduplication (keep the latest record)
61
61
  SELECT * FROM (
62
62
  SELECT *, ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY update_time DESC) AS rn
63
63
  FROM my_schema.orders_raw
64
64
  ) WHERE rn = 1;
65
65
 
66
- -- 缺失值处理 + 类型转换
66
+ -- Null handling + type casting
67
67
  SELECT
68
68
  order_id, user_id,
69
69
  COALESCE(amount, 0.0) AS amount,
@@ -72,7 +72,7 @@ SELECT
72
72
  FROM my_schema.orders_raw
73
73
  WHERE user_id IS NOT NULL;
74
74
 
75
- -- 多表整合
75
+ -- Multi-table integration
76
76
  SELECT o.order_id, o.user_id, o.amount, o.order_date,
77
77
  u.age_group, u.city, p.category, p.brand
78
78
  FROM my_schema.orders o
@@ -85,16 +85,16 @@ LEFT JOIN my_schema.products p ON o.product_id = p.product_id;
85
85
  ## EDA
86
86
 
87
87
  ```python
88
- # 采样策略
88
+ # Sampling strategies
89
89
  df_quick = session.sql("""
90
90
  SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000
91
- """).to_pandas() # SYSTEM:文件级,极快,适合 >100万行预览
91
+ """).to_pandas() # SYSTEM: file-level, very fast, good for >1M row previews
92
92
 
93
93
  df_ml = session.sql("""
94
94
  SELECT * FROM my_schema.events TABLESAMPLE ROW (10)
95
- """).to_pandas() # ROW:行级精确,适合 ML 训练集
95
+ """).to_pandas() # ROW: exact row-level, good for ML training sets
96
96
 
97
- # 时序分析
97
+ # Time series analysis
98
98
  session.sql("""
99
99
  SELECT
100
100
  DATE_TRUNC('day', order_time) AS dt,
@@ -1,62 +1,62 @@
1
- # 环境搭建与项目配置
1
+ # Environment Setup & Project Configuration
2
2
 
3
- ## 环境搭建
3
+ ## Environment Setup
4
4
 
5
5
  ```bash
6
- # 方式 1venv(推荐)
6
+ # Option 1: venv (recommended)
7
7
  python3.12 -m venv .venv
8
8
  source .venv/bin/activate # macOS/Linux
9
9
  pip install clickzetta_zettapark_python clickzetta-connector-python \
10
10
  python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
11
11
  -i https://pypi.tuna.tsinghua.edu.cn/simple
12
12
 
13
- # 方式 2pyenv(需要切换 Python 版本时)
13
+ # Option 2: pyenv (when you need to switch Python versions)
14
14
  pyenv install 3.12.9 && pyenv local 3.12.9
15
15
  python -m venv .venv && source .venv/bin/activate
16
16
  pip install clickzetta_zettapark_python clickzetta-connector-python \
17
17
  python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
18
18
  -i https://pypi.tuna.tsinghua.edu.cn/simple
19
19
 
20
- # 方式 3conda
20
+ # Option 3: conda
21
21
  conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
22
22
  pip install clickzetta_zettapark_python clickzetta-connector-python \
23
23
  python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
24
24
  -i https://pypi.tuna.tsinghua.edu.cn/simple
25
25
  ```
26
26
 
27
- | 问题 | 修复 |
27
+ | Issue | Fix |
28
28
  |------|------|
29
- | Python 3.8/3.9 | `pyenv install 3.12.9` `python3.12 -m venv .venv` |
30
- | `pyarrow` 版本冲突 | `pip install pyarrow==14.0.0` |
31
- | M1/M2 Mac 报错 | `pip install --no-binary :all:` 或改用 conda |
32
- | 连接超时 | VCluster 未启动,在 Studio 中手动启动 |
29
+ | Python 3.8/3.9 | `pyenv install 3.12.9` or `python3.12 -m venv .venv` |
30
+ | `pyarrow` version conflict | `pip install pyarrow==14.0.0` |
31
+ | M1/M2 Mac error | `pip install --no-binary :all:` or use conda |
32
+ | Connection timeout | VCluster not started — start it manually in Studio |
33
33
 
34
34
  ---
35
35
 
36
- ## Jupyter Kernel 配置
36
+ ## Jupyter Kernel Configuration
37
37
 
38
38
  ```bash
39
- # 注册 venv Jupyter kernel(关键步骤,否则 notebook 用系统 Python
39
+ # Register the venv as a Jupyter kernel (critical — otherwise the notebook uses the system Python)
40
40
  source .venv/bin/activate
41
41
  pip install ipykernel jupyterlab
42
42
  python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
43
43
 
44
- # 启动 JupyterLab
44
+ # Start JupyterLab
45
45
  jupyter lab --port=8888
46
46
  ```
47
47
 
48
- VS Code / Cursor:打开 `.ipynb` → 右上角 "Select Kernel" → "Python (lakehouse-ds)"
48
+ VS Code / Cursor: open `.ipynb` → top-right "Select Kernel" → choose "Python (lakehouse-ds)"
49
49
 
50
- | 问题 | 修复 |
50
+ | Issue | Fix |
51
51
  |------|------|
52
- | `ModuleNotFoundError: clickzetta` | kernel 未选对,切换到注册的 venv kernel |
53
- | `.env` 读不到 | `load_dotenv(dotenv_path='../.env')` 指定路径 |
54
- | `to_pandas()` OOM | `TABLESAMPLE ROW(1)` `LIMIT` |
55
- | 图表不显示 | notebook 开头加 `%matplotlib inline` |
52
+ | `ModuleNotFoundError: clickzetta` | Wrong kernel selected — switch to the registered venv kernel |
53
+ | `.env` not loading | Use `load_dotenv(dotenv_path='../.env')` with an explicit path |
54
+ | `to_pandas()` OOM | Add `TABLESAMPLE ROW(1)` or `LIMIT` |
55
+ | Charts not showing | Add `%matplotlib inline` at the top of the notebook |
56
56
 
57
57
  ---
58
58
 
59
- ## src/config.py 模板
59
+ ## src/config.py Template
60
60
 
61
61
  ```python
62
62
  import os, sys
@@ -65,7 +65,7 @@ from dotenv import load_dotenv
65
65
  from clickzetta.zettapark.session import Session
66
66
  import clickzetta
67
67
 
68
- # 多位置查找 .env
68
+ # Search for .env in multiple locations
69
69
  for _p in [
70
70
  Path(__file__).parent.parent / ".env",
71
71
  Path.home() / ".config" / "kilo" / ".env",
@@ -77,12 +77,12 @@ for _p in [
77
77
  break
78
78
 
79
79
  def check_environment():
80
- """ 00-env-check.ipynb 里调用,打印环境诊断。"""
80
+ """Call from 00-env-check.ipynb to print environment diagnostics."""
81
81
  ver = sys.version_info
82
82
  if ver < (3, 10):
83
83
  raise RuntimeError(
84
- f"Python {ver.major}.{ver.minor} 不满足要求。ZettaPark 需要 Python 3.10+。\n"
85
- "升级:brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
84
+ f"Python {ver.major}.{ver.minor} does not meet requirements. ZettaPark requires Python 3.10+.\n"
85
+ "Upgrade: brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
86
86
  )
87
87
  print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
88
88
  for pkg, mod in [
@@ -94,12 +94,12 @@ def check_environment():
94
94
  m = __import__(mod.split(".")[0])
95
95
  print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
96
96
  except ImportError:
97
- print(f"❌ {pkg}: 未安装 → pip install {pkg}")
97
+ print(f"❌ {pkg}: not installed → pip install {pkg}")
98
98
  try:
99
99
  s = get_session()
100
100
  print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
101
101
  except Exception as e:
102
- print(f"❌ Lakehouse 连接失败: {e}")
102
+ print(f"❌ Lakehouse connection failed: {e}")
103
103
 
104
104
  def get_session() -> Session:
105
105
  return Session.builder.configs({
@@ -113,7 +113,7 @@ def get_session() -> Session:
113
113
  }).create()
114
114
 
115
115
  def get_connector_connection():
116
- """仅用于 pd.read_sql。禁止用于 df.to_sql()"""
116
+ """For pd.read_sql only. Do NOT use with df.to_sql()."""
117
117
  return clickzetta.connect(
118
118
  service=os.environ["CLICKZETTA_SERVICE"],
119
119
  instance=os.environ["CLICKZETTA_INSTANCE"],
@@ -127,7 +127,7 @@ def get_connector_connection():
127
127
 
128
128
  ---
129
129
 
130
- ## .env 模板
130
+ ## .env Template
131
131
 
132
132
  ```bash
133
133
  CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com
@@ -1,17 +1,17 @@
1
- # 数据科学统计分析函数参考
1
+ # Statistical Analysis Functions Reference for Data Science
2
2
 
3
3
  ---
4
4
 
5
- ## 近似聚合函数(大表高效统计)
5
+ ## Approximate Aggregate Functions (Efficient for Large Tables)
6
6
 
7
- ### approx_count_distinct — 近似 UV
7
+ ### approx_count_distinct — Approximate Distinct Count
8
8
 
9
9
  ```sql
10
- -- 使用 HyperLogLog 算法,误差约 2%,比 COUNT(DISTINCT) 快 10x+
10
+ -- Uses HyperLogLog algorithm, ~2% error, 10x+ faster than COUNT(DISTINCT)
11
11
  SELECT approx_count_distinct(user_id) AS approx_uv
12
12
  FROM my_schema.events;
13
13
 
14
- -- 按天统计 DAU
14
+ -- Daily active users (DAU)
15
15
  SELECT
16
16
  DATE(event_time) AS dt,
17
17
  approx_count_distinct(user_id) AS dau
@@ -20,10 +20,10 @@ GROUP BY 1
20
20
  ORDER BY 1;
21
21
  ```
22
22
 
23
- ### approx_percentile — 近似分位数
23
+ ### approx_percentile — Approximate Percentiles
24
24
 
25
25
  ```sql
26
- -- 中位数、四分位数、P95P99
26
+ -- Median, quartiles, P95, P99
27
27
  SELECT
28
28
  approx_percentile(amount, 0.25) AS p25,
29
29
  approx_percentile(amount, 0.50) AS median,
@@ -32,7 +32,7 @@ SELECT
32
32
  approx_percentile(amount, 0.99) AS p99
33
33
  FROM my_schema.orders;
34
34
 
35
- -- 分组分位数
35
+ -- Grouped percentiles
36
36
  SELECT
37
37
  category,
38
38
  approx_percentile(price, 0.5) AS median_price
@@ -40,14 +40,14 @@ FROM my_schema.products
40
40
  GROUP BY category;
41
41
  ```
42
42
 
43
- ### approx_histogram — 近似直方图
43
+ ### approx_histogram — Approximate Histogram
44
44
 
45
45
  ```sql
46
- -- 返回结构体数组:[{min, max, count}, ...]
46
+ -- Returns a struct array: [{min, max, count}, ...]
47
47
  SELECT approx_histogram(amount, 10) AS hist
48
48
  FROM my_schema.orders;
49
49
 
50
- -- 解析直方图(展开为行)
50
+ -- Parse histogram (expand to rows)
51
51
  SELECT
52
52
  bucket.min AS bucket_min,
53
53
  bucket.max AS bucket_max,
@@ -58,15 +58,15 @@ FROM (
58
58
  );
59
59
  ```
60
60
 
61
- ### approx_top_k — 近似 TOP-K 高频值
61
+ ### approx_top_k — Approximate Top-K High-Frequency Values
62
62
 
63
63
  ```sql
64
- -- 找出出现最多的前 10 个城市
64
+ -- Find the top 10 most frequent cities
65
65
  SELECT approx_top_k(city, 10) AS top_cities
66
66
  FROM my_schema.orders;
67
67
 
68
- -- 返回结构体数组:[{value, count}, ...]
69
- -- 解析展开(字段名是 value count
68
+ -- Returns a struct array: [{value, count}, ...]
69
+ -- Expand to rows (fields are value and count)
70
70
  SELECT item.value AS city, item.count AS cnt
71
71
  FROM (
72
72
  SELECT EXPLODE(approx_top_k(city, 10)) AS item
@@ -77,59 +77,59 @@ ORDER BY cnt DESC;
77
77
 
78
78
  ---
79
79
 
80
- ## 精确统计函数
80
+ ## Exact Statistical Functions
81
81
 
82
82
  ### percentile / median
83
83
 
84
84
  ```sql
85
- -- 精确中位数(小表用,大表用 approx_percentile
85
+ -- Exact median (use for small tables; use approx_percentile for large tables)
86
86
  SELECT
87
87
  percentile(amount, 0.5) AS exact_median,
88
- median(amount) AS median_alias -- 等价写法
88
+ median(amount) AS median_alias -- equivalent to percentile(amount, 0.5)
89
89
  FROM my_schema.orders;
90
90
 
91
- -- 多分位数
91
+ -- Multiple percentiles
92
92
  SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
93
93
  FROM my_schema.orders;
94
94
  ```
95
95
 
96
96
  ---
97
97
 
98
- ## TABLESAMPLE 采样
98
+ ## TABLESAMPLE Sampling
99
99
 
100
100
  ```sql
101
- -- ROW 模式:精确行级采样(适合 ML 训练集,< 1000万行)
102
- SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- 精确 10%
103
- SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- 精确 5
101
+ -- ROW mode: exact row-level sampling (good for ML training sets, <10M rows)
102
+ SELECT * FROM my_schema.events TABLESAMPLE ROW (10); -- exact 10%
103
+ SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS); -- exact 5 rows
104
104
 
105
- -- SYSTEM 模式:文件级采样(适合大表快速预览,> 1000万行)
106
- SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- 0.1%
105
+ -- SYSTEM mode: file-level sampling (good for large table quick preview, >10M rows)
106
+ SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000; -- ~0.1%
107
107
 
108
- -- 分层采样(按类别等比例采样)
108
+ -- Stratified sampling (proportional by category)
109
109
  SELECT * FROM (
110
110
  SELECT *,
111
111
  ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
112
112
  COUNT(*) OVER (PARTITION BY category) AS cat_total
113
113
  FROM my_schema.products
114
114
  )
115
- WHERE rn <= CEIL(cat_total * 0.1); -- 每类取 10%
115
+ WHERE rn <= CEIL(cat_total * 0.1); -- 10% per category
116
116
  ```
117
117
 
118
- | 场景 | 推荐模式 | 说明 |
118
+ | Use Case | Recommended Mode | Notes |
119
119
  |---|---|---|
120
- | 快速数据预览 | SYSTEM | 极快,适合 > 100万行 |
121
- | ML 训练集构建 | ROW | 精确随机,保证代表性 |
122
- | 数据质量抽检 | SYSTEM | 快速抽样验证 |
123
- | 统计分析 | ROW | 精确概率采样 |
120
+ | Quick data preview | SYSTEM | Very fast, good for >1M rows |
121
+ | ML training set | ROW | Exact random, ensures representativeness |
122
+ | Data quality spot check | SYSTEM | Fast sampling for validation |
123
+ | Statistical analysis | ROW | Exact probability sampling |
124
124
 
125
- > ⚠️ **注意**:TABLESAMPLE 在小表(< 数万行)上可能返回全部数据,百分比采样不精确。小表直接用 `LIMIT` 即可。
125
+ > ⚠️ **Note**: On small tables (fewer than tens of thousands of rows), TABLESAMPLE may return all rows, so percentage sampling is not precise. For small tables, use `LIMIT` instead.
126
126
 
127
127
  ---
128
128
 
129
- ## 窗口函数(时序/排名特征)
129
+ ## Window Functions (Time Series / Ranking Features)
130
130
 
131
131
  ```sql
132
- -- 移动平均(7日)
132
+ -- 7-day moving average
133
133
  SELECT
134
134
  dt,
135
135
  revenue,
@@ -139,7 +139,7 @@ SELECT
139
139
  ) AS revenue_7d_ma
140
140
  FROM daily_stats;
141
141
 
142
- -- 环比增长率
142
+ -- Period-over-period growth rate (each row vs. the previous dt)
143
143
  SELECT
144
144
  dt,
145
145
  revenue,
@@ -148,7 +148,7 @@ SELECT
148
148
  / NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
149
149
  FROM daily_stats;
150
150
 
151
- -- 用户行为排名(RFM 分析)
151
+ -- User behavior ranking (RFM analysis)
152
152
  SELECT
153
153
  user_id,
154
154
  total_amount,
@@ -157,7 +157,7 @@ SELECT
157
157
  NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
158
158
  FROM user_rfm;
159
159
 
160
- -- 去重保留最新(数据清洗常用)
160
+ -- Deduplication keeping latest (common in data cleaning)
161
161
  SELECT * FROM (
162
162
  SELECT *,
163
163
  ROW_NUMBER() OVER (
@@ -170,25 +170,25 @@ SELECT * FROM (
170
170
 
171
171
  ---
172
172
 
173
- ## 数据质量检查模板
173
+ ## Data Quality Check Template
174
174
 
175
175
  ```sql
176
- -- 一次性输出所有关键质量指标
176
+ -- Output all key quality metrics in one query
177
177
  SELECT
178
178
  COUNT(*) AS total_rows,
179
179
  COUNT(DISTINCT user_id) AS unique_users,
180
- -- 缺失率
180
+ -- Null rates
181
181
  ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
182
182
  / COUNT(*), 2) AS user_id_null_pct,
183
183
  ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
184
184
  / COUNT(*), 2) AS amount_null_pct,
185
- -- 异常值
185
+ -- Anomalies
186
186
  SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) AS negative_amount_cnt,
187
187
  SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS extreme_amount_cnt,
188
- -- 时间范围
188
+ -- Time range
189
189
  MIN(order_date) AS earliest_date,
190
190
  MAX(order_date) AS latest_date,
191
- -- 分布
191
+ -- Distribution
192
192
  approx_percentile(amount, 0.5) AS median_amount,
193
193
  approx_percentile(amount, 0.99) AS p99_amount
194
194
  FROM my_schema.orders;