npm - @clickzetta/cz-cli-darwin-x64 - Versions diffs - 0.3.92 → 0.3.93 - Mend

@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/bin/skills/clickzetta-data-science/references/bitmap-profile.md CHANGED Viewed

@@ -1,57 +1,57 @@
-# BITMAP 用户画像参考
+# BITMAP User Profiling Reference
-> 来源：https://www.yunqi.tech/documents/bitmap-type
+> Source: https://www.yunqi.tech/documents/bitmap-type
-BITMAP 是 ClickZetta 中用于高效存储和处理整数集合的数据类型，基于 Roaring Bitmap 压缩算法，特别适合用户画像、人群圈选、UV 统计等数据科学场景。
+BITMAP is a data type in ClickZetta for efficiently storing and processing integer sets, based on the Roaring Bitmap compression algorithm. It is particularly suited for data science use cases such as user profiling, audience segmentation, and UV counting.
 ---
-## 核心限制
+## Core Limitations
-- 支持 **64 位无符号整数**（0 到 2^64-1）
-- **不支持**比较操作（<、>、=）
-- **不支持** ORDER BY、GROUP BY、DISTINCT
-- **不能**作为 PRIMARY KEY、PARTITION KEY、CLUSTER KEY
+- Supports **64-bit unsigned integers** (0 to 2^64-1)
+- **Does not support** comparison operators (<, >, =)
+- **Does not support** ORDER BY, GROUP BY, DISTINCT
+- **Cannot** be used as PRIMARY KEY, PARTITION KEY, or CLUSTER KEY
 ---
-## 构建用户标签 BITMAP
+## Building User Tag BITMAPs
 ```sql
--- 方式 1：从行数据聚合构建（最常用）
+-- Option 1: Aggregate from row data (most common)
 CREATE TABLE ds_workspace.user_tags AS
 SELECT
     tag_name,
     group_bitmap_state(user_id) AS user_bitmap
 FROM (
-    -- 高消费用户
+    -- High-value users
     SELECT 'high_value' AS tag_name, user_id
     FROM my_schema.orders
     WHERE total_amount_30d > 1000
     UNION ALL
-    -- 近30天活跃用户
+    -- Active in last 30 days
     SELECT 'active_30d' AS tag_name, user_id
     FROM my_schema.events
     WHERE event_date >= CURRENT_DATE - INTERVAL 30 DAY
     UNION ALL
-    -- 已流失用户（90天未活跃）
+    -- Churned users (inactive for 90 days)
     SELECT 'churned' AS tag_name, user_id
     FROM my_schema.users
     WHERE last_active_date < CURRENT_DATE - INTERVAL 90 DAY
 ) t
 GROUP BY tag_name;
--- 方式 2：从数组构建
+-- Option 2: Build from an array
 INSERT INTO ds_workspace.user_tags VALUES
     ('vip', bitmap_build(ARRAY(1001, 1002, 1003, 1004)));
 ```
 ---
-## 人群圈选操作
+## Audience Segmentation Operations
 ```sql
--- 交集：同时满足多个标签（AND）
+-- Intersection: users matching all tags (AND)
 SELECT bitmap_count(
     bitmap_and(
         (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -59,7 +59,7 @@ SELECT bitmap_count(
     )
 ) AS target_count;
--- 并集：满足任一标签（OR）
+-- Union: users matching any tag (OR)
 SELECT bitmap_count(
     bitmap_or(
         (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -67,7 +67,7 @@ SELECT bitmap_count(
     )
 ) AS reach_count;
--- 差集：排除某类用户（ANDNOT）
+-- Difference: exclude a group (ANDNOT)
 SELECT bitmap_count(
     bitmap_andnot(
         (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -75,7 +75,7 @@ SELECT bitmap_count(
     )
 ) AS targetable_count;
--- 获取目标用户 ID 列表
+-- Get target user ID list
 SELECT bitmap_to_array(
     bitmap_andnot(
         (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
@@ -86,10 +86,10 @@ SELECT bitmap_to_array(
 ---
-## UV 统计（去重计数）
+## UV Counting (Distinct Count)
 ```sql
--- 日活跃用户数（DAU）
+-- Daily active users (DAU)
 SELECT
     event_date,
     bitmap_count(group_bitmap_state(user_id)) AS dau
@@ -97,11 +97,11 @@ FROM my_schema.events
 GROUP BY event_date
 ORDER BY event_date;
--- 周活跃用户数（WAU）—— 跨天去重
+-- Weekly active users (WAU) — deduplicated across days
 SELECT
     DATE_TRUNC('week', event_date) AS week_start,
     bitmap_count(
-        bitmap_or_agg(daily_bitmap)  -- 合并多天 bitmap
+        bitmap_or_agg(daily_bitmap)  -- merge multiple days' bitmaps
     ) AS wau
 FROM (
     SELECT event_date,
@@ -111,7 +111,7 @@ FROM (
 ) t
 GROUP BY 1;
--- 用户留存分析（新用户 vs 回访用户）
+-- User retention analysis (new vs. returning users)
 SELECT
     bitmap_count(
         bitmap_and(new_users.user_bitmap, return_users.user_bitmap)
@@ -128,19 +128,19 @@ FROM
 ---
-## 常用 BITMAP 函数速查
+## BITMAP Function Quick Reference
-| 函数 | 说明 | 示例 |
+| Function | Description | Example |
 |---|---|---|
-| `group_bitmap_state(col)` | 聚合构建 BITMAP | `GROUP BY tag` |
-| `bitmap_count(bm)` | 计算元素个数（UV） | `bitmap_count(user_bm)` |
-| `bitmap_and(a, b)` | 交集 | 同时满足 A 和 B |
-| `bitmap_or(a, b)` | 并集 | 满足 A 或 B |
-| `bitmap_andnot(a, b)` | 差集 | 在 A 中但不在 B 中 |
-| `bitmap_xor(a, b)` | 异或（只在一个中） | A、B 各自独有的 |
-| `bitmap_to_array(bm)` | 转为整数数组 | 获取用户 ID 列表 |
-| `bitmap_build(arr)` | 从数组构建 | `bitmap_build(ARRAY(1,2,3))` |
-| `bitmap_contains(bm, val)` | 检查是否包含某值 | `bitmap_contains(bm, user_id)` |
-| `bitmap_min(bm)` | 最小元素 | — |
-| `bitmap_max(bm)` | 最大元素 | — |
-| `to_bitmap(val)` | 单值转 BITMAP | `to_bitmap(user_id)` |
+| `group_bitmap_state(col)` | Aggregate to build a BITMAP | `GROUP BY tag` |
+| `bitmap_count(bm)` | Count elements (UV) | `bitmap_count(user_bm)` |
+| `bitmap_and(a, b)` | Intersection | Users in both A and B |
+| `bitmap_or(a, b)` | Union | Users in A or B |
+| `bitmap_andnot(a, b)` | Difference | In A but not in B |
+| `bitmap_xor(a, b)` | Symmetric difference | In A or in B, but not in both |
+| `bitmap_to_array(bm)` | Convert to integer array | Get user ID list |
+| `bitmap_build(arr)` | Build from array | `bitmap_build(ARRAY(1,2,3))` |
+| `bitmap_contains(bm, val)` | Check if value is present | `bitmap_contains(bm, user_id)` |
+| `bitmap_min(bm)` | Minimum element | — |
+| `bitmap_max(bm)` | Maximum element | — |
+| `to_bitmap(val)` | Convert single value to BITMAP | `to_bitmap(user_id)` |

package/bin/skills/clickzetta-data-science/references/data-patterns.md CHANGED Viewed

@@ -1,6 +1,6 @@
-# 数据发现、质量评估、清洗、EDA 示例
+# Data Discovery, Quality Assessment, Cleaning, and EDA Examples
-## 数据发现
+## Data Discovery
 ```python
 from src.config import get_session
@@ -20,10 +20,10 @@ session.sql("""
 ---
-## 数据质量评估
+## Data Quality Assessment
 ```sql
--- 基础统计
+-- Basic statistics
 SELECT
     COUNT(*)                                                          AS total_rows,
     COUNT(DISTINCT user_id)                                           AS unique_users,
@@ -32,11 +32,11 @@ SELECT
     ROUND(100.0 * SUM(CASE WHEN amount  IS NULL THEN 1 ELSE 0 END) / COUNT(*), 2) AS amount_null_pct
 FROM my_schema.orders;
--- 主键重复检查
+-- Duplicate primary key check
 SELECT order_id, COUNT(*) AS cnt
 FROM my_schema.orders GROUP BY order_id HAVING cnt > 1 LIMIT 10;
--- 数值分布（大表高效）
+-- Numeric distribution (efficient for large tables)
 SELECT
     approx_percentile(amount, 0.25) AS p25,
     approx_percentile(amount, 0.50) AS median,
@@ -45,25 +45,25 @@ SELECT
     MIN(amount) AS min_val, MAX(amount) AS max_val
 FROM my_schema.orders;
--- 高频值 TOP-K
+-- Top-K high-frequency values
 SELECT approx_top_k(status, 10) AS top_statuses FROM my_schema.orders;
--- 近似 UV
+-- Approximate distinct count
 SELECT approx_count_distinct(user_id) AS approx_uv FROM my_schema.events;
 ```
 ---
-## 数据清洗
+## Data Cleaning
 ```sql
--- 去重（保留最新一条）
+-- Deduplication (keep the latest record)
 SELECT * FROM (
     SELECT *, ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY update_time DESC) AS rn
     FROM my_schema.orders_raw
 ) WHERE rn = 1;
--- 缺失值处理 + 类型转换
+-- Null handling + type casting
 SELECT
     order_id, user_id,
     COALESCE(amount, 0.0)       AS amount,
@@ -72,7 +72,7 @@ SELECT
 FROM my_schema.orders_raw
 WHERE user_id IS NOT NULL;
--- 多表整合
+-- Multi-table integration
 SELECT o.order_id, o.user_id, o.amount, o.order_date,
        u.age_group, u.city, p.category, p.brand
 FROM my_schema.orders o
@@ -85,16 +85,16 @@ LEFT JOIN my_schema.products p ON o.product_id = p.product_id;
 ## EDA
 ```python
-# 采样策略
+# Sampling strategies
 df_quick = session.sql("""
     SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000
-""").to_pandas()  # SYSTEM：文件级，极快，适合 >100万行预览
+""").to_pandas()  # SYSTEM: file-level, very fast, good for >1M row previews
 df_ml = session.sql("""
     SELECT * FROM my_schema.events TABLESAMPLE ROW (10)
-""").to_pandas()  # ROW：行级精确，适合 ML 训练集
+""").to_pandas()  # ROW: exact row-level, good for ML training sets
-# 时序分析
+# Time series analysis
 session.sql("""
     SELECT
         DATE_TRUNC('day', order_time)  AS dt,

package/bin/skills/clickzetta-data-science/references/setup.md CHANGED Viewed

@@ -1,62 +1,62 @@
-# 环境搭建与项目配置
+# Environment Setup & Project Configuration
-## 环境搭建
+## Environment Setup
 ```bash
-# 方式 1：venv（推荐）
+# Option 1: venv (recommended)
 python3.12 -m venv .venv
 source .venv/bin/activate          # macOS/Linux
 pip install clickzetta_zettapark_python clickzetta-connector-python \
     python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
     -i https://pypi.tuna.tsinghua.edu.cn/simple
-# 方式 2：pyenv（需要切换 Python 版本时）
+# Option 2: pyenv (when you need to switch Python versions)
 pyenv install 3.12.9 && pyenv local 3.12.9
 python -m venv .venv && source .venv/bin/activate
 pip install clickzetta_zettapark_python clickzetta-connector-python \
     python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
     -i https://pypi.tuna.tsinghua.edu.cn/simple
-# 方式 3：conda
+# Option 3: conda
 conda create -n lakehouse-ds python=3.12 -y && conda activate lakehouse-ds
 pip install clickzetta_zettapark_python clickzetta-connector-python \
     python-dotenv pandas numpy scikit-learn pyarrow jupyterlab matplotlib seaborn \
     -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
-| 问题 | 修复 |
+| Issue | Fix |
 |------|------|
-| Python 3.8/3.9 | `pyenv install 3.12.9` 或 `python3.12 -m venv .venv` |
-| `pyarrow` 版本冲突 | `pip install pyarrow==14.0.0` |
-| M1/M2 Mac 报错 | `pip install --no-binary :all:` 或改用 conda |
-| 连接超时 | VCluster 未启动，在 Studio 中手动启动 |
+| Python 3.8/3.9 | `pyenv install 3.12.9` or `python3.12 -m venv .venv` |
+| `pyarrow` version conflict | `pip install pyarrow==14.0.0` |
+| M1/M2 Mac error | `pip install --no-binary :all:` or use conda |
+| Connection timeout | VCluster not started — start it manually in Studio |
 ---
-## Jupyter Kernel 配置
+## Jupyter Kernel Configuration
 ```bash
-# 注册 venv 为 Jupyter kernel（关键步骤，否则 notebook 用系统 Python）
+# Register the venv as a Jupyter kernel (critical step — otherwise the notebook uses the system Python)
 source .venv/bin/activate
 pip install ipykernel jupyterlab
 python -m ipykernel install --user --name lakehouse-ds --display-name "Python (lakehouse-ds)"
-# 启动 JupyterLab
+# Start JupyterLab
 jupyter lab --port=8888
 ```
-VS Code / Cursor：打开 `.ipynb` → 右上角 "Select Kernel" → 选 "Python (lakehouse-ds)"
+VS Code / Cursor: open `.ipynb` → top-right "Select Kernel" → choose "Python (lakehouse-ds)"
-| 问题 | 修复 |
+| Issue | Fix |
 |------|------|
-| `ModuleNotFoundError: clickzetta` | kernel 未选对，切换到注册的 venv kernel |
-| `.env` 读不到 | `load_dotenv(dotenv_path='../.env')` 指定路径 |
-| `to_pandas()` OOM | 加 `TABLESAMPLE ROW(1)` 或 `LIMIT` |
-| 图表不显示 | notebook 开头加 `%matplotlib inline` |
+| `ModuleNotFoundError: clickzetta` | Wrong kernel selected — switch to the registered venv kernel |
+| `.env` not loading | Use `load_dotenv(dotenv_path='../.env')` with an explicit path |
+| `to_pandas()` OOM | Add `TABLESAMPLE ROW(1)` or `LIMIT` |
+| Charts not showing | Add `%matplotlib inline` at the top of the notebook |
 ---
-## src/config.py 模板
+## src/config.py Template
 ```python
 import os, sys
@@ -65,7 +65,7 @@ from dotenv import load_dotenv
 from clickzetta.zettapark.session import Session
 import clickzetta
-# 多位置查找 .env
+# Search for .env in multiple locations
 for _p in [
     Path(__file__).parent.parent / ".env",
     Path.home() / ".config" / "kilo" / ".env",
@@ -77,12 +77,12 @@ for _p in [
         break
 def check_environment():
-    """在 00-env-check.ipynb 里调用，打印环境诊断。"""
+    """Call from 00-env-check.ipynb to print environment diagnostics."""
     ver = sys.version_info
     if ver < (3, 10):
         raise RuntimeError(
-            f"Python {ver.major}.{ver.minor} 不满足要求。ZettaPark 需要 Python 3.10+。\n"
-            "升级：brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
+            f"Python {ver.major}.{ver.minor} does not meet requirements. ZettaPark requires Python 3.10+.\n"
+            "Upgrade: brew install pyenv && pyenv install 3.12.9 && pyenv local 3.12.9"
         )
     print(f"✅ Python {ver.major}.{ver.minor}.{ver.micro}")
     for pkg, mod in [
@@ -94,12 +94,12 @@ def check_environment():
             m = __import__(mod.split(".")[0])
             print(f"✅ {pkg}: {getattr(m, '__version__', 'ok')}")
         except ImportError:
-            print(f"❌ {pkg}: 未安装 → pip install {pkg}")
+            print(f"❌ {pkg}: not installed → pip install {pkg}")
     try:
         s = get_session()
         print(f"✅ Lakehouse: {s.sql('SELECT current_workspace(), current_user()').collect()}")
     except Exception as e:
-        print(f"❌ Lakehouse 连接失败: {e}")
+        print(f"❌ Lakehouse connection failed: {e}")
 def get_session() -> Session:
     return Session.builder.configs({
@@ -113,7 +113,7 @@ def get_session() -> Session:
     }).create()
 def get_connector_connection():
-    """仅用于 pd.read_sql。禁止用于 df.to_sql()。"""
+    """For pd.read_sql only. Do NOT use with df.to_sql()."""
     return clickzetta.connect(
         service=os.environ["CLICKZETTA_SERVICE"],
         instance=os.environ["CLICKZETTA_INSTANCE"],
@@ -127,7 +127,7 @@ def get_connector_connection():
 ---
-## .env 模板
+## .env Template
 ```bash
 CLICKZETTA_SERVICE=cn-shanghai-alicloud.api.clickzetta.com

package/bin/skills/clickzetta-data-science/references/stats-functions.md CHANGED Viewed

@@ -1,17 +1,17 @@
-# 数据科学统计分析函数参考
+# Statistical Analysis Functions Reference for Data Science
 ---
-## 近似聚合函数（大表高效统计）
+## Approximate Aggregate Functions (Efficient for Large Tables)
-### approx_count_distinct — 近似 UV
+### approx_count_distinct — Approximate Distinct Count
 ```sql
--- 使用 HyperLogLog 算法，误差约 2%，比 COUNT(DISTINCT) 快 10x+
+-- Uses HyperLogLog algorithm, ~2% error, 10x+ faster than COUNT(DISTINCT)
 SELECT approx_count_distinct(user_id) AS approx_uv
 FROM my_schema.events;
--- 按天统计 DAU
+-- Daily active users (DAU)
 SELECT
     DATE(event_time) AS dt,
     approx_count_distinct(user_id) AS dau
@@ -20,10 +20,10 @@ GROUP BY 1
 ORDER BY 1;
 ```
-### approx_percentile — 近似分位数
+### approx_percentile — Approximate Percentiles
 ```sql
--- 中位数、四分位数、P95、P99
+-- Median, quartiles, P95, P99
 SELECT
     approx_percentile(amount, 0.25) AS p25,
     approx_percentile(amount, 0.50) AS median,
@@ -32,7 +32,7 @@ SELECT
     approx_percentile(amount, 0.99) AS p99
 FROM my_schema.orders;
--- 分组分位数
+-- Grouped percentiles
 SELECT
     category,
     approx_percentile(price, 0.5) AS median_price
@@ -40,14 +40,14 @@ FROM my_schema.products
 GROUP BY category;
 ```
-### approx_histogram — 近似直方图
+### approx_histogram — Approximate Histogram
 ```sql
--- 返回结构体数组：[{min, max, count}, ...]
+-- Returns a struct array: [{min, max, count}, ...]
 SELECT approx_histogram(amount, 10) AS hist
 FROM my_schema.orders;
--- 解析直方图（展开为行）
+-- Parse histogram (expand to rows)
 SELECT
     bucket.min AS bucket_min,
     bucket.max AS bucket_max,
@@ -58,15 +58,15 @@ FROM (
 );
 ```
-### approx_top_k — 近似 TOP-K 高频值
+### approx_top_k — Approximate Top-K High-Frequency Values
 ```sql
--- 找出出现最多的前 10 个城市
+-- Find the top 10 most frequent cities
 SELECT approx_top_k(city, 10) AS top_cities
 FROM my_schema.orders;
--- 返回结构体数组：[{value, count}, ...]
--- 解析展开（字段名是 value 和 count）
+-- Returns a struct array: [{value, count}, ...]
+-- Expand to rows (fields are value and count)
 SELECT item.value AS city, item.count AS cnt
 FROM (
     SELECT EXPLODE(approx_top_k(city, 10)) AS item
@@ -77,59 +77,59 @@ ORDER BY cnt DESC;
 ---
-## 精确统计函数
+## Exact Statistical Functions
 ### percentile / median
 ```sql
--- 精确中位数（小表用，大表用 approx_percentile）
+-- Exact median (use for small tables; use approx_percentile for large tables)
 SELECT
     percentile(amount, 0.5)  AS exact_median,
-    median(amount)           AS median_alias  -- 等价写法
+    median(amount)           AS median_alias  -- equivalent
 FROM my_schema.orders;
--- 多分位数
+-- Multiple percentiles
 SELECT percentile(amount, ARRAY(0.25, 0.5, 0.75, 0.9, 0.99))
 FROM my_schema.orders;
 ```
 ---
-## TABLESAMPLE 采样
+## TABLESAMPLE Sampling
 ```sql
--- ROW 模式：精确行级采样（适合 ML 训练集，< 1000万行）
-SELECT * FROM my_schema.events TABLESAMPLE ROW (10);      -- 精确 10%
-SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS);  -- 精确 5 行
+-- ROW mode: exact row-level sampling (good for ML training sets, <10M rows)
+SELECT * FROM my_schema.events TABLESAMPLE ROW (10);      -- exact 10%
+SELECT * FROM my_schema.events TABLESAMPLE ROW (5 ROWS);  -- exact 5 rows
--- SYSTEM 模式：文件级采样（适合大表快速预览，> 1000万行）
-SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000;  -- 约 0.1%
+-- SYSTEM mode: file-level sampling (good for large table quick preview, >10M rows)
+SELECT * FROM my_schema.events TABLESAMPLE SYSTEM (0.1) LIMIT 50000;  -- ~0.1%
--- 分层采样（按类别等比例采样）
+-- Stratified sampling (proportional by category)
 SELECT * FROM (
     SELECT *,
            ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn,
            COUNT(*) OVER (PARTITION BY category) AS cat_total
     FROM my_schema.products
 )
-WHERE rn <= CEIL(cat_total * 0.1);  -- 每类取 10%
+WHERE rn <= CEIL(cat_total * 0.1);  -- 10% per category
 ```
-| 场景 | 推荐模式 | 说明 |
+| Use Case | Recommended Mode | Notes |
 |---|---|---|
-| 快速数据预览 | SYSTEM | 极快，适合 > 100万行 |
-| ML 训练集构建 | ROW | 精确随机，保证代表性 |
-| 数据质量抽检 | SYSTEM | 快速抽样验证 |
-| 统计分析 | ROW | 精确概率采样 |
+| Quick data preview | SYSTEM | Very fast, good for >1M rows |
+| ML training set | ROW | Exact random, ensures representativeness |
+| Data quality spot check | SYSTEM | Fast sampling for validation |
+| Statistical analysis | ROW | Exact probability sampling |
-> ⚠️ **注意**：TABLESAMPLE 在小表（< 数万行）上可能返回全部数据，百分比采样不精确。小表直接用 `LIMIT` 即可。
+> ⚠️ **Note**: On small tables (fewer than tens of thousands of rows), TABLESAMPLE may return all rows, so percentage sampling is imprecise. For small tables, use `LIMIT` directly.
 ---
-## 窗口函数（时序/排名特征）
+## Window Functions (Time Series / Ranking Features)
 ```sql
--- 移动平均（7日）
+-- 7-day moving average
 SELECT
     dt,
     revenue,
@@ -139,7 +139,7 @@ SELECT
     ) AS revenue_7d_ma
 FROM daily_stats;
--- 环比增长率
+-- Period-over-period growth rate (change vs. the previous row, ordered by dt)
 SELECT
     dt,
     revenue,
@@ -148,7 +148,7 @@ SELECT
           / NULLIF(LAG(revenue, 1) OVER (ORDER BY dt), 0), 2) AS mom_growth_pct
 FROM daily_stats;
--- 用户行为排名（RFM 分析）
+-- User behavior ranking (RFM analysis)
 SELECT
     user_id,
     total_amount,
@@ -157,7 +157,7 @@ SELECT
     NTILE(5) OVER (ORDER BY last_order_date DESC) AS recency_quintile
 FROM user_rfm;
--- 去重保留最新（数据清洗常用）
+-- Deduplication keeping latest (common in data cleaning)
 SELECT * FROM (
     SELECT *,
            ROW_NUMBER() OVER (
@@ -170,25 +170,25 @@ SELECT * FROM (
 ---
-## 数据质量检查模板
+## Data Quality Check Template
 ```sql
--- 一次性输出所有关键质量指标
+-- Output all key quality metrics in one query
 SELECT
     COUNT(*)                                                    AS total_rows,
     COUNT(DISTINCT user_id)                                     AS unique_users,
-    -- 缺失率
+    -- Null rates
     ROUND(100.0 * COUNT(*) FILTER (WHERE user_id IS NULL)
           / COUNT(*), 2)                                        AS user_id_null_pct,
     ROUND(100.0 * COUNT(*) FILTER (WHERE amount IS NULL)
           / COUNT(*), 2)                                        AS amount_null_pct,
-    -- 异常值
+    -- Anomalies
     SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END)                AS negative_amount_cnt,
     SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END)          AS extreme_amount_cnt,
-    -- 时间范围
+    -- Time range
     MIN(order_date)                                             AS earliest_date,
     MAX(order_date)                                             AS latest_date,
-    -- 分布
+    -- Distribution
     approx_percentile(amount, 0.5)                             AS median_amount,
     approx_percentile(amount, 0.99)                            AS p99_amount
 FROM my_schema.orders;