npm - @clickzetta/cz-cli-darwin-arm64 - Versions diffs - 0.3.92 → 0.3.94 - Mend

@clickzetta/cz-cli-darwin-arm64 0.3.92 → 0.3.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)

package/bin/skills/clickzetta-data-science/references/write-and-infer.md CHANGED Viewed

@@ -1,26 +1,26 @@
-# 数据写入、特征工程、模型推理示例
+# Data Write, Feature Engineering, and Model Inference Examples
-## 数据写入
+## Data Write
-| 场景 | 方式 |
+| Scenario | Method |
 |------|------|
-| ZettaPark 可用（Python 3.10+） | `save_as_table()` 或 `create_dataframe().write` |
-| 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
-| Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT（见下方） |
-| **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
+| ZettaPark available (Python 3.10+) | `save_as_table()` or `create_dataframe().write` |
+| Local CSV/pandas write | `session.create_dataframe(df).write.save_as_table()` |
+| Python 3.9 / ZettaPark unavailable | cursor batch INSERT (see below) |
+| **Forbidden** | `df.to_sql()`, SQLAlchemy `clickzetta://...` |
 ```python
-# 方式 A：ZettaPark（推荐）
+# Option A: ZettaPark (recommended)
 session.sql("""
     SELECT o.*, u.age_group FROM my_schema.orders_raw o
     LEFT JOIN my_schema.users u ON o.user_id = u.user_id
     WHERE o.amount > 0
 """).write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
-# 方式 B：pandas → Lakehouse
+# Option B: pandas → Lakehouse
 session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
-# 方式 C：cursor 批量 INSERT（fallback）
+# Option C: cursor batch INSERT (fallback)
 import clickzetta, os
 conn = clickzetta.connect(
     service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
@@ -40,16 +40,16 @@ conn.close()
 ```
 ```sql
--- 设置中间表生命周期（30 天自动清理）
+-- Set intermediate table lifecycle (auto-cleanup after 30 days)
 ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
 ```
 ---
-## 特征工程
+## Feature Engineering
 ```sql
--- SQL 侧（利用 Lakehouse 算力，推荐）
+-- SQL side (leverages Lakehouse compute, recommended)
 SELECT
     user_id,
     COUNT(*)                                                    AS order_cnt_30d,
@@ -65,7 +65,7 @@ GROUP BY user_id;
 ```
 ```python
-# ZettaPark 侧（Python 逻辑）
+# ZettaPark side (Python logic)
 from clickzetta.zettapark.functions import col, when
 features = session.table("ds_workspace.orders_clean") \
@@ -81,26 +81,26 @@ session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace
 ---
-## 模型推理上线
+## Model Inference Deployment
-### BITMAP 用户画像
+### BITMAP User Profiling
 ```sql
 CREATE TABLE ds_workspace.user_tags AS
 SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
 FROM my_schema.user_behavior GROUP BY tag_name;
--- 人群交集
+-- Audience intersection
 SELECT bitmap_count(bitmap_and(
-    (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
-    (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
+    (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
+    (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'active_30d')
 )) AS target_user_count;
 ```
-### SQL UDF 批量推理
+### SQL UDF Batch Inference
 ```sql
--- 调用已部署的模型 UDF（必须用完整 schema 路径）
+-- Call a deployed model UDF (must use full schema path)
 INSERT INTO ds_workspace.predictions
 SELECT user_id,
        ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
@@ -108,7 +108,7 @@ SELECT user_id,
 FROM ds_workspace.features_final;
 ```
-### 向量检索
+### Vector Search
 ```sql
 SELECT candidate_id,

package/bin/skills/clickzetta-data-science/references/zettapark-api.md CHANGED Viewed

@@ -1,11 +1,11 @@
-# ZettaPark API 数据科学常用操作
+# ZettaPark API — Common Data Science Operations
-> 来源：https://www.yunqi.tech/documents/ZettaparkQuickStart
-> **Python 版本**：推荐 3.12（最低 3.10）。安装：`python3.12 -m venv .venv && pip install clickzetta_zettapark_python`
+> Source: https://www.yunqi.tech/documents/ZettaparkQuickStart
+> **Python version**: 3.12 recommended (3.10 minimum). Install: `python3.12 -m venv .venv && pip install clickzetta_zettapark_python`
 ---
-## Session 创建
+## Creating a Session
 ```python
 from clickzetta.zettapark.session import Session
@@ -31,47 +31,47 @@ session = Session.builder.configs({
 ---
-## 数据读取
+## Reading Data
 ```python
-# 读取整张表
+# Read an entire table
 df = session.table("my_schema.orders")
-# 执行 SQL 查询
+# Execute a SQL query
 df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
-# 转为 pandas（小数据集）
+# Convert to pandas (small datasets)
 pandas_df = df.to_pandas()
-# 分批读取大表（避免 OOM）
+# Sample large tables instead of reading them in full (avoid OOM)
 pandas_df = session.sql("""
     SELECT * FROM my_schema.events
-    TABLESAMPLE ROW (1)   -- 1% 精确采样
+    TABLESAMPLE ROW (1)   -- exact 1% sample
 """).to_pandas()
-# 只获取前 N 行
+# Get first N rows only
 pandas_df = df.limit(10000).to_pandas()
 ```
 ---
-## DataFrame 变换
+## DataFrame Transformations
 ```python
 from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
-# 过滤
+# Filter
 df_filtered = df.filter(col("amount") > 0)
 df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
-# 选择列
+# Select columns
 df_selected = df.select("user_id", "amount", "order_date")
-# 新增列
+# Add columns
 df = df.with_column("log_amount", col("amount").cast("double"))
 df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
-# 聚合
+# Aggregate
 agg_df = df.group_by("user_id").agg(
     F_sum("amount").as_("total_amount"),
     F_count("order_id").as_("order_cnt"),
@@ -81,22 +81,22 @@ agg_df = df.group_by("user_id").agg(
 # JOIN
 result = orders.join(users, orders["user_id"] == users["user_id"], "left")
-# 排序
+# Sort
 df_sorted = df.sort(col("amount").desc())
 ```
 ---
-## 数据写回
+## Writing Data Back
 ```python
-# 覆盖写入（常用于特征表更新）
+# Overwrite (common for feature table updates)
 df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
-# 追加写入（常用于预测结果）
+# Append (common for prediction results)
 df.write.mode("append").save_as_table("ds_workspace.predictions")
-# pandas DataFrame 写回
+# Write a pandas DataFrame back
 import pandas as pd
 local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
 session.create_dataframe(local_df).write.mode("overwrite") \
@@ -105,7 +105,7 @@ session.create_dataframe(local_df).write.mode("overwrite") \
 ---
-## 与 pandas/scikit-learn 集成
+## Integration with pandas / scikit-learn
 ```python
 import pandas as pd
@@ -114,14 +114,14 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import GradientBoostingClassifier
-# 1. 从 Lakehouse 拉特征
+# 1. Pull features from Lakehouse
 features_df = session.sql("""
     SELECT user_id, total_amount_30d, order_cnt_30d,
            active_days, avg_amount_30d, label
     FROM ds_workspace.features_final
 """).to_pandas()
-# 2. 本地处理
+# 2. Local processing
 X = features_df.drop(["user_id", "label"], axis=1)
 y = features_df["label"]
@@ -130,17 +130,17 @@ X_scaled = scaler.fit_transform(X)
 X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
-# 3. 训练模型
+# 3. Train model
 model = GradientBoostingClassifier(n_estimators=100)
 model.fit(X_train, y_train)
-# 4. 预测并写回
+# 4. Predict and write back
 features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
 session.create_dataframe(
     features_df[["user_id", "predicted_score"]]
 ).write.mode("overwrite").save_as_table("ds_workspace.predictions")
-# 5. 保存模型
+# 5. Save model
 import joblib
 joblib.dump(model, "models/gbm_model.pkl")
 joblib.dump(scaler, "models/scaler.pkl")
@@ -148,9 +148,9 @@ joblib.dump(scaler, "models/scaler.pkl")
 ---
-## 注意事项
+## Notes
-- `to_pandas()` 会把数据全部拉到本地内存，大表必须先 `TABLESAMPLE` 或 `LIMIT`
-- `collect()` 返回 Row 对象列表，`to_pandas()` 返回 DataFrame，数据科学场景用后者
-- ZettaPark 的 DataFrame 操作是懒执行，只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
-- 写回时推荐用 `ds_workspace` 这样的专属 Schema，与生产数据隔离
+- `to_pandas()` pulls all data into local memory — always `TABLESAMPLE` or `LIMIT` large tables first
+- `collect()` returns a list of Row objects; `to_pandas()` returns a DataFrame — use the latter for data science
+- ZettaPark DataFrame operations are lazy — computation only triggers on `to_pandas()` / `collect()` / `show()` / `save_as_table()`
+- Write results to a dedicated schema like `ds_workspace` to keep them isolated from production data

package/bin/skills/clickzetta-dw-modeling/SKILL.md CHANGED Viewed

@@ -256,7 +256,7 @@ CDC/Kafka 持续写入 Bronze → Silver（REFRESH INTERVAL 10 MINUTE）→ Gold
 ### DDL 模板
-加载 `clickzetta-sql-syntax-guide` 确认语法，生成各层 DDL。
+参考 ClickZetta Lakehouse 官方文档确认语法，生成各层 DDL。如果是从 Snowflake / Databricks 迁移过来的 DDL，加载 `clickzetta-sql-migration` skill 处理语法差异。
 **数仓开发代码资产化原则：每段 SQL 都应保存为 Studio 任务，作为可管理的代码资产。**

package/bin/skills/clickzetta-external-function/SKILL.md CHANGED Viewed

@@ -1,102 +1,38 @@
 ---
 name: clickzetta-external-function
 description: |
-  在 ClickZetta Lakehouse 中创建和使用外部函数（External Function / UDF），
-  通过 Python 或 Java 扩展 SQL 计算能力，调用 LLM、图像识别、自定义算法等外部服务。
-  覆盖 CREATE API CONNECTION（阿里云FC/腾讯云SCF/AWS Lambda）、
-  CREATE EXTERNAL FUNCTION、Python UDF 代码结构与打包、
-  内置 AI_COMPLETE 和 AI_EMBEDDING 函数的使用。
-  当用户说"外部函数"、"UDF"、"自定义函数"、"External Function"、
-  "Remote Function"、"调用 LLM"、"AI_COMPLETE"、"AI_EMBEDDING"、
-  "文本向量化"、"调用阿里云函数计算"、"调用云函数"、"Python UDF"、
-  "Java UDF"、"CREATE EXTERNAL FUNCTION"时触发。
-  Keywords: external function, UDF, Python UDF, Java UDF, LLM, custom function
+  Create and use External Functions (custom UDFs) in ClickZetta Lakehouse using Python or Java,
+  deployed on Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda.
+  Covers CREATE API CONNECTION (TYPE CLOUD_FUNCTION), CREATE EXTERNAL FUNCTION, Python/Java UDF code structure and packaging.
+  Keywords: external function, UDF, Python UDF, Java UDF, custom function, cloud function
 ---
-# ClickZetta External Function
+# ClickZetta External Function (Custom UDF)
-External Function 让 SQL 可以调用外部计算能力（LLM、图像识别、自定义算法），通过 Python/Java 编写函数逻辑，部署在云函数服务上执行。
+External Functions let SQL call custom compute logic written in Python or Java, deployed on a cloud function service (Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda).
-阅读 [references/external-function-ddl.md](references/external-function-ddl.md) 了解完整语法。
+See [references/external-function-ddl.md](references/external-function-ddl.md) for the full syntax reference.
 ---
-## 两种使用路径
+## Overall Flow
-| 路径 | 适用场景 | 复杂度 |
-|---|---|---|
-| **内置 AI 函数**（AI_COMPLETE / AI_EMBEDDING） | 调用 LLM 生成文本、文本向量化 | 低，只需创建 API Connection |
-| **External Function** | 自定义算法、图像处理、私有模型 | 高，需部署云函数 |
----
-## 路径一：内置 AI 函数（推荐）
-### 1. 创建 AI API Connection
-```sql
-CREATE API CONNECTION conn_bailian
-    TYPE ai_function
-    PROVIDER = 'bailian'
-    BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
-    API_KEY = '<key>';
 ```
-### 2. AI_COMPLETE — 调用 LLM
-```sql
--- 文本摘要
-SELECT id,
-       AI_COMPLETE('connection:conn_bailian', '请用一句话总结：' || content) AS summary
-FROM articles;
--- 情感分析
-SELECT id, review,
-       AI_COMPLETE('connection:conn_bailian',
-           '判断以下评论的情感（正面/负面/中性），只返回一个词：' || review) AS sentiment
-FROM user_reviews;
--- 通过平台 Endpoint（管理员预配置）
-SELECT AI_COMPLETE('endpoint:my_llm_endpoint', prompt_col) AS result
-FROM my_table;
-```
-### 3. AI_EMBEDDING — 文本向量化
-```sql
--- 批量生成 embedding
-SELECT id, content,
-       AI_EMBEDDING('connection:conn_bailian', content) AS vec
-FROM documents;
--- 语义搜索（结合向量索引）
-SELECT id, content,
-       cosine_distance(vec, AI_EMBEDDING('connection:conn_bailian', '用户查询')) AS dist
-FROM doc_embeddings
-ORDER BY dist
-LIMIT 10;
+1. Enable a cloud function service (Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda)
+2. Write Python/Java function code
+3. Package and upload to object storage or User Volume
+4. Grant Lakehouse access to the cloud function service (RAM role)
+5. CREATE API CONNECTION (TYPE CLOUD_FUNCTION)
+6. CREATE EXTERNAL FUNCTION
+7. Call the function in SQL
 ```
 ---
-## 路径二：External Function（自定义 UDF）
-### 整体流程
-```
-1. 开通云函数服务（阿里云FC / 腾讯云SCF / AWS Lambda）
-2. 编写 Python/Java 函数代码
-3. 打包上传到对象存储或 User Volume
-4. 授权 Lakehouse 访问云函数服务（RAM 角色）
-5. CREATE API CONNECTION
-6. CREATE EXTERNAL FUNCTION
-7. 在 SQL 中调用
-```
-### 步骤 1：创建云函数 API Connection
+## Step 1: Create a Cloud Function API Connection
 ```sql
--- 阿里云 FC
+-- Alibaba Cloud FC
 CREATE API CONNECTION IF NOT EXISTS my_fc_conn
   TYPE CLOUD_FUNCTION
   PROVIDER = 'aliyun'
@@ -105,7 +41,7 @@ CREATE API CONNECTION IF NOT EXISTS my_fc_conn
   NAMESPACE = 'default'
   CODE_BUCKET = 'my-oss-bucket';
--- 腾讯云 SCF
+-- Tencent Cloud SCF
 CREATE API CONNECTION IF NOT EXISTS my_scf_conn
   TYPE CLOUD_FUNCTION
   PROVIDER = 'tencent'
@@ -115,7 +51,9 @@ CREATE API CONNECTION IF NOT EXISTS my_scf_conn
   CODE_BUCKET = 'my-cos-bucket';
 ```
-### 步骤 2：编写 Python UDF
+---
+## Step 2: Write a Python UDF
 ```python
 # upper.py
@@ -132,31 +70,34 @@ class Upper(object):
         return arg.upper()
 ```
-打包上传：
+Package and upload:
 ```bash
 zip -rq upper.zip upper.py
 ```
 ```sql
--- 上传到 User Volume（在 ClickZetta Studio 或 CLI 中执行，source_path 使用绝对路径）
+-- Upload to User Volume (run in ClickZetta Studio or CLI; source_path must be an absolute path)
 PUT '/path/to/upper.zip' TO USER VOLUME;
 ```
-### 步骤 3：创建 External Function
+---
+## Step 3: Create the External Function
 ```sql
--- ⚠️ CREATE EXTERNAL FUNCTION 不支持 OR REPLACE，只支持 IF NOT EXISTS
--- ❌ 错误：CREATE OR REPLACE EXTERNAL FUNCTION ...
--- ✅ 正确：
--- 使用 User Volume 存放代码（无需 OSS）
+-- ⚠️ CREATE EXTERNAL FUNCTION does not support OR REPLACE, only IF NOT EXISTS
+-- ❌ Wrong: CREATE OR REPLACE EXTERNAL FUNCTION ...
+-- ✅ Correct:
+-- Using User Volume to store code (no object storage required)
 CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
   AS 'upper.Upper'
   USING FILE = 'volume:user://~/upper.zip'
   CONNECTION = my_fc_conn
   WITH PROPERTIES ('remote.udf.api' = 'python3.mc.v0')
-  COMMENT '字符串转大写';
+  COMMENT 'Convert string to uppercase';
--- 使用 OSS 存放代码
+-- Using OSS to store code
 CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
   AS 'upper.Upper'
   USING FILE = 'oss://my-bucket/functions/upper.zip'
@@ -164,40 +105,41 @@ CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
   WITH PROPERTIES ('remote.udf.api' = 'python3.mc.v0');
 ```
-### 步骤 4：调用函数
+---
+## Step 4: Call the Function
 ```sql
--- ⚠️ 调用外部函数必须使用完整 Schema 路径，不能省略 schema
--- ❌ 错误：SELECT str_upper(name) FROM my_table;
--- ✅ 正确：
+-- ⚠️ External functions must be called with the full schema-qualified name; the schema cannot be omitted
+-- ❌ Wrong: SELECT str_upper(name) FROM my_table;
+-- ✅ Correct:
 SELECT id, public.str_upper(name) AS upper_name FROM my_table;
 ```
 ---
-## 管理操作
+## Management
 ```sql
--- 查看所有外部函数
+-- List all external functions
 SHOW EXTERNAL FUNCTIONS;
 SHOW EXTERNAL FUNCTIONS LIKE 'str_%';
--- 删除函数（注意：用 DROP FUNCTION，不是 DROP EXTERNAL FUNCTION）
+-- Drop a function (use DROP FUNCTION, not DROP EXTERNAL FUNCTION)
 DROP FUNCTION IF EXISTS public.str_upper;
 ```
-> ⚠️ **注意**：`CREATE FUNCTION`（SQL 内联函数）只支持 SQL 表达式，不支持 Python/JavaScript 等编程语言。需要编程语言逻辑请使用 `CREATE EXTERNAL FUNCTION`。
+> ⚠️ **Note**: `CREATE FUNCTION` (inline SQL function) only supports SQL expressions — it does not support Python, JavaScript, or other programming languages. Use `CREATE EXTERNAL FUNCTION` when you need full programming language logic.
 ---
-## 常见问题
+## Troubleshooting
-| 问题 | 原因 | 解决方案 |
+| Problem | Cause | Solution |
 |---|---|---|
-| 函数调用超时 | 云函数冷启动或执行慢 | 增大超时配置，或预热函数 |
-| 依赖库 ABI 不兼容 | 在 macOS/Windows 打包 | 用 `quay.io/pypa/manylinux2014_x86_64` 容器打包 |
-| 代码包 > 500MB | 依赖过大 | 改用容器镜像方式部署 |
-| AI_COMPLETE 报错 | API Key 无效或余额不足 | 检查 API Connection 的 API_KEY |
-| ROLE_ARN 权限不足 | RAM 角色未授权 | 参考文档配置 AliyunFCFullAccess + OSS 权限 |
-| 函数调用报"not found" | 省略了 Schema 前缀 | 必须用完整路径：`schema.function_name(...)` |
-| CREATE OR REPLACE 报错 | EXTERNAL FUNCTION 不支持 OR REPLACE | 改用 `CREATE EXTERNAL FUNCTION IF NOT EXISTS` |
+| Function call times out | Cloud function cold start or slow execution | Increase the timeout setting, or pre-warm the function |
+| Dependency ABI incompatibility | Packaged on macOS/Windows | Use the `quay.io/pypa/manylinux2014_x86_64` container to package |
+| Code package > 500 MB | Dependencies too large | Switch to container image deployment |
+| ROLE_ARN permission denied | RAM role not authorized | Configure AliyunFCFullAccess + OSS permissions per the documentation |
+| Function call returns "not found" | Schema prefix omitted | Always use the full path: `schema.function_name(...)` |
+| CREATE OR REPLACE error | EXTERNAL FUNCTION does not support OR REPLACE | Use `CREATE EXTERNAL FUNCTION IF NOT EXISTS` instead |

package/bin/skills/clickzetta-external-function/eval_cases.jsonl CHANGED Viewed

@@ -1,4 +1,4 @@
-{"case_id":"001","type":"should_call","user_input":"怎么创建一个调用 LLM 的外部函数？需要哪些步骤？","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","EXTERNAL FUNCTION"]}
-{"case_id":"002","type":"should_call","user_input":"AI_COMPLETE 和 AI_EMBEDDING 内置函数怎么用？","expected_skill":"clickzetta-external-function","expected_output_contains":["AI_COMPLETE","AI_EMBEDDING"]}
-{"case_id":"003","type":"should_call","user_input":"Python UDF 的代码结构和打包方式是什么？","expected_skill":"clickzetta-external-function","expected_output_contains":["Python","def"]}
-{"case_id":"004","type":"should_call","user_input":"怎么创建 API CONNECTION 连接阿里云函数计算？","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION"]}
+{"case_id":"001","type":"should_call","user_input":"How do I create an external function to call a custom algorithm?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","EXTERNAL FUNCTION"]}
+{"case_id":"002","type":"should_call","user_input":"What is the Python UDF code structure and how do I package it?","expected_skill":"clickzetta-external-function","expected_output_contains":["annotate","evaluate"]}
+{"case_id":"003","type":"should_call","user_input":"How do I create an API Connection for Alibaba Cloud Function Compute?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","CLOUD_FUNCTION"]}
+{"case_id":"004","type":"should_call","user_input":"How do I call a UDF in SQL with the correct schema prefix?","expected_skill":"clickzetta-external-function","expected_output_contains":["schema","function_name"]}