@clickzetta/cz-cli-darwin-arm64 0.3.92 → 0.3.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,26 +1,26 @@
1
- # 数据写入、特征工程、模型推理示例
1
+ # Data Write, Feature Engineering, and Model Inference Examples
2
2
 
3
- ## 数据写入
3
+ ## Data Write
4
4
 
5
- | 场景 | 方式 |
5
+ | Scenario | Method |
6
6
  |------|------|
7
- | ZettaPark 可用(Python 3.10+) | `save_as_table()` 或 `create_dataframe().write` |
8
- | 本地 CSV/pandas 写入 | `session.create_dataframe(df).write.save_as_table()` |
9
- | Python 3.9 / ZettaPark 不可用 | cursor 批量 INSERT(见下方) |
10
- | **禁止** | `df.to_sql()`、SQLAlchemy `clickzetta://...` |
7
+ | ZettaPark available (Python 3.10+) | `save_as_table()` or `create_dataframe().write` |
8
+ | Local CSV/pandas write | `session.create_dataframe(df).write.save_as_table()` |
9
+ | Python 3.9 / ZettaPark unavailable | cursor batch INSERT (see below) |
10
+ | **Forbidden** | `df.to_sql()`, SQLAlchemy `clickzetta://...` |
11
11
 
12
12
  ```python
13
- # 方式 A:ZettaPark(推荐)
13
+ # Option A: ZettaPark (recommended)
14
14
  session.sql("""
15
15
  SELECT o.*, u.age_group FROM my_schema.orders_raw o
16
16
  LEFT JOIN my_schema.users u ON o.user_id = u.user_id
17
17
  WHERE o.amount > 0
18
18
  """).write.mode("overwrite").save_as_table("ds_workspace.orders_clean")
19
19
 
20
- # 方式 B:pandas → Lakehouse
20
+ # Option B: pandas → Lakehouse
21
21
  session.create_dataframe(local_df).write.mode("append").save_as_table("ds_workspace.features_v1")
22
22
 
23
- # 方式 C:cursor 批量 INSERT(fallback)
23
+ # Option C: cursor batch INSERT (fallback)
24
24
  import clickzetta, os
25
25
  conn = clickzetta.connect(
26
26
  service=os.environ["CLICKZETTA_SERVICE"], instance=os.environ["CLICKZETTA_INSTANCE"],
@@ -40,16 +40,16 @@ conn.close()
40
40
  ```
41
41
 
42
42
  ```sql
43
- -- 设置中间表生命周期(30 天自动清理)
43
+ -- Set intermediate table lifecycle (auto-cleanup after 30 days)
44
44
  ALTER TABLE ds_workspace.orders_clean SET PROPERTIES ('data_lifecycle' = '30');
45
45
  ```
46
46
 
47
47
  ---
48
48
 
49
- ## 特征工程
49
+ ## Feature Engineering
50
50
 
51
51
  ```sql
52
- -- SQL 侧(利用 Lakehouse 算力,推荐)
52
+ -- SQL side (leverages Lakehouse compute, recommended)
53
53
  SELECT
54
54
  user_id,
55
55
  COUNT(*) AS order_cnt_30d,
@@ -65,7 +65,7 @@ GROUP BY user_id;
65
65
  ```
66
66
 
67
67
  ```python
68
- # ZettaPark 侧(Python 逻辑)
68
+ # ZettaPark side (Python logic)
69
69
  from clickzetta.zettapark.functions import col, when
70
70
 
71
71
  features = session.table("ds_workspace.orders_clean") \
@@ -81,26 +81,26 @@ session.create_dataframe(df).write.mode("overwrite").save_as_table("ds_workspace
81
81
 
82
82
  ---
83
83
 
84
- ## 模型推理上线
84
+ ## Model Inference Deployment
85
85
 
86
- ### BITMAP 用户画像
86
+ ### BITMAP User Profiling
87
87
 
88
88
  ```sql
89
89
  CREATE TABLE ds_workspace.user_tags AS
90
90
  SELECT tag_name, group_bitmap_state(user_id) AS user_bitmap
91
91
  FROM my_schema.user_behavior GROUP BY tag_name;
92
92
 
93
- -- 人群交集
93
+ -- Audience intersection
94
94
  SELECT bitmap_count(bitmap_and(
95
- (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '高消费'),
96
- (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = '近30天活跃')
95
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'high_value'),
96
+ (SELECT user_bitmap FROM ds_workspace.user_tags WHERE tag_name = 'active_30d')
97
97
  )) AS target_user_count;
98
98
  ```
99
99
 
100
- ### SQL UDF 批量推理
100
+ ### SQL UDF Batch Inference
101
101
 
102
102
  ```sql
103
- -- 调用已部署的模型 UDF(必须用完整 schema 路径)
103
+ -- Call a deployed model UDF (must use full schema path)
104
104
  INSERT INTO ds_workspace.predictions
105
105
  SELECT user_id,
106
106
  ds_workspace.credit_score_model(total_amount_30d, order_cnt_30d, active_days, avg_amount_30d) AS score,
@@ -108,7 +108,7 @@ SELECT user_id,
108
108
  FROM ds_workspace.features_final;
109
109
  ```
110
110
 
111
- ### 向量检索
111
+ ### Vector Search
112
112
 
113
113
  ```sql
114
114
  SELECT candidate_id,
@@ -1,11 +1,11 @@
1
- # ZettaPark API 数据科学常用操作
1
+ # ZettaPark API — Common Data Science Operations
2
2
 
3
- > 来源:https://www.yunqi.tech/documents/ZettaparkQuickStart
4
- > **Python 版本**:推荐 3.12(最低 3.10)。安装:`python3.12 -m venv .venv && pip install clickzetta_zettapark_python`
3
+ > Source: https://www.yunqi.tech/documents/ZettaparkQuickStart
4
+ > **Python version**: 3.12 recommended (3.10 minimum). Install: `python3.12 -m venv .venv && pip install clickzetta_zettapark_python`
5
5
 
6
6
  ---
7
7
 
8
- ## Session 创建
8
+ ## Creating a Session
9
9
 
10
10
  ```python
11
11
  from clickzetta.zettapark.session import Session
@@ -31,47 +31,47 @@ session = Session.builder.configs({
31
31
 
32
32
  ---
33
33
 
34
- ## 数据读取
34
+ ## Reading Data
35
35
 
36
36
  ```python
37
- # 读取整张表
37
+ # Read an entire table
38
38
  df = session.table("my_schema.orders")
39
39
 
40
- # 执行 SQL 查询
40
+ # Execute a SQL query
41
41
  df = session.sql("SELECT * FROM my_schema.orders WHERE amount > 100")
42
42
 
43
- # 转为 pandas(小数据集)
43
+ # Convert to pandas (small datasets)
44
44
  pandas_df = df.to_pandas()
45
45
 
46
- # 分批读取大表(避免 OOM)
46
+ # Read large tables in batches (avoid OOM)
47
47
  pandas_df = session.sql("""
48
48
  SELECT * FROM my_schema.events
49
- TABLESAMPLE ROW (1) -- 1% 精确采样
49
+ TABLESAMPLE ROW (1) -- exact 1% sample
50
50
  """).to_pandas()
51
51
 
52
- # 只获取前 N 行
52
+ # Get first N rows only
53
53
  pandas_df = df.limit(10000).to_pandas()
54
54
  ```
55
55
 
56
56
  ---
57
57
 
58
- ## DataFrame 变换
58
+ ## DataFrame Transformations
59
59
 
60
60
  ```python
61
61
  from clickzetta.zettapark.functions import col, when, lit, sum as F_sum, count as F_count, avg as F_avg
62
62
 
63
- # 过滤
63
+ # Filter
64
64
  df_filtered = df.filter(col("amount") > 0)
65
65
  df_filtered = df.filter((col("status") == "COMPLETED") & (col("amount") > 100))
66
66
 
67
- # 选择列
67
+ # Select columns
68
68
  df_selected = df.select("user_id", "amount", "order_date")
69
69
 
70
- # 新增列
70
+ # Add columns
71
71
  df = df.with_column("log_amount", col("amount").cast("double"))
72
72
  df = df.with_column("is_high_value", when(col("amount") > 1000, 1).otherwise(0))
73
73
 
74
- # 聚合
74
+ # Aggregate
75
75
  agg_df = df.group_by("user_id").agg(
76
76
  F_sum("amount").as_("total_amount"),
77
77
  F_count("order_id").as_("order_cnt"),
@@ -81,22 +81,22 @@ agg_df = df.group_by("user_id").agg(
81
81
  # JOIN
82
82
  result = orders.join(users, orders["user_id"] == users["user_id"], "left")
83
83
 
84
- # 排序
84
+ # Sort
85
85
  df_sorted = df.sort(col("amount").desc())
86
86
  ```
87
87
 
88
88
  ---
89
89
 
90
- ## 数据写回
90
+ ## Writing Data Back
91
91
 
92
92
  ```python
93
- # 覆盖写入(常用于特征表更新)
93
+ # Overwrite (common for feature table updates)
94
94
  df.write.mode("overwrite").save_as_table("ds_workspace.features_v1")
95
95
 
96
- # 追加写入(常用于预测结果)
96
+ # Append (common for prediction results)
97
97
  df.write.mode("append").save_as_table("ds_workspace.predictions")
98
98
 
99
- # pandas DataFrame 写回
99
+ # Write a pandas DataFrame back
100
100
  import pandas as pd
101
101
  local_df = pd.DataFrame({"user_id": [1, 2], "score": [0.8, 0.6]})
102
102
  session.create_dataframe(local_df).write.mode("overwrite") \
@@ -105,7 +105,7 @@ session.create_dataframe(local_df).write.mode("overwrite") \
105
105
 
106
106
  ---
107
107
 
108
- ## pandas/scikit-learn 集成
108
+ ## Integration with pandas / scikit-learn
109
109
 
110
110
  ```python
111
111
  import pandas as pd
@@ -114,14 +114,14 @@ from sklearn.preprocessing import StandardScaler
114
114
  from sklearn.model_selection import train_test_split
115
115
  from sklearn.ensemble import GradientBoostingClassifier
116
116
 
117
- # 1. 从 Lakehouse 拉特征
117
+ # 1. Pull features from Lakehouse
118
118
  features_df = session.sql("""
119
119
  SELECT user_id, total_amount_30d, order_cnt_30d,
120
120
  active_days, avg_amount_30d, label
121
121
  FROM ds_workspace.features_final
122
122
  """).to_pandas()
123
123
 
124
- # 2. 本地处理
124
+ # 2. Local processing
125
125
  X = features_df.drop(["user_id", "label"], axis=1)
126
126
  y = features_df["label"]
127
127
 
@@ -130,17 +130,17 @@ X_scaled = scaler.fit_transform(X)
130
130
 
131
131
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
132
132
 
133
- # 3. 训练模型
133
+ # 3. Train model
134
134
  model = GradientBoostingClassifier(n_estimators=100)
135
135
  model.fit(X_train, y_train)
136
136
 
137
- # 4. 预测并写回
137
+ # 4. Predict and write back
138
138
  features_df["predicted_score"] = model.predict_proba(X_scaled)[:, 1]
139
139
  session.create_dataframe(
140
140
  features_df[["user_id", "predicted_score"]]
141
141
  ).write.mode("overwrite").save_as_table("ds_workspace.predictions")
142
142
 
143
- # 5. 保存模型
143
+ # 5. Save model
144
144
  import joblib
145
145
  joblib.dump(model, "models/gbm_model.pkl")
146
146
  joblib.dump(scaler, "models/scaler.pkl")
@@ -148,9 +148,9 @@ joblib.dump(scaler, "models/scaler.pkl")
148
148
 
149
149
  ---
150
150
 
151
- ## 注意事项
151
+ ## Notes
152
152
 
153
- - `to_pandas()` 会把数据全部拉到本地内存,大表必须先 `TABLESAMPLE` 或 `LIMIT`
154
- - `collect()` 返回 Row 对象列表,`to_pandas()` 返回 DataFrame,数据科学场景用后者
155
- - ZettaPark DataFrame 操作是懒执行,只有 `to_pandas()`/`collect()`/`show()`/`save_as_table()` 才真正触发计算
156
- - 写回时推荐用 `ds_workspace` 这样的专属 Schema,与生产数据隔离
153
+ - `to_pandas()` pulls all data into local memory — always `TABLESAMPLE` or `LIMIT` large tables first
154
+ - `collect()` returns a list of Row objects; `to_pandas()` returns a DataFrame — use the latter for data science
155
+ - ZettaPark DataFrame operations are lazy — computation only triggers on `to_pandas()` / `collect()` / `show()` / `save_as_table()`
156
+ - Write results to a dedicated schema like `ds_workspace` to keep them isolated from production data
@@ -256,7 +256,7 @@ CDC/Kafka 持续写入 Bronze → Silver(REFRESH INTERVAL 10 MINUTE)→ Gold
256
256
 
257
257
  ### DDL 模板
258
258
 
259
- 加载 `clickzetta-sql-syntax-guide` 确认语法,生成各层 DDL。
259
+ 参考 ClickZetta Lakehouse 官方文档确认语法,生成各层 DDL。如果是从 Snowflake / Databricks 迁移过来的 DDL,加载 `clickzetta-sql-migration` skill 处理语法差异。
260
260
 
261
261
  **数仓开发代码资产化原则:每段 SQL 都应保存为 Studio 任务,作为可管理的代码资产。**
262
262
 
@@ -1,102 +1,38 @@
1
1
  ---
2
2
  name: clickzetta-external-function
3
3
  description: |
4
- 在 ClickZetta Lakehouse 中创建和使用外部函数(External Function / UDF),
5
- 通过 Python 或 Java 扩展 SQL 计算能力,调用 LLM、图像识别、自定义算法等外部服务。
6
- 覆盖 CREATE API CONNECTION(阿里云FC/腾讯云SCF/AWS Lambda)、
7
- CREATE EXTERNAL FUNCTION、Python UDF 代码结构与打包、
8
- 内置 AI_COMPLETE 和 AI_EMBEDDING 函数的使用。
9
- 当用户说"外部函数"、"UDF"、"自定义函数"、"External Function"、
10
- "Remote Function"、"调用 LLM"、"AI_COMPLETE"、"AI_EMBEDDING"、
11
- "文本向量化"、"调用阿里云函数计算"、"调用云函数"、"Python UDF"、
12
- "Java UDF"、"CREATE EXTERNAL FUNCTION"时触发。
13
- Keywords: external function, UDF, Python UDF, Java UDF, LLM, custom function
4
+ Create and use External Functions (custom UDFs) in ClickZetta Lakehouse using Python or Java,
5
+ deployed on Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda.
6
+ Covers CREATE API CONNECTION (TYPE CLOUD_FUNCTION), CREATE EXTERNAL FUNCTION, Python/Java UDF code structure and packaging.
7
+ Keywords: external function, UDF, Python UDF, Java UDF, custom function, cloud function
14
8
  ---
15
9
 
16
- # ClickZetta External Function
10
+ # ClickZetta External Function (Custom UDF)
17
11
 
18
- External Function 让 SQL 可以调用外部计算能力(LLM、图像识别、自定义算法),通过 Python/Java 编写函数逻辑,部署在云函数服务上执行。
12
+ External Functions let SQL call custom compute logic written in Python or Java, deployed on a cloud function service (Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda).
19
13
 
20
- 阅读 [references/external-function-ddl.md](references/external-function-ddl.md) 了解完整语法。
14
+ See [references/external-function-ddl.md](references/external-function-ddl.md) for the full syntax reference.
21
15
 
22
16
  ---
23
17
 
24
- ## 两种使用路径
18
+ ## Overall Flow
25
19
 
26
- | 路径 | 适用场景 | 复杂度 |
27
- |---|---|---|
28
- | **内置 AI 函数**(AI_COMPLETE / AI_EMBEDDING) | 调用 LLM 生成文本、文本向量化 | 低,只需创建 API Connection |
29
- | **External Function** | 自定义算法、图像处理、私有模型 | 高,需部署云函数 |
30
-
31
- ---
32
-
33
- ## 路径一:内置 AI 函数(推荐)
34
-
35
- ### 1. 创建 AI API Connection
36
-
37
- ```sql
38
- CREATE API CONNECTION conn_bailian
39
- TYPE ai_function
40
- PROVIDER = 'bailian'
41
- BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
42
- API_KEY = '<key>';
43
20
  ```
44
-
45
- ### 2. AI_COMPLETE 调用 LLM
46
-
47
- ```sql
48
- -- 文本摘要
49
- SELECT id,
50
- AI_COMPLETE('connection:conn_bailian', '请用一句话总结:' || content) AS summary
51
- FROM articles;
52
-
53
- -- 情感分析
54
- SELECT id, review,
55
- AI_COMPLETE('connection:conn_bailian',
56
- '判断以下评论的情感(正面/负面/中性),只返回一个词:' || review) AS sentiment
57
- FROM user_reviews;
58
-
59
- -- 通过平台 Endpoint(管理员预配置)
60
- SELECT AI_COMPLETE('endpoint:my_llm_endpoint', prompt_col) AS result
61
- FROM my_table;
62
- ```
63
-
64
- ### 3. AI_EMBEDDING — 文本向量化
65
-
66
- ```sql
67
- -- 批量生成 embedding
68
- SELECT id, content,
69
- AI_EMBEDDING('connection:conn_bailian', content) AS vec
70
- FROM documents;
71
-
72
- -- 语义搜索(结合向量索引)
73
- SELECT id, content,
74
- cosine_distance(vec, AI_EMBEDDING('connection:conn_bailian', '用户查询')) AS dist
75
- FROM doc_embeddings
76
- ORDER BY dist
77
- LIMIT 10;
21
+ 1. Enable a cloud function service (Alibaba Cloud FC / Tencent Cloud SCF / AWS Lambda)
22
+ 2. Write Python/Java function code
23
+ 3. Package and upload to object storage or User Volume
24
+ 4. Grant Lakehouse access to the cloud function service (RAM role)
25
+ 5. CREATE API CONNECTION (TYPE CLOUD_FUNCTION)
26
+ 6. CREATE EXTERNAL FUNCTION
27
+ 7. Call the function in SQL
78
28
  ```
79
29
 
80
30
  ---
81
31
 
82
- ## 路径二:External Function(自定义 UDF)
83
-
84
- ### 整体流程
85
-
86
- ```
87
- 1. 开通云函数服务(阿里云FC / 腾讯云SCF / AWS Lambda)
88
- 2. 编写 Python/Java 函数代码
89
- 3. 打包上传到对象存储或 User Volume
90
- 4. 授权 Lakehouse 访问云函数服务(RAM 角色)
91
- 5. CREATE API CONNECTION
92
- 6. CREATE EXTERNAL FUNCTION
93
- 7. 在 SQL 中调用
94
- ```
95
-
96
- ### 步骤 1:创建云函数 API Connection
32
+ ## Step 1: Create a Cloud Function API Connection
97
33
 
98
34
  ```sql
99
- -- 阿里云 FC
35
+ -- Alibaba Cloud FC
100
36
  CREATE API CONNECTION IF NOT EXISTS my_fc_conn
101
37
  TYPE CLOUD_FUNCTION
102
38
  PROVIDER = 'aliyun'
@@ -105,7 +41,7 @@ CREATE API CONNECTION IF NOT EXISTS my_fc_conn
105
41
  NAMESPACE = 'default'
106
42
  CODE_BUCKET = 'my-oss-bucket';
107
43
 
108
- -- 腾讯云 SCF
44
+ -- Tencent Cloud SCF
109
45
  CREATE API CONNECTION IF NOT EXISTS my_scf_conn
110
46
  TYPE CLOUD_FUNCTION
111
47
  PROVIDER = 'tencent'
@@ -115,7 +51,9 @@ CREATE API CONNECTION IF NOT EXISTS my_scf_conn
115
51
  CODE_BUCKET = 'my-cos-bucket';
116
52
  ```
117
53
 
118
- ### 步骤 2:编写 Python UDF
54
+ ---
55
+
56
+ ## Step 2: Write a Python UDF
119
57
 
120
58
  ```python
121
59
  # upper.py
@@ -132,31 +70,34 @@ class Upper(object):
132
70
  return arg.upper()
133
71
  ```
134
72
 
135
- 打包上传:
73
+ Package and upload:
136
74
  ```bash
137
75
  zip -rq upper.zip upper.py
138
76
  ```
139
77
 
140
78
  ```sql
141
- -- 上传到 User Volume(在 ClickZetta Studio 或 CLI 中执行,source_path 使用绝对路径)
79
+ -- Upload to User Volume (run in ClickZetta Studio or CLI; source_path must be an absolute path)
142
80
  PUT '/path/to/upper.zip' TO USER VOLUME;
143
81
  ```
144
82
 
145
- ### 步骤 3:创建 External Function
83
+ ---
84
+
85
+ ## Step 3: Create the External Function
146
86
 
147
87
  ```sql
148
- -- ⚠️ CREATE EXTERNAL FUNCTION 不支持 OR REPLACE,只支持 IF NOT EXISTS
149
- -- ❌ 错误:CREATE OR REPLACE EXTERNAL FUNCTION ...
150
- -- ✅ 正确:
151
- -- 使用 User Volume 存放代码(无需 OSS)
88
+ -- ⚠️ CREATE EXTERNAL FUNCTION does not support OR REPLACE, only IF NOT EXISTS
89
+ -- ❌ Wrong: CREATE OR REPLACE EXTERNAL FUNCTION ...
90
+ -- ✅ Correct:
91
+
92
+ -- Using User Volume to store code (no object storage required)
152
93
  CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
153
94
  AS 'upper.Upper'
154
95
  USING FILE = 'volume:user://~/upper.zip'
155
96
  CONNECTION = my_fc_conn
156
97
  WITH PROPERTIES ('remote.udf.api' = 'python3.mc.v0')
157
- COMMENT '字符串转大写';
98
+ COMMENT 'Convert string to uppercase';
158
99
 
159
- -- 使用 OSS 存放代码
100
+ -- Using OSS to store code
160
101
  CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
161
102
  AS 'upper.Upper'
162
103
  USING FILE = 'oss://my-bucket/functions/upper.zip'
@@ -164,40 +105,41 @@ CREATE EXTERNAL FUNCTION IF NOT EXISTS public.str_upper
164
105
  WITH PROPERTIES ('remote.udf.api' = 'python3.mc.v0');
165
106
  ```
166
107
 
167
- ### 步骤 4:调用函数
108
+ ---
109
+
110
+ ## Step 4: Call the Function
168
111
 
169
112
  ```sql
170
- -- ⚠️ 调用外部函数必须使用完整 Schema 路径,不能省略 schema
171
- -- ❌ 错误:SELECT str_upper(name) FROM my_table;
172
- -- ✅ 正确:
113
+ -- ⚠️ External functions must be called with the full schema-qualified name; the schema cannot be omitted
114
+ -- ❌ Wrong: SELECT str_upper(name) FROM my_table;
115
+ -- ✅ Correct:
173
116
  SELECT id, public.str_upper(name) AS upper_name FROM my_table;
174
117
  ```
175
118
 
176
119
  ---
177
120
 
178
- ## 管理操作
121
+ ## Management
179
122
 
180
123
  ```sql
181
- -- 查看所有外部函数
124
+ -- List all external functions
182
125
  SHOW EXTERNAL FUNCTIONS;
183
126
  SHOW EXTERNAL FUNCTIONS LIKE 'str_%';
184
127
 
185
- -- 删除函数(注意:用 DROP FUNCTION,不是 DROP EXTERNAL FUNCTION)
128
+ -- Drop a function (use DROP FUNCTION, not DROP EXTERNAL FUNCTION)
186
129
  DROP FUNCTION IF EXISTS public.str_upper;
187
130
  ```
188
131
 
189
- > ⚠️ **注意**:`CREATE FUNCTION`(SQL 内联函数)只支持 SQL 表达式,不支持 Python/JavaScript 等编程语言。需要编程语言逻辑请使用 `CREATE EXTERNAL FUNCTION`。
132
+ > ⚠️ **Note**: `CREATE FUNCTION` (inline SQL function) only supports SQL expressions — it does not support Python, JavaScript, or other programming languages. Use `CREATE EXTERNAL FUNCTION` when you need full programming language logic.
190
133
 
191
134
  ---
192
135
 
193
- ## 常见问题
136
+ ## Troubleshooting
194
137
 
195
- | 问题 | 原因 | 解决方案 |
138
+ | Problem | Cause | Solution |
196
139
  |---|---|---|
197
- | 函数调用超时 | 云函数冷启动或执行慢 | 增大超时配置,或预热函数 |
198
- | 依赖库 ABI 不兼容 | 在 macOS/Windows 打包 | 用 `quay.io/pypa/manylinux2014_x86_64` 容器打包 |
199
- | 代码包 > 500MB | 依赖过大 | 改用容器镜像方式部署 |
200
- | AI_COMPLETE 报错 | API Key 无效或余额不足 | 检查 API Connection API_KEY |
201
- | ROLE_ARN 权限不足 | RAM 角色未授权 | 参考文档配置 AliyunFCFullAccess + OSS 权限 |
202
- | 函数调用报"not found" | 省略了 Schema 前缀 | 必须用完整路径:`schema.function_name(...)` |
203
- | CREATE OR REPLACE 报错 | EXTERNAL FUNCTION 不支持 OR REPLACE | 改用 `CREATE EXTERNAL FUNCTION IF NOT EXISTS` |
140
+ | Function call times out | Cloud function cold start or slow execution | Increase the timeout setting, or pre-warm the function |
141
+ | Dependency ABI incompatibility | Packaged on macOS/Windows | Use the `quay.io/pypa/manylinux2014_x86_64` container to package |
142
+ | Code package > 500 MB | Dependencies too large | Switch to container image deployment |
143
+ | ROLE_ARN permission denied | RAM role not authorized | Configure AliyunFCFullAccess + OSS permissions per the documentation |
144
+ | Function call returns "not found" | Schema prefix omitted | Always use the full path: `schema.function_name(...)` |
145
+ | CREATE OR REPLACE error | EXTERNAL FUNCTION does not support OR REPLACE | Use `CREATE EXTERNAL FUNCTION IF NOT EXISTS` instead |
@@ -1,4 +1,4 @@
1
- {"case_id":"001","type":"should_call","user_input":"怎么创建一个调用 LLM 的外部函数?需要哪些步骤?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","EXTERNAL FUNCTION"]}
2
- {"case_id":"002","type":"should_call","user_input":"AI_COMPLETE AI_EMBEDDING 内置函数怎么用?","expected_skill":"clickzetta-external-function","expected_output_contains":["AI_COMPLETE","AI_EMBEDDING"]}
3
- {"case_id":"003","type":"should_call","user_input":"Python UDF 的代码结构和打包方式是什么?","expected_skill":"clickzetta-external-function","expected_output_contains":["Python","def"]}
4
- {"case_id":"004","type":"should_call","user_input":"怎么创建 API CONNECTION 连接阿里云函数计算?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION"]}
1
+ {"case_id":"001","type":"should_call","user_input":"How do I create an external function to call a custom algorithm?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","EXTERNAL FUNCTION"]}
2
+ {"case_id":"002","type":"should_call","user_input":"What is the Python UDF code structure and how do I package it?","expected_skill":"clickzetta-external-function","expected_output_contains":["annotate","evaluate"]}
3
+ {"case_id":"003","type":"should_call","user_input":"How do I create an API Connection for Alibaba Cloud Function Compute?","expected_skill":"clickzetta-external-function","expected_output_contains":["API CONNECTION","CLOUD_FUNCTION"]}
4
+ {"case_id":"004","type":"should_call","user_input":"How do I call a UDF in SQL with the correct schema prefix?","expected_skill":"clickzetta-external-function","expected_output_contains":["schema","function_name"]}