@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
package/bin/cz-cli CHANGED
Binary file
@@ -0,0 +1,109 @@
1
+ ---
2
+ name: clickzetta-ai-function
3
+ description: |
4
+ Use ClickZetta built-in AI functions: AI_COMPLETE (call LLMs) and AI_EMBEDDING (text vectors).
5
+ Covers CREATE API CONNECTION (TYPE ai_function), AI_COMPLETE, AI_EMBEDDING.
6
+ Keywords: AI_COMPLETE, AI_EMBEDDING, LLM, text embedding, semantic search, built-in AI function
7
+ ---
8
+
9
+ # ClickZetta Built-in AI Functions
10
+
11
+ ClickZetta provides two built-in AI functions that let you call LLMs and generate text embeddings directly from SQL — no cloud function deployment required. You only need an API Connection.
12
+
13
+ See [references/ai-function-ddl.md](references/ai-function-ddl.md) for the full syntax reference.
14
+
15
+ ---
16
+
17
+ ## Quick Start
18
+
19
+ ```sql
20
+ -- 1. Create an AI API Connection
21
+ CREATE API CONNECTION conn_bailian
22
+ TYPE ai_function
23
+ BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
24
+ API_KEY = '<your-api-key>';
25
+
26
+ -- 2. Call AI_COMPLETE to summarize text
27
+ SELECT id,
28
+ AI_COMPLETE('conn_bailian:qwen3-plus', 'Summarize in one sentence: ' || content) AS summary
29
+ FROM articles
30
+ LIMIT 10;
31
+
32
+ -- 3. Call AI_EMBEDDING to generate vectors
33
+ SELECT id,
34
+ AI_EMBEDDING('conn_bailian:text-embedding-v3', content) AS vec
35
+ FROM documents
36
+ LIMIT 10;
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Create an AI API Connection
42
+
43
+ ```sql
44
+ CREATE API CONNECTION conn_bailian
45
+ TYPE ai_function
46
+ BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
47
+ API_KEY = '<key>';
48
+ ```
49
+
50
+ | Parameter | Description |
51
+ |---|---|
52
+ | TYPE | Must be `ai_function` |
53
+ | BASE_URL | Provider API base URL |
54
+ | API_KEY | API key for the provider |
55
+
56
+ ---
57
+
58
+ ## AI_COMPLETE — Call an LLM
59
+
60
+ ```sql
61
+ -- Text summarization
62
+ SELECT id,
63
+ AI_COMPLETE('conn_bailian:qwen3-plus', 'Summarize in one sentence: ' || content) AS summary
64
+ FROM articles;
65
+
66
+ -- Sentiment analysis
67
+ SELECT id, review,
68
+ AI_COMPLETE('conn_bailian:qwen3-plus',
69
+ 'Classify the sentiment of the following review (positive/negative/neutral), return one word only: ' || review) AS sentiment
70
+ FROM user_reviews;
71
+
72
+ -- Text classification
73
+ SELECT id, description,
74
+ AI_COMPLETE('conn_bailian:qwen3-plus',
75
+ 'Classify this product description into one category (Electronics/Clothing/Food): ' || description) AS category
76
+ FROM products;
77
+
78
+ -- Via a platform Endpoint (pre-configured by admin, no API key needed)
79
+ SELECT AI_COMPLETE('my_llm_endpoint:qwen3-plus', prompt_col) AS result
80
+ FROM my_table;
81
+ ```
82
+
83
+ ---
84
+
85
+ ## AI_EMBEDDING — Text Embedding
86
+
87
+ ```sql
88
+ -- Batch generate embeddings
89
+ SELECT id, content,
90
+ AI_EMBEDDING('conn_bailian:text-embedding-v3', content) AS vec
91
+ FROM documents;
92
+
93
+ -- Semantic search (combined with a vector index)
94
+ SELECT id, content,
95
+ cosine_distance(vec, AI_EMBEDDING('conn_bailian:text-embedding-v3', 'user query text')) AS dist
96
+ FROM doc_embeddings
97
+ ORDER BY dist
98
+ LIMIT 10;
99
+ ```
100
+
101
+ ---
102
+
103
+ ## Troubleshooting
104
+
105
+ | Problem | Cause | Solution |
106
+ |---|---|---|
107
+ | AI_COMPLETE / AI_EMBEDDING error | Invalid API key or insufficient balance | Check the API_KEY in the API Connection and verify the provider account balance |
108
+ | Slow response | LLM API latency | Expected for large batches; consider filtering rows first |
109
+ | Empty or unexpected output | Prompt not specific enough | Refine the prompt with clearer instructions |
@@ -0,0 +1,4 @@
1
+ {"case_id":"001","type":"should_call","user_input":"How do I use AI_COMPLETE to summarize text in SQL?","expected_skill":"clickzetta-ai-function","expected_output_contains":["AI_COMPLETE","connection"]}
2
+ {"case_id":"002","type":"should_call","user_input":"How do I generate text embeddings with AI_EMBEDDING?","expected_skill":"clickzetta-ai-function","expected_output_contains":["AI_EMBEDDING","connection"]}
3
+ {"case_id":"003","type":"should_call","user_input":"How do I create an AI API Connection for Bailian?","expected_skill":"clickzetta-ai-function","expected_output_contains":["API CONNECTION","ai_function"]}
4
+ {"case_id":"004","type":"should_call","user_input":"How do I do semantic search using vector embeddings in ClickZetta?","expected_skill":"clickzetta-ai-function","expected_output_contains":["AI_EMBEDDING","cosine_distance"]}
@@ -0,0 +1,106 @@
1
+ # AI Function DDL Reference
2
+
3
+ ## Concepts
4
+
5
+ ClickZetta provides two built-in AI functions that call external LLM APIs directly from SQL:
6
+
7
+ | Function | Purpose |
8
+ |---|---|
9
+ | `AI_COMPLETE(source, prompt)` | Call an LLM to generate text (summarization, classification, Q&A, etc.) |
10
+ | `AI_EMBEDDING(source, text)` | Generate a vector embedding for a text string |
11
+
12
+ Both functions require an **API Connection** of `TYPE ai_function`, or a platform **Endpoint** pre-configured by an admin.
13
+
14
+ ---
15
+
16
+ ## CREATE API CONNECTION (AI Function)
17
+
18
+ ```sql
19
+ CREATE API CONNECTION conn_bailian
20
+ TYPE ai_function
21
+ BASE_URL = 'https://dashscope.aliyuncs.com/api/v1'
22
+ API_KEY = '<key>';
23
+ ```
24
+
25
+ | Parameter | Description |
26
+ |---|---|
27
+ | TYPE | `ai_function` — distinguishes from cloud function connections |
28
+ | BASE_URL | Provider API base URL |
29
+ | API_KEY | API key for authentication |
30
+
31
+ ---
32
+
33
+ ## AI_COMPLETE
34
+
35
+ ```sql
36
+ AI_COMPLETE('<connection-name>:<model-name>', prompt)
37
+ ```
38
+
39
+ | Argument | Description |
40
+ |---|---|
41
+ | `source` | `'<connection-name>:<model-name>'` — connection name and model name joined by `:` |
42
+ | `prompt` | A string expression — the prompt sent to the LLM |
43
+
44
+ Returns: `STRING` — the LLM's text response.
45
+
46
+ ### Examples
47
+
48
+ ```sql
49
+ -- Summarization
50
+ SELECT AI_COMPLETE('conn_bailian:qwen3-plus', 'Summarize in one sentence: ' || content) AS summary
51
+ FROM articles;
52
+
53
+ -- Sentiment analysis
54
+ SELECT AI_COMPLETE('conn_bailian:qwen3-plus',
55
+ 'Classify sentiment (positive/negative/neutral), one word only: ' || review) AS sentiment
56
+ FROM user_reviews;
57
+
58
+ -- Via platform Endpoint (pre-configured by admin)
59
+ SELECT AI_COMPLETE('my_llm_endpoint:qwen3-plus', prompt_col) AS result
60
+ FROM my_table;
61
+ ```
62
+
63
+ ---
64
+
65
+ ## AI_EMBEDDING
66
+
67
+ ```sql
68
+ AI_EMBEDDING('<connection-name>:<model-name>', text)
69
+ ```
70
+
71
+ | Argument | Description |
72
+ |---|---|
73
+ | `source` | `'<connection-name>:<model-name>'` — connection name and model name joined by `:` |
74
+ | `text` | A string expression — the text to embed |
75
+
76
+ Returns: `ARRAY<FLOAT>` — the embedding vector.
77
+
78
+ ### Examples
79
+
80
+ ```sql
81
+ -- Generate embeddings
82
+ SELECT id, AI_EMBEDDING('conn_bailian:text-embedding-v3', content) AS vec
83
+ FROM documents;
84
+
85
+ -- Semantic search
86
+ SELECT id, content,
87
+ cosine_distance(vec, AI_EMBEDDING('conn_bailian:text-embedding-v3', 'query text')) AS dist
88
+ FROM doc_embeddings
89
+ ORDER BY dist
90
+ LIMIT 10;
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Source Format
96
+
97
+ The first argument is always `'<connection-name>:<model-name>'`:
98
+
99
+ | Part | Description |
100
+ |---|---|
101
+ | `<connection-name>` | Name of the API Connection created with `CREATE API CONNECTION` (or a platform Endpoint name) |
102
+ | `<model-name>` | Model identifier supported by the provider, e.g. `qwen3-plus`, `text-embedding-v3` |
103
+
104
+ Examples:
105
+ - `'conn_bailian:qwen3-plus'` — use connection `conn_bailian` with model `qwen3-plus`
106
+ - `'conn_bailian:text-embedding-v3'` — use connection `conn_bailian` with embedding model `text-embedding-v3`
@@ -1,179 +1,180 @@
1
1
  ---
2
2
  name: clickzetta-batch-sync-pipeline
3
3
  description: |
4
- 创建和管理 ClickZetta Lakehouse 离线同步(批量同步)任务,支持单表离线同步和多表离线同步两种模式。
5
- 单表模式适合简单的源→目标表同步;多表模式支持整库镜像、多表镜像、多表合并三种同步方式。
6
- 当用户说"离线同步"、"批量同步"、"batch sync"、"数据库同步到 Lakehouse"、"整库迁移"、
7
- "多表同步"、"定期同步"、"周期性数据同步"、"分库分表合并"、"离线数据迁移"时触发。
8
- 包含单表/多表离线同步任务创建、数据源配置、字段映射、同步规则、调度部署、任务运维等
9
- ClickZetta Studio 特有逻辑。
10
- Keywords: batch sync, offline sync, full load, mirror, multi-table sync
4
+ Create and manage ClickZetta Lakehouse batch sync tasks, supporting both single-table and multi-table modes.
5
+ Single-table mode is suitable for simple source-to-target table sync; multi-table mode supports full database mirror,
6
+ multi-table mirror, and sharded table merge.
7
+ Triggered when the user says "batch sync", "offline sync", "sync database to Lakehouse", "full database migration",
8
+ "multi-table sync", "periodic sync", "scheduled data sync", "sharded table merge", "offline data migration".
9
+ Covers single-table/multi-table batch sync task creation, data source configuration, column mapping,
10
+ sync rules, scheduling, deployment, and task operations — all ClickZetta Studio-specific logic.
11
+ Keywords: batch sync, offline sync, full load, mirror, multi-table sync, scheduled sync
11
12
  ---
12
13
 
13
- # 离线同步(批量同步)Pipeline 工作流
14
+ # Batch Sync Pipeline Workflow
14
15
 
15
- ## 向导:收集必要信息
16
+ ## Wizard: Collect Required Information
16
17
 
17
- 开始创建同步任务前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
18
+ Before creating a sync task, use an interactive question tool (e.g., `question`) to collect the following information via option menus. If no such tool is available, list all questions in text at once:
18
19
 
19
20
  ```
20
21
  question({
21
22
  questions: [
22
23
  {
23
- question: "数据源类型和名称是什么?",
24
+ question: "What is the data source type and name?",
24
25
  options: [
25
- { label: "MySQL", description: "如 aliyun_mysql、rds_mysql" },
26
- { label: "PostgreSQL", description: "如 pg_prod、aurora_pg" },
27
- { label: "SQL Server", description: "如 sqlserver_prod" },
28
- { label: "OSS/S3/COS 对象存储", description: "如 oss_bucket、s3_data" }
26
+ { label: "MySQL", description: "e.g., aliyun_mysql, rds_mysql" },
27
+ { label: "PostgreSQL", description: "e.g., pg_prod, aurora_pg" },
28
+ { label: "SQL Server", description: "e.g., sqlserver_prod" },
29
+ { label: "OSS/S3/COS Object Storage", description: "e.g., oss_bucket, s3_data" }
29
30
  ]
30
31
  },
31
32
  {
32
- question: "同步范围是什么?",
33
+ question: "What is the sync scope?",
33
34
  options: [
34
- { label: "单表同步", description: "指定一张源表同步到目标表" },
35
- { label: "多表镜像", description: "整库或指定多张表批量同步" },
36
- { label: "分库分表合并", description: "多张源表合并到一张目标表" }
35
+ { label: "Single-table sync", description: "Sync one source table to one target table" },
36
+ { label: "Multi-table mirror", description: "Sync entire database or multiple selected tables" },
37
+ { label: "Sharded table merge", description: "Merge multiple source tables into one target table" }
37
38
  ]
38
39
  },
39
40
  {
40
- question: "写入模式?",
41
+ question: "Write mode?",
41
42
  options: [
42
- { label: "全量覆盖(OVERWRITE)", description: "每次全量覆盖目标表,推荐" },
43
- { label: "增量追加(APPEND)", description: "追加新数据,不删除历史" }
43
+ { label: "Full overwrite (OVERWRITE)", description: "Overwrite target table each run, recommended" },
44
+ { label: "Incremental append (APPEND)", description: "Append new data without deleting history" }
44
45
  ]
45
46
  }
46
47
  ]
47
48
  })
48
49
  ```
49
50
 
50
- 收集到信息后,还需确认:目标 schema(如 `ods`)、调度时间(如每天 02:00)。这两项可以在用户回答后直接询问,或从上下文推断。
51
+ After collecting the above, also confirm: target schema (e.g., `ods`) and schedule time (e.g., daily at 02:00). These can be asked after the user responds, or inferred from context.
51
52
 
52
- **如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
53
+ **If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
53
54
 
54
55
  ---
55
56
 
56
- ## 前置依赖
57
+ ## Prerequisites
57
58
 
58
- - ClickZetta Lakehouse Studio 账户,具备创建同步任务、目标表的权限
59
- - 源端数据源已在 Studio 中配置(具备 SELECT 权限)
60
- - 目标端 Lakehouse 数据源可用(具备 CREATE、INSERT 权限)
61
- - 已安装 cz-cli 并完成 profile 配置(`cz-cli profile status` 验证连接)
59
+ - ClickZetta Lakehouse Studio account with permissions to create sync tasks and target tables
60
+ - Source data source already configured in Studio (with SELECT permission)
61
+ - Target Lakehouse data source available (with CREATE and INSERT permissions)
62
+ - cz-cli installed and profile configured (verify with `cz-cli profile status`)
62
63
 
63
64
  ---
64
65
 
65
- ## 适用场景
66
+ ## Applicable Scenarios
66
67
 
67
- - 将外部数据库(MySQL / PostgreSQL / SQL Server 等)的数据定期同步到 Lakehouse
68
- - 单表离线同步:简单的源表 → 目标表周期性同步
69
- - 多表离线同步:整库迁移、多表批量同步、分库分表合并
70
- - 数据时效性要求不高,按天/小时等周期批量更新
68
+ - Periodically sync data from external databases (MySQL / PostgreSQL / SQL Server, etc.) to Lakehouse
69
+ - Single-table batch sync: simple source table → target table periodic sync
70
+ - Multi-table batch sync: full database migration, multi-table batch sync, sharded table merge
71
+ - Low data freshness requirements — batch updates on daily/hourly schedules
71
72
 
72
73
  ---
73
74
 
74
- ## 模式选择
75
+ ## Mode Selection
75
76
 
76
- | 维度 | 单表离线同步 | 多表离线同步 |
77
- |------|------------|------------|
78
- | 任务类型 ID | `1`(DI/INTEGRATION) | `291`(MULTI_DI) |
79
- | 同步粒度 | 单张源表 → 单张目标表 | 整库 / 多表 → 多张目标表 |
80
- | 适用场景 | 简单同步、精细控制单表 | 整库迁移、批量同步、分库分表合并 |
81
- | Schema Evolution | 不支持 | 支持(新增字段自动适配) |
82
- | 自动建表 | 需手动创建或快速创建 | 目标表不存在时自动创建 |
83
- | 写入模式 | 由数据源决定 | overwrite / upsert 可选 |
77
+ | Dimension | Single-table Batch Sync | Multi-table Batch Sync |
78
+ |-----------|------------------------|----------------------|
79
+ | Task Type ID | `1` (DI/INTEGRATION) | `291` (MULTI_DI) |
80
+ | Sync Granularity | One source table → one target table | Full database / multiple tables → multiple target tables |
81
+ | Use Case | Simple sync, fine-grained single-table control | Full database migration, batch sync, sharded table merge |
82
+ | Schema Evolution | Not supported | Supported (new columns auto-adapted) |
83
+ | Auto Table Creation | Manual or quick-create required | Auto-creates the target table if it does not exist |
84
+ | Write Mode | Determined by data source | overwrite / upsert selectable |
84
85
 
85
- > **重要**:这两种任务类型均为 UI_ONLY 类型,脚本内容必须在 Studio Web UI 中配置。
86
- > cz-cli 负责任务创建、调度配置、发布和运维;数据源选择、字段映射等内容配置在 Studio UI 完成。
86
+ > **Important**: Both task types are UI_ONLY types — task content must be configured in the Studio Web UI.
87
+ > cz-cli handles task creation, scheduling, deployment, and operations; data source selection and column mapping are configured in Studio UI.
87
88
 
88
89
  ---
89
90
 
90
- ## 工作流
91
+ ## Workflow
91
92
 
92
- > **重要**:离线同步任务的**内容配置**(来源表选择、字段映射、同步规则等)必须在 Studio Web UI 中完成。
93
- > cz-cli 负责任务创建、调度配置、发布和运维;数据源选择、字段映射等内容配置在 Studio UI 完成。
93
+ > **Important**: Batch sync task **content configuration** (source table selection, column mapping, sync rules, etc.) must be completed in the Studio Web UI.
94
+ > cz-cli handles task creation, scheduling, deployment, and operations; data source selection and column mapping are configured in Studio UI.
94
95
 
95
- ### 步骤 1:用 cz-cli 创建任务
96
+ ### Step 1: Create Task with cz-cli
96
97
 
97
98
  ```bash
98
- # 单表离线同步(task_type=1,即 DI/INTEGRATION)
99
+ # Single-table batch sync (task_type=1, i.e., DI/INTEGRATION)
99
100
  cz-cli task create "sync_orders_daily" --type DI --folder <folder_name>
100
101
 
101
- # 多表离线同步(task_type=291,即 MULTI_DI)
102
+ # Multi-table batch sync (task_type=291, i.e., MULTI_DI)
102
103
  cz-cli task create "sync_ecommerce_db" --type MULTI_DI --folder <folder_name>
103
104
  ```
104
105
 
105
- 命令返回 `task_id` 和 `studio_url`,在 `studio_url` 中完成数据源配置。
106
+ The command returns `task_id` and `studio_url`. Complete data source configuration at the `studio_url`.
106
107
 
107
- ### 步骤 2:在 Studio UI 中配置同步内容
108
+ ### Step 2: Configure Sync Content in Studio UI
108
109
 
109
- 打开步骤 1 返回的 `studio_url`,在 Studio 中完成:
110
+ Open the `studio_url` returned in Step 1 and complete the following in Studio:
110
111
 
111
- **来源数据配置**
112
- - 选择源端数据源类型和连接(支持的数据源类型以 Studio UI 中显示为准,可通过 `cz-cli datasource list` 查看已配置的数据源)
113
- - 单表:指定 schema 和表名
114
- - 多表:选择整库 / 勾选多表 / 配置合并规则
112
+ **Source Data Configuration**
113
+ - Select source data source type and connection (supported types are shown in Studio UI; use `cz-cli datasource list` to view configured data sources)
114
+ - Single-table: specify schema and table name
115
+ - Multi-table: select full database / check multiple tables / configure merge rules
115
116
 
116
- **目标设置**
117
- - 选择目标 Lakehouse 数据源和 workspace
118
- - 配置目标 schema 和表名
119
- - 多表模式可配置命名规则(支持 `{SOURCE_DATABASE}`、`{SOURCE_TABLE}` 变量)
117
+ **Target Settings**
118
+ - Select target Lakehouse data source and workspace
119
+ - Configure target schema and table name
120
+ - Multi-table mode supports naming rules (with `{SOURCE_DATABASE}`, `{SOURCE_TABLE}` variables)
120
121
 
121
- **同步规则(多表模式)**
122
- - Schema Evolution:源端新增字段自动适配;删除字段写入 Null
123
- - 写入模式:非主键表 → overwrite;主键表 → overwrite 或 upsert
122
+ **Sync Rules (Multi-table mode)**
123
+ - Schema Evolution: new columns from source are auto-adapted; deleted columns are written as Null
124
+ - Write Mode: non-primary-key tables → overwrite; primary-key tables → overwrite or upsert
124
125
 
125
- ### 步骤 3:在 Studio UI 中调试运行
126
+ ### Step 3: Debug Run in Studio UI
126
127
 
127
- 点击「运行」按钮进行调试,验证数据源连接和配置是否正确。
128
+ Click the "Run" button to debug and verify data source connectivity and configuration.
128
129
 
129
- ### 步骤 4:用 cz-cli 配置调度和发布
130
+ ### Step 4: Configure Scheduling and Deploy with cz-cli
130
131
 
131
132
  ```bash
132
- # 配置调度(具体参数见 --help)
133
+ # Configure schedule (see --help for parameters)
133
134
  cz-cli task save-cron <task_name> --help
134
135
 
135
- # 发布任务
136
+ # Deploy task
136
137
  cz-cli task deploy <task_name> -y
137
138
  ```
138
139
 
139
- > 离线同步任务(task_type=1 和 291)必须使用 Sync VCluster,不能使用通用型或分析型 VCluster。
140
+ > Batch sync tasks (task_type=1 and 291) must use a Sync VCluster — general-purpose or analytics VClusters are not supported.
140
141
 
141
- ### 步骤 5:验证与监控
142
+ ### Step 5: Verify and Monitor
142
143
 
143
144
  ```bash
144
- cz-cli runs list --task <task_name> # 查看运行记录
145
- cz-cli runs detail <run_id> # 查看运行详情
146
- cz-cli attempts log <run_id> # 查看执行日志
147
- cz-cli runs refill <task_name> --help # 补数据(--help 查看参数)
145
+ cz-cli runs list --task <task_name> # View run history
146
+ cz-cli runs detail <run_id> # View run details
147
+ cz-cli attempts log <run_id> # View execution logs
148
+ cz-cli runs refill <task_name> --help # Backfill data (--help for parameters)
148
149
  ```
149
150
 
150
151
  ---
151
152
 
152
- ## 任务运维操作
153
+ ## Task Operations
153
154
 
154
- | 操作 | cz-cli 命令 | 说明 |
155
- |------|------------|------|
156
- | 下线 | `cz-cli task undeploy <task> -y` | 停止任务并从调度系统移除(不可逆) |
157
- | 补数据 | `cz-cli runs refill <task> --from D --to D -y` | 对历史周期进行数据补录 |
158
- | 查看依赖 | `cz-cli runs deps <task>` | 查看已发布的上下游依赖 |
159
- | 查看运行 | `cz-cli runs list --task <task>` | 查看运行实例列表 |
155
+ | Operation | cz-cli Command | Description |
156
+ |-----------|---------------|-------------|
157
+ | Undeploy | `cz-cli task undeploy <task> -y` | Stop task and remove from scheduler (irreversible) |
158
+ | Backfill | `cz-cli runs refill <task> --from D --to D -y` | Backfill data for historical periods |
159
+ | View Dependencies | `cz-cli runs deps <task>` | View published upstream/downstream dependencies |
160
+ | View Runs | `cz-cli runs list --task <task>` | View run instance list |
160
161
 
161
- 多表离线同步任务在 Studio「任务运维」→「周期任务」中管理,可查看:
162
- - 任务实例 Tab:每张表的读取/写入行数和同步速率
163
- - 同步对象 Tab:所有源表和目标表的映射关系
162
+ Multi-table batch sync tasks are managed in Studio under "Task Operations" → "Scheduled Tasks", where you can view:
163
+ - Task Instance Tab: read/write row counts and sync rate per table
164
+ - Sync Objects Tab: mapping of all source tables to target tables
164
165
 
165
166
  ---
166
167
 
167
- ## 交付验收 Checklist
168
+ ## Delivery Acceptance Checklist
168
169
 
169
- 同步任务发布运行后,**必须逐项验证**:
170
+ After the sync task is deployed and running, **verify each item**:
170
171
 
171
172
  ```sql
172
- -- 1. 行数比对:目标表行数与源端一致
173
+ -- 1. Row count comparison: target table row count matches source
173
174
  SELECT COUNT(*) FROM <ods_schema>.<table>;
174
- -- 与源端 MySQL/PG 执行 SELECT COUNT(*) FROM <table> 对比
175
+ -- Compare with source MySQL/PG: SELECT COUNT(*) FROM <table>
175
176
 
176
- -- 2. 关键字段非空率
177
+ -- 2. Key field non-null rate
177
178
  SELECT
178
179
  COUNT(*) AS total,
179
180
  COUNT(key_field) AS non_null,
@@ -181,47 +182,46 @@ SELECT
181
182
  FROM <ods_schema>.<table>;
182
183
  ```
183
184
 
184
- **验收标准:**
185
- - [ ] 目标表行数与源端一致
186
- - [ ] 关键字段非空率符合预期
187
- - [ ] 同步任务最近运行状态为 SUCCESS
188
- - [ ] 字段类型映射正确(重点检查 BIT/ENUM/TEXT 等异构类型)
189
- - [ ] 调度 Cron 配置正确,下次执行时间符合预期
185
+ **Acceptance Criteria:**
186
+ - [ ] Target table row count matches source
187
+ - [ ] Key field non-null rate meets expectations
188
+ - [ ] Latest task run status is SUCCESS
189
+ - [ ] Column type mapping is correct (pay attention to BIT/ENUM/TEXT and other heterogeneous types)
190
+ - [ ] Cron schedule is configured correctly; next execution time is as expected
190
191
 
191
192
  ---
192
193
 
193
- ## 故障排除
194
+ ## Troubleshooting
194
195
 
195
- | 问题 | 排查方向 |
196
- |------|---------|
197
- | 任务创建失败 | 确认账号有创建任务权限;检查文件夹 ID 是否存在 |
198
- | 源端连接失败 | 检查数据源配置中的连接信息、网络可达性、账号权限 |
199
- | 字段映射失败 | 检查源表和目标表的字段类型兼容性 |
200
- | 同步速度慢 | 调整并发数(最大 10)和同步速率;检查源端数据库负载 |
201
- | Schema Evolution 失败 | 不支持修改主键字段;字段类型仅支持同类型扩展(int8→int16→int32→int64) |
202
- | 多表同步部分表失败 | 在实例详情的「同步对象」Tab 查看各表状态;可对失败表单独重跑 |
203
- | upsert 模式数据不一致 | 确认目标表有正确的主键定义;检查源端数据是否有主键冲突 |
204
- | VCluster 类型错误 | 离线同步必须使用 Sync VCluster,通过 `SHOW VCLUSTERS` 确认类型 |
196
+ | Issue | Investigation |
197
+ |-------|--------------|
198
+ | Task creation failed | Verify account has task creation permissions; check if folder ID exists |
199
+ | Source connection failed | Check data source connection info, network reachability, account permissions |
200
+ | Column mapping failed | Check column type compatibility between source and target tables |
201
+ | Slow sync speed | Adjust concurrency (max 10) and sync rate; check source database load |
202
+ | Schema Evolution failed | Primary key column changes not supported; type changes only support same-type widening (int8→int16→int32→int64) |
203
+ | Partial table failure in multi-table sync | Check per-table status in instance detail "Sync Objects" tab; failed tables can be re-run individually |
204
+ | Data inconsistency in upsert mode | Verify target table has correct primary key definition; check for primary key conflicts in source data |
205
+ | Wrong VCluster type | Batch sync must use Sync VCluster — verify type with `SHOW VCLUSTERS` |
205
206
 
206
207
  ---
207
208
 
208
- ## 注意事项
209
+ ## Notes
209
210
 
210
- **权限要求**
211
- - 源端:数据源配置的账号需具备 SELECT 权限
212
- - 目标端:任务负责人需具备 CREATE、INSERT 权限
211
+ **Permission Requirements**
212
+ - Source: the account configured in the data source must have SELECT permission
213
+ - Target: the task owner must have CREATE and INSERT permissions
213
214
 
214
- **性能考虑**
215
- - 合理配置并发度,避免对源端数据库造成过大压力
216
- - 首次执行需初始化所有同步对象,可能耗时较长
217
- - 选择源端数据库压力较小的时间窗口执行调度
215
+ **Performance Considerations**
216
+ - Configure concurrency appropriately to avoid excessive load on the source database
217
+ - First execution initializes all sync objects and may take longer
218
+ - Schedule execution during low-load windows on the source database
218
219
 
219
- **Schema Evolution 限制(多表离线同步)**
220
- - 不支持修改主键字段(Lakehouse 主键表限制)
221
- - 字段类型修改仅支持同类型扩展(int8 → int16 → int32 → int64)
222
- - 不支持跨类型转换(如 int → double)
223
-
224
- **支持的数据源**
225
- - 来源端:关系型数据库(MySQL、PostgreSQL、SQL Server 等)及其云变体(Aurora、PolarDB 等),具体支持列表以 Studio UI 中显示为准,可通过 `cz-cli datasource list` 查看已配置的数据源
226
- - 目标端:Lakehouse
220
+ **Schema Evolution Limitations (Multi-table Batch Sync)**
221
+ - Primary key column changes are not supported (Lakehouse primary key table limitation)
222
+ - Column type changes only support same-type widening (int8 → int16 → int32 → int64)
223
+ - Cross-type conversions are not supported (e.g., int → double)
227
224
 
225
+ **Supported Data Sources**
226
+ - Source: relational databases (MySQL, PostgreSQL, SQL Server, etc.) and their cloud variants (Aurora, PolarDB, etc.). The specific supported list is shown in Studio UI; use `cz-cli datasource list` to view configured data sources
227
+ - Target: Lakehouse
@@ -1,5 +1,5 @@
1
- {"case_id":"001","type":"should_call","user_input":"怎么创建离线同步任务把 MySQL 表定期同步到 Lakehouse","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["离线同步","Cron","调度"]}
2
- {"case_id":"002","type":"should_call","user_input":"多表离线同步支持自动建表吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["自动创建","多表"]}
3
- {"case_id":"003","type":"should_call","user_input":"离线同步的单表模式和多表模式怎么选?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["单表","多表","task_type"]}
4
- {"case_id":"004","type":"should_call","user_input":"批量同步支持 Schema Evolution 吗?新增字段会自动适配吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Schema Evolution","多表"]}
5
- {"case_id":"005","type":"should_call","user_input":"怎么配置离线同步任务的调度周期?用 Cron 表达式吗?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Cron","调度"]}
1
+ {"case_id":"001","type":"should_call","user_input":"How do I create a batch sync task to periodically sync a MySQL table to Lakehouse?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["batch sync","Cron","schedule"]}
2
+ {"case_id":"002","type":"should_call","user_input":"Does multi-table batch sync support auto table creation?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["auto-create","multi-table"]}
3
+ {"case_id":"003","type":"should_call","user_input":"How do I choose between single-table and multi-table batch sync modes?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["single-table","multi-table","task_type"]}
4
+ {"case_id":"004","type":"should_call","user_input":"Does batch sync support Schema Evolution? Will new columns be auto-adapted?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Schema Evolution","multi-table"]}
5
+ {"case_id":"005","type":"should_call","user_input":"How do I configure the schedule for a batch sync task? Does it use Cron expressions?","expected_skill":"clickzetta-batch-sync-pipeline","expected_output_contains":["Cron","schedule"]}