@clickzetta/cz-cli-darwin-x64 0.3.91 → 0.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,323 +1,324 @@
1
1
  ---
2
2
  name: clickzetta-realtime-sync-pipeline
3
3
  description: |
4
- 创建和管理 ClickZetta Lakehouse 实时同步任务(单表),将外部数据源的数据实时同步到 Lakehouse。
5
- 支持 Kafka、MySQL、PostgreSQL 等数据源作为来源端,Lakehouse 作为目标端。
6
- 实时同步任务为持续运行的流式任务,无需配置调度策略,提交后即持续运行。
7
- 当用户说"Studio 实时同步"、"realtime sync"、"单表 CDC 同步"、"实时数据同步"、"Kafka 实时同步到 Lakehouse"、
8
- "MySQL 单表实时同步""单表实时同步""实时数据迁移"时触发。
9
- 包含实时同步任务创建、数据源配置、字段映射(含 JSONPath 计算列)、部署运维等
10
- ClickZetta Studio 特有逻辑。
11
- Keywords: real-time sync, single table, Kafka source, MySQL source, streaming
4
+ Create and manage ClickZetta Lakehouse real-time sync tasks (single-table), syncing data from external sources
5
+ to Lakehouse in real time.
6
+ Supports Kafka, MySQL, PostgreSQL, and other data sources as the source, with Lakehouse as the target.
7
+ Real-time sync tasks are continuously running streaming tasks — no scheduling required; they start running upon submission.
8
+ Triggered when the user says "Studio real-time sync", "realtime sync", "single-table CDC sync",
9
+ "real-time data sync", "Kafka real-time sync to Lakehouse", "MySQL single-table real-time sync",
10
+ "single-table real-time sync", "real-time data migration".
11
+ Covers real-time sync task creation, data source configuration, column mapping (including JSONPath computed columns),
12
+ deployment, and operations — all ClickZetta Studio specific logic.
13
+ Keywords: real-time sync, single table, Kafka source, MySQL source, streaming, CDC
12
14
  ---
13
15
 
14
- # 实时同步(单表)Pipeline 工作流
16
+ # Single-table Real-time Sync Pipeline Workflow
15
17
 
16
- ## 向导:收集必要信息
18
+ ## Wizard: Collect Required Information
17
19
 
18
- 开始创建实时同步任务前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
20
+ Before creating a real-time sync task, use an interactive question tool (e.g., `question`) to collect the following information via option menus. If no such tool is available, list all questions in text at once:
19
21
 
20
22
  ```
21
23
  question({
22
24
  questions: [
23
25
  {
24
- question: "数据源类型?",
26
+ question: "Data source type?",
25
27
  options: [
26
- { label: "Kafka", description: "Kafka Topic 实时接入,支持 JSON 消息解析" },
27
- { label: "MySQL / Aurora MySQL", description: "单表 CDC 实时同步" },
28
- { label: "PostgreSQL / Aurora PG", description: "单表 CDC 实时同步" },
29
- { label: "SQL Server", description: "单表 CDC 实时同步" }
28
+ { label: "Kafka", description: "Kafka Topic real-time ingestion, supports JSON message parsing" },
29
+ { label: "MySQL / Aurora MySQL", description: "Single-table CDC real-time sync" },
30
+ { label: "PostgreSQL / Aurora PG", description: "Single-table CDC real-time sync" },
31
+ { label: "SQL Server", description: "Single-table CDC real-time sync" }
30
32
  ]
31
33
  },
32
34
  {
33
- question: "同步粒度?",
35
+ question: "Sync granularity?",
34
36
  options: [
35
- { label: "单表/单 Topic", description: "本 skill 支持,精细化配置" },
36
- { label: "整库/多表", description: "建议改用 clickzetta-cdc-sync-pipeline" }
37
+ { label: "Single table/topic", description: "Supported by this skill, fine-grained configuration" },
38
+ { label: "Full database/multi-table", description: "Use clickzetta-cdc-sync-pipeline instead" }
37
39
  ]
38
40
  }
39
41
  ]
40
42
  })
41
43
  ```
42
44
 
43
- **如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
45
+ **If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
44
46
 
45
47
  ---
46
48
 
47
- ## 适用场景
49
+ ## Applicable Scenarios
48
50
 
49
- - 将外部数据源的数据实时同步到 Lakehouse(低延迟、持续运行)
50
- - Kafka Topic → Lakehouse 表(支持 JSON 消息解析)
51
- - MySQL / PostgreSQL / SQL Server 等数据库 → Lakehouse 表(CDC 变更捕获)
52
- - 数据时效性要求高,需要秒级或分钟级延迟
53
- - 单张源表/Topic 到单张目标表的实时同步
54
- - 关键词:实时同步、CDC、流式同步、realtime sync、Kafka 实时同步
51
+ - Sync data from external sources to Lakehouse in real time (low latency, continuously running)
52
+ - Kafka Topic → Lakehouse table (supports JSON message parsing)
53
+ - MySQL / PostgreSQL / SQL Server databases → Lakehouse table (CDC change capture)
54
+ - High data freshness requirements — second-level or minute-level latency
55
+ - Single source table/topic to single target table real-time sync
56
+ - Keywords: real-time sync, CDC, streaming sync, Kafka real-time sync
55
57
 
56
- ## 与其他同步方式的区别
58
+ ## Comparison with Other Sync Methods
57
59
 
58
- | 维度 | 实时同步(本 Skill) | 离线同步 | 多表实时同步 |
59
- |------|---------------------|---------|------------|
60
- | 任务类型 ID | `14`(REALTIME/CDC) | `10` / `291` | `281` |
61
- | 同步粒度 | 单表/单 Topic | 单表/多表 | 整库/多表 |
62
- | 运行模式 | 持续运行(流式) | 周期调度(批量) | 持续运行(流式) |
63
- | 调度策略 | 无需配置,提交即运行 | 需配置 Cron 表达式 | 无需配置,提交即运行 |
64
- | 延迟 | 秒级~分钟级 | 取决于调度周期 | 秒级~分钟级 |
65
- | 适用 Skill | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` | `clickzetta-cdc-sync-pipeline` |
60
+ | Dimension | Real-time Sync (This Skill) | Batch Sync | Multi-table Real-time Sync |
61
+ |-----------|---------------------------|------------|--------------------------|
62
+ | Task Type ID | `14` (REALTIME/CDC) | `10` / `291` | `281` |
63
+ | Sync Granularity | Single table/topic | Single/multi-table | Full database/multi-table |
64
+ | Run Mode | Continuously running (streaming) | Scheduled (batch) | Continuously running (streaming) |
65
+ | Scheduling | Not required, runs upon submission | Cron expression required | Not required, runs upon submission |
66
+ | Latency | Seconds to minutes | Depends on schedule interval | Seconds to minutes |
67
+ | Applicable Skill | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` | `clickzetta-cdc-sync-pipeline` |
66
68
 
67
- ## 前置依赖
69
+ ## Prerequisites
68
70
 
69
- - ClickZetta Lakehouse Studio 账户,具备创建同步任务、目标表的权限
70
- - 源端数据源已在 Studio 中配置(Kafka / MySQL / PostgreSQL / SQL Server 等)
71
- - 目标端 Lakehouse 数据源可用
72
- - Sync VCluster 可用(实时同步任务 task_type=14 需要 Sync VCluster)
73
- - **执行环境(满足其一即可,优先使用 cz-cli)**:
74
- - **cz-cli 路径**:已安装 cz-cli(`brew install cz-cli`,或参考官方文档安装),并完成 `cz-cli setup` 配置
75
- - **MCP 路径**:clickzetta-studio-mcp 工具可用(`create_task`、`save_integration_task`、`publish_task`、`list_data_sources`、`LH_show_object_list` 等)
71
+ - ClickZetta Lakehouse Studio account with permissions to create sync tasks and target tables
72
+ - Source data source already configured in Studio (Kafka / MySQL / PostgreSQL / SQL Server, etc.)
73
+ - Target Lakehouse data source available
74
+ - Sync VCluster available (real-time sync task_type=14 requires a Sync VCluster)
75
+ - **Execution environment (one of the following, cz-cli preferred)**:
76
+ - **cz-cli path**: cz-cli installed (`brew install cz-cli`, or refer to the official docs) and `cz-cli setup` completed
77
+ - **MCP path**: clickzetta-studio-mcp tools available (`create_task`, `save_integration_task`, `publish_task`, `list_data_sources`, `LH_show_object_list`, etc.)
76
78
 
77
- ## 环境探测(执行前必读)
79
+ ## Environment Detection (Read Before Execution)
78
80
 
79
- 在开始任何操作前,先判断当前执行环境:
81
+ Before starting any operation, determine the current execution environment:
80
82
 
81
- **第一步:检测 cz-cli 是否可用**
83
+ **Step 1: Check if cz-cli is available**
82
84
  ```bash
83
85
  cz-cli --version
84
86
  ```
85
- - 若命令存在 → **走 cz-cli 路径**(见本文档末尾"cz-cli 替代路径"章节)
86
- - 若命令不存在 → 继续检测 MCP
87
+ - If command exists → **use cz-cli path** (see "cz-cli Alternative Path" section at the end of this document)
88
+ - If command not found → continue to check MCP
87
89
 
88
- **第二步:检测 MCP 是否可用(仅在 cz-cli 不可用时)**
90
+ **Step 2: Check if MCP is available (only when cz-cli is unavailable)**
89
91
 
90
- 尝试调用 `list_data_sources` 工具查询数据源列表。
91
- - 若工具存在于 tool list → **走 MCP 路径**(本文档默认路径)
92
- - 若工具不存在 → 停止执行,提示用户:
93
- > "当前环境既无 cz-cli 也无 MCP 工具,请安装其中之一后重试。
94
- > cz-cli 安装:`brew install cz-cli`(或参考官方文档安装),然后运行 `cz-cli setup`
95
- > MCP 安装:参考 clickzetta-studio-mcp 配置文档"
92
+ Try calling the `list_data_sources` tool to query the data source list.
93
+ - If tool exists in tool list → **use MCP path** (default path in this document)
94
+ - If tool not found → stop execution and prompt the user:
95
+ > "Neither cz-cli nor MCP tools are available in the current environment. Please install one of them before retrying.
96
+ > cz-cli installation: `brew install cz-cli` (or refer to the official docs), then run `cz-cli setup`
97
+ > MCP installation: refer to clickzetta-studio-mcp configuration docs"
96
98
 
97
- ## 工作流
99
+ ## Workflow
98
100
 
99
- ### 步骤 1:确认 Sync VCluster 可用
101
+ ### Step 1: Confirm Sync VCluster Availability
100
102
 
101
103
  ```
102
- 使用 LH_show_object_list(object_type='VCLUSTERS')查看可用虚拟集群。
103
- 筛选 vcluster_type 包含 SYNC 的集群。
104
- 如无可用 Sync VCluster,需先创建后再继续。
104
+ Use LH_show_object_list (object_type='VCLUSTERS') to view available virtual clusters.
105
+ Filter for clusters where vcluster_type contains SYNC.
106
+ If no Sync VCluster is available, create one before proceeding.
105
107
  ```
106
108
 
107
- ### 步骤 2:查找可用数据源
109
+ ### Step 2: Find Available Data Sources
108
110
 
109
111
  ```
110
- 使用 list_data_sources 查看已配置的数据源列表。
111
- 按类型过滤:
112
+ Use list_data_sources to view the list of configured data sources.
113
+ Filter by type:
112
114
  - Kafka: ds_type=2
113
115
  - MySQL: ds_type=5
114
116
  - PostgreSQL: ds_type=7
115
117
  - SQL Server: ds_type=8
116
- 记录源端 datasource_name 和目标端 Lakehouse datasource_name
118
+ Record the source datasource_name and target Lakehouse datasource_name.
117
119
  ```
118
120
 
119
- ### 步骤 3:探查源端数据结构(可选)
121
+ ### Step 3: Explore Source Data Structure (Optional)
120
122
 
121
123
  ```
122
- 使用 list_namespaces 查看源端数据源的命名空间(数据库/Schema)。
123
- 使用 list_metadata_objects 查看命名空间下的表/Topic 列表。
124
- 使用 get_metadata_detail 查看具体表/Topic 的字段结构。
124
+ Use list_namespaces to view the source data source's namespaces (databases/schemas).
125
+ Use list_metadata_objects to view tables/topics under a namespace.
126
+ Use get_metadata_detail to view the column structure of a specific table/topic.
125
127
  ```
126
128
 
127
- ### 步骤 4:创建实时同步任务
129
+ ### Step 4: Create Real-time Sync Task
128
130
 
129
131
  ```
130
- 使用 create_task 创建任务:
131
- - task_type: 14(实时同步)
132
- - task_name: 自定义任务名称(建议包含源和目标信息,如 "rt_sync_kafka_orders")
133
- - data_folder_id: 目标文件夹 ID(可通过 list_folders 获取)
132
+ Use create_task to create the task:
133
+ - task_type: 14 (real-time sync)
134
+ - task_name: custom task name (recommend including source and target info, e.g., "rt_sync_kafka_orders")
135
+ - data_folder_id: target folder ID (obtainable via list_folders)
134
136
 
135
- 记录返回的 task_id 和 studio_url。
137
+ Record the returned task_id and studio_url.
136
138
  ```
137
139
 
138
- ### 步骤 5:配置同步内容
140
+ ### Step 5: Configure Sync Content
139
141
 
140
142
  ```
141
- 使用 save_integration_task 配置同步:
142
- - task_id: 步骤 4 返回的任务 ID
143
- - source_datasource_name: 源端数据源名称
144
- - source_schema: 源端数据库/Schema(Kafka 场景为 Topic 所在命名空间)
145
- - source_table: 源端表名或 Kafka Topic 名称
146
- - source_ds_type: 源端类型(2=Kafka, 5=MySQL, 7=PostgreSQL, 8=SQL Server)
147
- - sink_datasource_name: 目标 Lakehouse 数据源名称
148
- - sink_schema: 目标 Schema(默认 public)
149
- - sink_table: 目标表名(可选,默认与源表同名)
150
- - sink_ds_type: 1(Lakehouse)
143
+ Use save_integration_task to configure sync:
144
+ - task_id: task ID returned in Step 4
145
+ - source_datasource_name: source data source name
146
+ - source_schema: source database/schema (for Kafka, the namespace containing the topic)
147
+ - source_table: source table name or Kafka topic name
148
+ - source_ds_type: source type (2=Kafka, 5=MySQL, 7=PostgreSQL, 8=SQL Server)
149
+ - sink_datasource_name: target Lakehouse data source name
150
+ - sink_schema: target schema (default: public)
151
+ - sink_table: target table name (optional, defaults to same as source table)
152
+ - sink_ds_type: 1 (Lakehouse)
151
153
  ```
152
154
 
153
- > **说明**:系统会自动获取源端和目标端的元数据,生成字段映射。如目标表不存在,会自动创建。
155
+ > **Note**: The system automatically retrieves source and target metadata to generate column mappings. If the target table does not exist, it will be auto-created.
154
156
 
155
- ### 步骤 6:Kafka JSON 消息解析(Kafka 数据源专用)
157
+ ### Step 6: Kafka JSON Message Parsing (Kafka Source Only)
156
158
 
157
- 如果 Kafka Topic 的消息格式为 JSON,可在 Studio UI 中通过新增计算列解析嵌套字段:
159
+ If the Kafka topic message format is JSON, you can add computed columns in Studio UI to parse nested fields:
158
160
 
159
- - 使用 JSONPath 规则解析 value 字段中的内容
160
- - 示例:`$.id` 提取顶层 id 字段,`$.data.code` 提取嵌套字段
161
- - 默认使用 Kafka Topic 内置字段(key、value、timestamp、partition、offset)进行映射
162
- - 计算列配置需在 Studio UI 中完成(通过 studio_url 打开)
161
+ - Use JSONPath rules to parse content from the value field
162
+ - Examples: `$.id` extracts the top-level id field, `$.data.code` extracts a nested field
163
+ - By default, Kafka topic built-in fields (key, value, timestamp, partition, offset) are used for mapping
164
+ - Computed column configuration must be done in Studio UI (open via studio_url)
163
165
 
164
- ### 步骤 7:提交部署
166
+ ### Step 7: Submit and Deploy
165
167
 
166
168
  ```
167
- 实时同步任务不需要配置调度策略(无需调用 save_task_configuration)。
168
- 直接使用 publish_task 提交任务:
169
- - task_id: 任务 ID
170
- - task_version: 当前版本号(通过 get_task_detail 获取)
169
+ Real-time sync tasks do not require scheduling configuration (no need to call save_task_configuration).
170
+ Use publish_task to submit the task directly:
171
+ - task_id: task ID
172
+ - task_version: current version number (obtainable via get_task_detail)
171
173
 
172
- 提交后任务即开始持续运行。
174
+ The task starts running continuously upon submission.
173
175
  ```
174
176
 
175
- > **重要**:实时同步任务不支持开发状态下的测试运行,提交即为正式部署。
177
+ > **Important**: Real-time sync tasks do not support test runs in development state — submission is production deployment.
176
178
 
177
- ### 步骤 8:运维监控
179
+ ### Step 8: Operations and Monitoring
178
180
 
179
181
  ```
180
- 提交后在运维中心管理实时同步任务:
182
+ After submission, manage real-time sync tasks in the Operations Center:
181
183
 
182
- 查看任务状态:get_task_detail
183
- 查看运行记录:list_task_run(注意实时任务为持续运行,不同于离线任务的周期实例)
184
+ View task status: get_task_detail
185
+ View run history: list_task_run (note: real-time tasks run continuously, unlike batch tasks with periodic instances)
184
186
 
185
- Studio UI 中可进行:
186
- - 启动/停止任务
187
- - 查看同步延迟和吞吐量
188
- - 查看错误日志
187
+ In Studio UI you can:
188
+ - Start/stop the task
189
+ - View sync latency and throughput
190
+ - View error logs
189
191
  ```
190
192
 
191
193
  ---
192
194
 
193
- ## 支持的数据源
195
+ ## Supported Data Sources
194
196
 
195
- ### 来源端
197
+ ### Source
196
198
 
197
- | 数据源 | ds_type | 说明 |
198
- |--------|---------|------|
199
- | Kafka | 2 | 支持 JSON 消息解析(JSONPath 计算列) |
200
- | MySQL | 5 | CDC 变更捕获 |
201
- | PostgreSQL | 7 | CDC 变更捕获 |
202
- | SQL Server | 8 | CDC 变更捕获 |
203
- | Aurora MySQL | 39 | CDC 变更捕获 |
204
- | Aurora PostgreSQL | 40 | CDC 变更捕获 |
205
- | PolarDB MySQL | 19 | CDC 变更捕获 |
206
- | PolarDB PostgreSQL | 48 | CDC 变更捕获 |
199
+ | Data Source | ds_type | Description |
200
+ |------------|---------|-------------|
201
+ | Kafka | 2 | Supports JSON message parsing (JSONPath computed columns) |
202
+ | MySQL | 5 | CDC change capture |
203
+ | PostgreSQL | 7 | CDC change capture |
204
+ | SQL Server | 8 | CDC change capture |
205
+ | Aurora MySQL | 39 | CDC change capture |
206
+ | Aurora PostgreSQL | 40 | CDC change capture |
207
+ | PolarDB MySQL | 19 | CDC change capture |
208
+ | PolarDB PostgreSQL | 48 | CDC change capture |
207
209
 
208
- ### 目标端
210
+ ### Target
209
211
 
210
- | 数据源 | ds_type |
211
- |--------|---------|
212
+ | Data Source | ds_type |
213
+ |------------|---------|
212
214
  | Lakehouse | 1 |
213
215
 
214
- ## 故障排除
216
+ ## Troubleshooting
215
217
 
216
- | 问题 | 排查方向 |
217
- |------|---------|
218
- | 任务创建失败 | 检查是否有可用的 Sync VCluster(`LH_show_object_list` 查看 VCLUSTERS,筛选 SYNC 类型) |
219
- | 源端连接失败 | 检查数据源配置中的连接信息、网络可达性、账号权限 |
220
- | Kafka 消费无数据 | 检查 Topic 名称是否正确、消费位点设置、Kafka 集群连通性 |
221
- | JSON 解析失败 | 检查 JSONPath 表达式是否正确、消息格式是否为合法 JSON |
222
- | 同步延迟增大 | 检查 Sync VCluster 资源是否充足、源端数据量是否突增 |
223
- | 目标表写入失败 | 检查目标表是否存在、字段类型是否兼容、权限是否充足 |
224
- | 任务异常停止 | 查看执行日志(`list_executions` + `get_execution_log`)排查具体错误 |
218
+ | Issue | Investigation |
219
+ |-------|--------------|
220
+ | Task creation failed | Check if a Sync VCluster is available (`LH_show_object_list` to view VCLUSTERS, filter for SYNC type) |
221
+ | Source connection failed | Check data source connection info, network reachability, account permissions |
222
+ | No data consumed from Kafka | Check topic name, consumer offset settings, Kafka cluster connectivity |
223
+ | JSON parsing failed | Check JSONPath expression correctness, verify message format is valid JSON |
224
+ | Increasing sync latency | Check if Sync VCluster resources are sufficient, whether source data volume has spiked |
225
+ | Target table write failed | Check if target table exists, column type compatibility, sufficient permissions |
226
+ | Task stopped unexpectedly | Check execution logs (`list_executions` + `get_execution_log`) for specific errors |
225
227
 
226
- ## 注意事项
228
+ ## Notes
227
229
 
228
- ### 运行模式
230
+ ### Run Mode
229
231
 
230
- - 实时同步任务为持续运行的流式任务,提交后即开始运行,无需配置调度
231
- - 不支持开发状态下的测试运行
232
- - 停止后需手动重新启动
232
+ - Real-time sync tasks are continuously running streaming tasks — they start running upon submission without scheduling
233
+ - Test runs in development state are not supported
234
+ - After stopping, manual restart is required
233
235
 
234
- ### Sync VCluster 要求
236
+ ### Sync VCluster Requirements
235
237
 
236
- - 实时同步任务(task_type=14)必须使用 Sync VCluster
237
- - 创建任务前需确认有可用的 Sync VCluster
238
- - 可通过 `LH_show_object_list`(object_type='VCLUSTERS')查看,筛选 vcluster_type 包含 SYNC 的集群
238
+ - Real-time sync tasks (task_type=14) must use a Sync VCluster
239
+ - Confirm a Sync VCluster is available before creating the task
240
+ - Check via `LH_show_object_list` (object_type='VCLUSTERS'), filter for clusters where vcluster_type contains SYNC
239
241
 
240
- ### Kafka 数据源特殊说明
242
+ ### Kafka Source Special Notes
241
243
 
242
- - 支持指定消费起始位点(earliest / latest / 指定 offset)
243
- - JSON 消息可通过 JSONPath 计算列解析嵌套字段
244
- - 默认字段包括:key、value、timestamp、partition、offset
244
+ - Supports specifying consumer start offset (earliest / latest / specific offset)
245
+ - JSON messages can be parsed via JSONPath computed columns for nested fields
246
+ - Default fields include: key, value, timestamp, partition, offset
245
247
 
246
- ### 与多表实时同步的选择
248
+ ### Choosing Between Single-table and Multi-table Real-time Sync
247
249
 
248
- - 单表实时同步(本 Skill):适合单张表/Topic 的精细化同步
249
- - 多表实时同步(`clickzetta-cdc-sync-pipeline`):适合整库 CDC、多表批量实时同步
250
- - 如需同步整个数据库的所有表,建议使用多表实时同步
250
+ - Single-table real-time sync (this skill): suitable for fine-grained sync of a single table/topic
251
+ - Multi-table real-time sync (`clickzetta-cdc-sync-pipeline`): suitable for full database CDC, multi-table batch real-time sync
252
+ - If you need to sync all tables in a database, use multi-table real-time sync
251
253
 
252
254
  ---
253
255
 
254
- ## cz-cli 替代路径
256
+ ## cz-cli Alternative Path
255
257
 
256
- > 仅在 cz-cli 可用且 MCP 不可用时使用本节。步骤编号与上方 MCP 路径对应。
257
- > 所有操作通过 `cz-cli agent run` 委托给内置 agent 完成,agent 内置完整的 Studio MCP 工具访问能力。
258
+ > Use this section only when cz-cli is available and MCP is not. Step numbers correspond to the MCP path above.
259
+ > All operations are delegated to the built-in agent via `cz-cli agent run`, which has full Studio MCP tool access.
258
260
 
259
- ### 单表实时同步(cz-cli 版)
261
+ ### Single-table Real-time Sync (cz-cli Version)
260
262
 
261
- **快速路径**:直接创建任务,然后在 Studio UI 配置数据源
263
+ **Quick path**: Create the task directly, then configure data source in Studio UI
262
264
 
263
265
  ```bash
264
- # 步骤 1:创建实时同步任务(task_type=14,即 REALTIME/CDC)
266
+ # Step 1: Create real-time sync task (task_type=14, i.e., REALTIME/CDC)
265
267
  cz-cli task create "rt_sync_<table>" --type REALTIME --folder <folder_name>
266
- # 返回 task_id 和 studio_url,在 studio_url 中完成数据源配置和字段映射
268
+ # Returns task_id and studio_url — complete data source configuration and column mapping at studio_url
267
269
 
268
- # 步骤 2:配置完成后,发布任务(实时同步无需配置调度,提交即持续运行)
270
+ # Step 2: After configuration, deploy the task (real-time sync needs no scheduling, runs continuously upon submission)
269
271
  cz-cli task deploy "rt_sync_<table>" -y
270
272
  ```
271
273
 
272
- **完整 agent 路径**(需要 agent 完成数据源探查和配置):
274
+ **Full agent path** (when agent is needed for data source exploration and configuration):
273
275
 
274
276
  ```bash
275
- # 一键完成:让 agent 完成完整的实时同步任务创建
276
- cz-cli agent run "创建实时同步任务(task_type=14),将数据源 <source_ds_name> <schema>.<table>(或 Kafka topic <topic>)实时同步到 Lakehouse public schema,使用 Sync VCluster,任务名 rt_sync_<table>,放在 <folder_name> 文件夹下" \
277
+ # One-shot: let the agent complete the full real-time sync task creation
278
+ cz-cli agent run "Create a real-time sync task (task_type=14), sync data source <source_ds_name> <schema>.<table> (or Kafka topic <topic>) to Lakehouse public schema in real time, use Sync VCluster, task name rt_sync_<table>, place in <folder_name> folder" \
277
279
  --format a2a --dangerously-skip-permissions
278
280
  ```
279
281
 
280
- 对于需要精细控制的场景,可拆分步骤:
282
+ For scenarios requiring fine-grained control, split into steps:
281
283
 
282
284
  ```bash
283
- # 步骤 1:确认 Sync VCluster 可用
284
- cz-cli agent run "列出所有可用的 VCluster,筛选 vcluster_type 包含 SYNC 的集群,确认有可用的 Sync VCluster" \
285
+ # Step 1: Confirm Sync VCluster availability
286
+ cz-cli agent run "List all available VClusters, filter for clusters where vcluster_type contains SYNC, confirm a Sync VCluster is available" \
285
287
  --format a2a --dangerously-skip-permissions
286
288
 
287
- # 步骤 2:查找数据源
288
- cz-cli agent run "列出所有已配置的数据源,按类型过滤(Kafka: ds_type=2, MySQL: ds_type=5, PostgreSQL: ds_type=7, SQL Server: ds_type=8),记录源端和目标端 Lakehouse 数据源名称" \
289
+ # Step 2: Find data sources
290
+ cz-cli agent run "List all configured data sources, filter by type (Kafka: ds_type=2, MySQL: ds_type=5, PostgreSQL: ds_type=7, SQL Server: ds_type=8), record source and target Lakehouse data source names" \
289
291
  --format a2a --dangerously-skip-permissions
290
292
 
291
- # 步骤 3(可选):探查源端数据结构
292
- cz-cli agent run "查看数据源 <source_ds_name> 的命名空间列表,以及 <schema> 下的表/Topic 列表和字段结构" \
293
+ # Step 3 (Optional): Explore source data structure
294
+ cz-cli agent run "View namespace list for data source <source_ds_name>, and the table/topic list and column structure under <schema>" \
293
295
  --format a2a --dangerously-skip-permissions
294
296
 
295
- # 步骤 4-5:创建并配置实时同步任务
296
- cz-cli agent run "创建实时同步任务(task_type=14),源端 datasource=<source_ds_name>,schema=<schema>,table=<table>(source_ds_type=<type>),目标 Lakehouse public.<table>,任务名 rt_sync_<table>" \
297
+ # Steps 4-5: Create and configure real-time sync task
298
+ cz-cli agent run "Create a real-time sync task (task_type=14), source datasource=<source_ds_name>, schema=<schema>, table=<table> (source_ds_type=<type>), target Lakehouse public.<table>, task name rt_sync_<table>" \
297
299
  --format a2a --dangerously-skip-permissions
298
300
 
299
- # 步骤 7:提交部署
300
- cz-cli agent run "提交实时同步任务 rt_sync_<table>,使其开始持续运行" \
301
+ # Step 7: Submit and deploy
302
+ cz-cli agent run "Submit real-time sync task rt_sync_<table> to start continuous running" \
301
303
  --format a2a --dangerously-skip-permissions
302
304
  ```
303
305
 
304
- > **注意**:实时同步任务不需要配置调度策略,提交即开始持续运行。Kafka JSON 消息的计算列配置需在 Studio UI 中完成。
306
+ > **Note**: Real-time sync tasks do not require scheduling configuration — they start running continuously upon submission. Kafka JSON message computed column configuration must be done in Studio UI.
305
307
 
306
308
  ---
307
309
 
308
- ### 运维监控(cz-cli 版)
310
+ ### Operations and Monitoring (cz-cli Version)
309
311
 
310
312
  ```bash
311
- # 查看最近运行记录
313
+ # View recent run history
312
314
  cz-cli runs list --task <task_name>
313
315
 
314
- # 查看运行详情
316
+ # View run details
315
317
  cz-cli runs detail <run_id>
316
318
 
317
- # 查看执行日志
319
+ # View execution logs
318
320
  cz-cli attempts log <run_id>
319
321
 
320
- # 下线任务(停止持续运行)
322
+ # Undeploy task (stop continuous running)
321
323
  cz-cli task undeploy <task_name> -y
322
324
  ```
323
-
@@ -1,5 +1,5 @@
1
- {"case_id":"001","type":"should_call","user_input":"怎么用 Studio 创建单表实时同步任务?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["实时同步","task_type","28"]}
2
- {"case_id":"002","type":"should_call","user_input":"Kafka 单个 topic 实时同步到 Lakehouse 表怎么配置?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["Kafka","实时同步"]}
3
- {"case_id":"003","type":"should_call","user_input":"单表实时同步和多表实时同步有什么区别?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["单表","多表","28","281"]}
4
- {"case_id":"004","type":"should_call","user_input":"MySQL 单表 CDC 实时同步到 Lakehouse 怎么做?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["MySQL","实时同步","CDC"]}
5
- {"case_id":"005","type":"should_call","user_input":"实时同步任务需要配置调度策略吗?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["无需配置","持续运行"]}
1
+ {"case_id":"001","type":"should_call","user_input":"How do I create a single-table real-time sync task in Studio?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["real-time sync","task_type","14"]}
2
+ {"case_id":"002","type":"should_call","user_input":"How do I configure a single Kafka topic to sync to a Lakehouse table in real time?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["Kafka","real-time sync"]}
3
+ {"case_id":"003","type":"should_call","user_input":"What is the difference between single-table real-time sync and multi-table real-time sync?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["single","multi","14","281"]}
4
+ {"case_id":"004","type":"should_call","user_input":"How do I do MySQL single-table CDC real-time sync to Lakehouse?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["MySQL","real-time sync","CDC"]}
5
+ {"case_id":"005","type":"should_call","user_input":"Does a real-time sync task need scheduling configuration?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["no scheduling","continuously running"]}