@clickzetta/cz-cli-darwin-arm64 0.3.92 → 0.3.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,633 +1,637 @@
1
1
  ---
2
2
  name: clickzetta-cdc-sync-pipeline
3
3
  description: |
4
- 创建和管理 ClickZetta Lakehouse 多表实时同步任务(CDC),将 MySQL / PostgreSQL 数据库整库或多表实时同步到 Lakehouse。
5
- 支持三种同步模式:整库镜像、多表镜像、多表合并(分库分表合并)。
6
- 基于 Binlog(MySQL)或 WALs(PostgreSQL)实现秒级端到端时效性,包含全量 + 增量两阶段同步。
7
- 当用户说"多表实时同步"、"整库同步"、"整库镜像"、"CDC 整库"、"多表 CDC"、"分库分表合并"、
8
- "多表合并同步"、"MySQL 整库同步到 Lakehouse"、"PostgreSQL 整库同步"、"multi-table realtime sync"
9
- "database migration"、"全量+增量同步"、"同步运维"、"同步 SOP"、"同步告警配置"、
10
- "Binlog 位点过期"、"server-id 冲突"、"补充全量同步"、"新增同步表"时触发。
11
- 包含源端数据库准备(参数配置+权限)、三种同步模式选择、任务创建部署、运维 SOP(补全量/加表/数据修复)、
12
- 监控告警配置(5 种告警规则+IM webhook)、详细故障排除等 ClickZetta Studio 特有逻辑。
13
- Keywords: CDC, real-time sync, MySQL, PostgreSQL, change data capture, mirror, merge
4
+ Create and manage ClickZetta Lakehouse multi-table real-time sync (CDC) tasks, syncing entire MySQL / PostgreSQL
5
+ databases or multiple tables to Lakehouse in real time.
6
+ Supports three sync modes: full database mirror, multi-table mirror, and sharded table merge.
7
+ Based on Binlog (MySQL) or WALs (PostgreSQL) for second-level end-to-end latency, with full load + incremental two-phase sync.
8
+ Triggered when the user says "multi-table real-time sync", "full database sync", "database mirror",
9
+ "CDC full database", "multi-table CDC", "sharded table merge", "MySQL full database sync to Lakehouse",
10
+ "PostgreSQL full database sync", "multi-table realtime sync", "database migration",
11
+ "full load + incremental sync", "sync operations", "sync SOP", "sync alert configuration",
12
+ "Binlog position expired", "server-id conflict", "full re-sync", "add sync table".
13
+ Covers source database preparation (parameter configuration + permissions), three sync mode selection,
14
+ task creation and deployment, operations SOP (full re-sync/add table/data repair),
15
+ monitoring and alerting (5 alert rules + IM webhook), and detailed troubleshooting —
16
+ all ClickZetta Studio specific logic.
17
+ Keywords: CDC, real-time sync, MySQL, PostgreSQL, change data capture, mirror, merge, multi-table
14
18
  ---
15
19
 
16
- # 多表实时同步 Pipeline 工作流
20
+ # Multi-table Real-time Sync (CDC) Pipeline Workflow
17
21
 
18
- ## 向导:收集必要信息
22
+ ## Wizard: Collect Required Information
19
23
 
20
- 开始创建 CDC 同步任务前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
24
+ Before creating a CDC sync task, use an interactive question tool (e.g., `question`) to collect the following information via option menus. If no such tool is available, list all questions in text at once:
21
25
 
22
26
  ```
23
27
  question({
24
28
  questions: [
25
29
  {
26
- question: "源端数据库类型?",
30
+ question: "Source database type?",
27
31
  options: [
28
- { label: "MySQL", description: "含 Aurora MySQL、PolarDB MySQL,基于 Binlog" },
29
- { label: "PostgreSQL", description: "含 Aurora PG、PolarDB PG,基于 WALs,需 14+" }
32
+ { label: "MySQL", description: "Including Aurora MySQL, PolarDB MySQL — based on Binlog" },
33
+ { label: "PostgreSQL", description: "Including Aurora PG, PolarDB PG — based on WALs, requires 14+" }
30
34
  ]
31
35
  },
32
36
  {
33
- question: "同步模式?",
37
+ question: "Sync mode?",
34
38
  options: [
35
- { label: "整库镜像", description: "同步整个数据库,自动适配新增表" },
36
- { label: "多表镜像", description: "指定同步哪些表" },
37
- { label: "多表合并", description: "分库分表合并到一张目标表" }
39
+ { label: "Full database mirror", description: "Sync entire database, auto-adapts to new tables" },
40
+ { label: "Multi-table mirror", description: "Specify which tables to sync" },
41
+ { label: "Sharded table merge", description: "Merge sharded tables into one target table" }
38
42
  ]
39
43
  },
40
44
  {
41
- question: "源端是否已完成准备?",
45
+ question: "Is the source database already prepared?",
42
46
  options: [
43
- { label: "已准备好", description: "MySQL: Binlog 已开启,账号有 REPLICATION 权限;PG: wal_level=logical" },
44
- { label: "不确定,帮我检查", description: "我来帮你验证源端配置" }
47
+ { label: "Ready", description: "MySQL: Binlog enabled, account has REPLICATION permission; PG: wal_level=logical" },
48
+ { label: "Not sure, help me check", description: "I'll help verify source configuration" }
45
49
  ]
46
50
  }
47
51
  ]
48
52
  })
49
53
  ```
50
54
 
51
- 收集到信息后,还需确认目标 schema(如 `ods`)。
55
+ After collecting the above, also confirm the target schema (e.g., `ods`).
52
56
 
53
- **如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
57
+ **If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
54
58
 
55
- ## 适用场景
59
+ ## Applicable Scenarios
56
60
 
57
- - MySQL / PostgreSQL 数据库整库或多表实时同步到 Lakehouse(CDC 变更捕获)
58
- - 整库镜像:以数据库为粒度,自动适配新增表
59
- - 多表镜像:以表粒度选择,支持自动感知字段变更
60
- - 多表合并:将分库分表数据合并写入同一张目标表
61
- - 全量 + 增量两阶段同步,秒级端到端时效性
62
- - 关键词:多表实时同步、整库同步、CDC、分库分表合并、database migration
61
+ - Sync entire MySQL / PostgreSQL databases or multiple tables to Lakehouse in real time (CDC change capture)
62
+ - Full database mirror: database-level granularity, auto-adapts to new tables
63
+ - Multi-table mirror: table-level selection, supports automatic schema change detection
64
+ - Sharded table merge: merge sharded table data into a single target table
65
+ - Full load + incremental two-phase sync, second-level end-to-end latency
66
+ - Keywords: multi-table real-time sync, full database sync, CDC, sharded table merge, database migration
63
67
 
64
- ## 与其他同步方式的区别
68
+ ## Comparison with Other Sync Methods
65
69
 
66
- | 维度 | 多表实时同步(本 Skill) | 单表实时同步 | 离线同步 |
67
- |------|------------------------|------------|---------|
68
- | 任务类型 ID | `281`(多表实时同步) | `28` | `10` / `291` |
69
- | 同步粒度 | 整库/多表/分库分表合并 | 单表/单 Topic | 单表/多表 |
70
- | 运行模式 | 持续运行(流式 CDC) | 持续运行(流式) | 周期调度(批量) |
71
- | 数据源 | MySQL / PostgreSQL | Kafka/MySQL/PG/SQLServer | 多种 |
72
- | 调度策略 | 无需配置,提交即运行 | 无需配置 | 需配置 Cron |
73
- | 适用 Skill | `clickzetta-cdc-sync-pipeline` | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` |
70
+ | Dimension | Multi-table Real-time Sync (This Skill) | Single-table Real-time Sync | Batch Sync |
71
+ |-----------|----------------------------------------|---------------------------|------------|
72
+ | Task Type ID | `281` (multi-table real-time sync) | `28` | `10` / `291` |
73
+ | Sync Granularity | Full database/multi-table/sharded merge | Single table/topic | Single/multi-table |
74
+ | Run Mode | Continuously running (streaming CDC) | Continuously running (streaming) | Scheduled (batch) |
75
+ | Data Sources | MySQL / PostgreSQL | Kafka/MySQL/PG/SQL Server | Multiple |
76
+ | Scheduling | Not required, runs upon submission | Not required | Cron required |
77
+ | Applicable Skill | `clickzetta-cdc-sync-pipeline` | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` |
74
78
 
75
- ## 支持的数据源
79
+ ## Supported Data Sources
76
80
 
77
- ### 来源端
81
+ ### Source
78
82
 
79
- | 数据源类型 | 增量读取模式 | 数据库版本 | ds_type |
80
- |-----------|------------|-----------|---------|
81
- | MySQL 类(含 Aurora MySQLPolarDB MySQL | Binlog | 5.6+、8.x | 5, 39, 19 |
82
- | PostgreSQL 类(含 Aurora PGPolarDB PG | WALs 日志 | 14+ | 7, 40, 48 |
83
+ | Data Source Type | Incremental Read Mode | Database Version | ds_type |
84
+ |-----------------|----------------------|-----------------|---------|
85
+ | MySQL (including Aurora MySQL, PolarDB MySQL) | Binlog | 5.6+, 8.x | 5, 39, 19 |
86
+ | PostgreSQL (including Aurora PG, PolarDB PG) | WALs | 14+ | 7, 40, 48 |
83
87
  | SQL Server | CDC | - | 8 |
84
88
  | TiDB | - | - | 17 |
85
89
 
86
- ### 目标端
90
+ ### Target
87
91
 
88
- | 数据源 | ds_type |
89
- |--------|---------|
92
+ | Data Source | ds_type |
93
+ |------------|---------|
90
94
  | Lakehouse | 1 |
91
95
  | Kafka | 2 |
92
96
 
93
- ## 前置依赖
97
+ ## Prerequisites
94
98
 
95
- - ClickZetta Lakehouse Studio 账户,具备创建同步任务权限
96
- - 源端数据源已在 Studio 中配置(通过 Studio UI 添加数据源,不是 SQL Storage Connection),且账号具备 CDC 所需权限
97
- - Sync VCluster 可用(多表实时同步任务 task_type=281 必须使用 Sync VCluster)
98
- - **执行环境(满足其一即可,优先使用 cz-cli)**:
99
- - **cz-cli 路径**:已安装 cz-cli(`brew install cz-cli 或参考官方文档安装`),并完成 `cz-cli setup` 配置
100
- - **MCP 路径**:clickzetta-studio-mcp 工具可用(`create_task`、`save_cdc_realtime_task`、`publish_task`、`list_data_sources`、`LH_show_object_list` 等)
99
+ - ClickZetta Lakehouse Studio account with permissions to create sync tasks
100
+ - Source data source already configured in Studio (via Studio UI, not SQL Storage Connection), with CDC-required permissions
101
+ - Sync VCluster available (multi-table real-time sync task_type=281 must use a Sync VCluster)
102
+ - **Execution environment (one of the following, cz-cli preferred)**:
103
+ - **cz-cli path**: cz-cli installed (`brew install cz-cli`, or refer to the official docs) and `cz-cli setup` completed
104
+ - **MCP path**: clickzetta-studio-mcp tools available (`create_task`, `save_cdc_realtime_task`, `publish_task`, `list_data_sources`, `LH_show_object_list`, etc.)
101
105
 
102
- ## 环境探测(执行前必读)
106
+ ## Environment Detection (Read Before Execution)
103
107
 
104
- 在开始任何操作前,先判断当前执行环境:
108
+ Before starting any operation, determine the current execution environment:
105
109
 
106
- **第一步:检测 cz-cli 是否可用**
110
+ **Step 1: Check if cz-cli is available**
107
111
  ```bash
108
112
  cz-cli --version
109
113
  ```
110
- - 若命令存在 → **走 cz-cli 路径**(见本文档末尾"cz-cli 替代路径"章节)
111
- - 若命令不存在 → 继续检测 MCP
114
+ - If command exists → **use cz-cli path** (see "cz-cli Alternative Path" section at the end of this document)
115
+ - If command not found → continue to check MCP
112
116
 
113
- **第二步:检测 MCP 是否可用(仅在 cz-cli 不可用时)**
117
+ **Step 2: Check if MCP is available (only when cz-cli is unavailable)**
114
118
 
115
- 尝试调用 `list_data_sources` 工具查询数据源列表。
116
- - 若工具存在于 tool list → **走 MCP 路径**(本文档默认路径)
117
- - 若工具不存在 → 停止执行,提示用户:
118
- > "当前环境既无 cz-cli 也无 MCP 工具,请安装其中之一后重试。
119
- > cz-cli 安装:`brew install cz-cli 或参考官方文档安装`,然后运行 `cz-cli setup`
120
- > MCP 安装:参考 clickzetta-studio-mcp 配置文档"
119
+ Try calling the `list_data_sources` tool to query the data source list.
120
+ - If tool exists in tool list → **use MCP path** (default path in this document)
121
+ - If tool not found → stop execution and prompt the user:
122
+ > "Neither cz-cli nor MCP tools are available in the current environment. Please install one of them before retrying.
123
+ > cz-cli installation: `brew install cz-cli` (or refer to the official docs), then run `cz-cli setup`
124
+ > MCP installation: refer to clickzetta-studio-mcp configuration docs"
121
125
 
122
- > ⚠️ **重要区分**:CDC 多表同步使用 **Studio 数据源**(通过 Studio UI API 配置),不是 SQL `CREATE STORAGE CONNECTION`。
123
- > - `CREATE STORAGE CONNECTION` 仅支持对象存储类型(OSS/COS/S3)和 Kafka
124
- > - MySQL / PostgreSQL 等关系数据库的连接通过 **Studio 数据源管理** 配置
125
- > - 使用 `list_data_sources` API 查看已配置的数据源
126
+ > ⚠️ **Important distinction**: CDC multi-table sync uses **Studio data sources** (configured via Studio UI or API), not SQL `CREATE STORAGE CONNECTION`.
127
+ > - `CREATE STORAGE CONNECTION` only supports object storage types (OSS/COS/S3) and Kafka
128
+ > - MySQL / PostgreSQL relational database connections are configured via **Studio Data Source Management**
129
+ > - Use `list_data_sources` API to view configured data sources
126
130
 
127
- ## 源端数据库准备
131
+ ## Source Database Preparation
128
132
 
129
- ### MySQL 参数要求
133
+ ### MySQL Parameter Requirements
130
134
 
131
- 在源端 MySQL 数据库上确认以下参数:
135
+ Verify the following parameters on the source MySQL database:
132
136
 
133
- | 参数 | 要求值 | 查询方法 |
134
- |------|--------|---------|
137
+ | Parameter | Required Value | Query Method |
138
+ |-----------|---------------|--------------|
135
139
  | `log_bin` | ON | `SHOW GLOBAL VARIABLES LIKE 'log_bin'` |
136
140
  | `binlog_format` | ROW | `SHOW GLOBAL VARIABLES LIKE 'binlog_format'` |
137
141
  | `binlog_row_image` | FULL | `SHOW GLOBAL VARIABLES LIKE 'binlog_row_image'` |
138
- | `binlog_expire_logs_seconds` | ≥86400(建议) | - |
142
+ | `binlog_expire_logs_seconds` | ≥86400 (recommended) | - |
139
143
 
140
- MySQL 权限要求(建议用 root 执行):
141
- - 元数据读取:`SELECT` on information_schema + 目标库表
142
- - Binlog 同步:`REPLICATION SLAVE`, `REPLICATION CLIENT`
143
- - 全量同步:`SELECT` on 目标表
144
+ MySQL permission requirements (executing as root is recommended):
145
+ - Metadata read: `SELECT` on information_schema + target database tables
146
+ - Binlog sync: `REPLICATION SLAVE`, `REPLICATION CLIENT`
147
+ - Full load: `SELECT` on target tables
144
148
 
145
- ### PostgreSQL 参数要求
149
+ ### PostgreSQL Parameter Requirements
146
150
 
147
- 以下参数修改后需重启 PostgreSQL Server
151
+ The following parameters require a PostgreSQL Server restart after modification:
148
152
 
149
- | 参数 | 要求值 | 说明 |
150
- |------|--------|------|
151
- | `wal_level` | logical | 支持逻辑解码 |
152
- | `max_replication_slots` | ≥10 | 允许创建的 slot 数量 |
153
- | `max_wal_senders` | ≥10 | 最多同时运行的 WAL sender 进程数 |
153
+ | Parameter | Required Value | Description |
154
+ |-----------|---------------|-------------|
155
+ | `wal_level` | logical | Enables logical decoding |
156
+ | `max_replication_slots` | ≥10 | Maximum number of slots allowed |
157
+ | `max_wal_senders` | ≥10 | Maximum concurrent WAL sender processes |
154
158
 
155
- PostgreSQL 权限要求(建议用管理员账号执行):
156
- - 元数据读取:`SELECT` on information_schema
157
- - WAL 日志同步:`REPLICATION` 权限
158
- - 全量同步:`SELECT` on 目标表
159
- - 创建 publication:`CREATE` 权限
159
+ PostgreSQL permission requirements (executing with an administrator account is recommended):
160
+ - Metadata read: `SELECT` on information_schema
161
+ - WAL sync: `REPLICATION` permission
162
+ - Full load: `SELECT` on target tables
163
+ - Create publication: `CREATE` permission
160
164
 
161
- > **PostgreSQL 特别注意**:需要配置 replication slot,不同任务不要复用同一个 slot。任务启动时如 slot 被占用会启动失败。
165
+ > **PostgreSQL special note**: A replication slot must be configured. Different tasks should not reuse the same slot. If a slot is occupied when the task starts, it will fail to start.
162
166
 
163
- ## 工作流
167
+ ## Workflow
164
168
 
165
- ### 步骤 1:确认 Sync VCluster 可用
169
+ ### Step 1: Confirm Sync VCluster Availability
166
170
 
167
171
  ```
168
- 使用 LH_show_object_list(object_type='VCLUSTERS')查看可用虚拟集群。
169
- 筛选 vcluster_type 包含 SYNC 的集群。
170
- 如无可用 Sync VCluster,提示用户先创建后再继续。
172
+ Use LH_show_object_list (object_type='VCLUSTERS') to view available virtual clusters.
173
+ Filter for clusters where vcluster_type contains SYNC.
174
+ If no Sync VCluster is available, prompt the user to create one before proceeding.
171
175
  ```
172
176
 
173
- ### 步骤 2:查找源端数据源
177
+ ### Step 2: Find Source Data Source
174
178
 
175
179
  ```
176
- 使用 list_data_sources 查看已配置的数据源。
177
- 按类型过滤:
180
+ Use list_data_sources to view configured data sources.
181
+ Filter by type:
178
182
  - MySQL: ds_type=5
179
183
  - PostgreSQL: ds_type=7
180
- 记录源端 datasource_id 和 datasource_type。
184
+ Record the source datasource_id and datasource_type.
181
185
  ```
182
186
 
183
- ### 步骤 3:探查源端数据结构
187
+ ### Step 3: Explore Source Data Structure
184
188
 
185
189
  ```
186
- 使用 list_namespaces 查看源端数据库列表。
187
- 使用 list_metadata_objects 查看库下的表列表。
188
- 确认需要同步的范围(整库 / 指定表 / 分库分表)。
190
+ Use list_namespaces to view the source database list.
191
+ Use list_metadata_objects to view tables under a database.
192
+ Confirm the sync scope (full database / specific tables / sharded tables).
189
193
  ```
190
194
 
191
- ### 步骤 4:选择同步模式
195
+ ### Step 4: Select Sync Mode
192
196
 
193
- 根据用户需求选择三种模式之一:
197
+ Choose one of three modes based on user requirements:
194
198
 
195
- | 模式 | pipeline_type | 适用场景 |
196
- |------|--------------|---------|
197
- | 整库镜像 | 3 | 同步整个数据库所有表,自动适配新增表 |
198
- | 多表镜像 | 1 | 选定指定表同步,支持自动感知字段变更 |
199
- | 多表合并 | 2 | 分库分表数据合并写入同一张目标表 |
199
+ | Mode | pipeline_type | Use Case |
200
+ |------|--------------|----------|
201
+ | Full database mirror | 3 | Sync all tables in a database, auto-adapts to new tables |
202
+ | Multi-table mirror | 1 | Sync selected specific tables, supports automatic schema change detection |
203
+ | Sharded table merge | 2 | Merge sharded table data into a single target table |
200
204
 
201
- ### 步骤 5:创建多表实时同步任务
205
+ ### Step 5: Create Multi-table Real-time Sync Task
202
206
 
203
207
  ```
204
- 使用 create_task 创建任务:
205
- - task_type: 281(多表实时同步)
206
- - task_name: 自定义名称(如 "cdc_sync_mysql_orders_db")
207
- - data_folder_id: 目标文件夹 ID(通过 list_folders 获取)
208
+ Use create_task to create the task:
209
+ - task_type: 281 (multi-table real-time sync)
210
+ - task_name: custom name (e.g., "cdc_sync_mysql_orders_db")
211
+ - data_folder_id: target folder ID (obtainable via list_folders)
208
212
 
209
- 记录返回的 task_id(即 data_file_id)。
213
+ Record the returned task_id (i.e., data_file_id).
210
214
  ```
211
215
 
212
- ### 步骤 6:配置同步内容
216
+ ### Step 6: Configure Sync Content
213
217
 
214
218
  ```
215
- 使用 save_cdc_realtime_task 配置同步:
216
- - data_file_id: 步骤 5 返回的 task_id
217
- - pipeline_type: 步骤 4 选择的模式(1=多表镜像, 2=多表合并, 3=整库镜像)
219
+ Use save_cdc_realtime_task to configure sync:
220
+ - data_file_id: task_id returned in Step 5
221
+ - pipeline_type: mode selected in Step 4 (1=multi-table mirror, 2=sharded table merge, 3=full database mirror)
218
222
  - source_datasource_list: [{"datasourceId": <id>, "datasourceType": <type>}]
219
223
  - sync_object_list:
220
- - 整库镜像:[{"schemaName": "<数据库名>"}](仅指定库名)
221
- - 多表镜像:[{"schemaName": "<库名>", "tableName": "<表名>"}, ...]
222
- - 多表合并:通过正则或文件批量配置
224
+ - Full database mirror: [{"schemaName": "<database_name>"}] (specify database name only)
225
+ - Multi-table mirror: [{"schemaName": "<db>", "tableName": "<table>"}, ...]
226
+ - Sharded table merge: configure via regex or batch file
223
227
  - target_datasource: {"datasourceId": <lakehouse_id>, "datasourceType": 1}
224
- - sync_mode: 1(全量+增量,推荐)或 2(仅增量)
225
- - save_mode: 2(追加,推荐新任务使用)
228
+ - sync_mode: 1 (full load + incremental, recommended) or 2 (incremental only)
229
+ - save_mode: 2 (append, recommended for new tasks)
226
230
  ```
227
231
 
228
- > **sync_mode 说明**:
229
- > - `1`(全量+增量):先全量同步历史数据,再启动增量 CDC,推荐首次使用
230
- > - `2`(仅增量):仅从当前位点开始捕获变更,适合已有历史数据的场景
232
+ > **sync_mode explanation**:
233
+ > - `1` (full load + incremental): full load of historical data first, then starts incremental CDC — recommended for first use
234
+ > - `2` (incremental only): captures changes from current position only — suitable when historical data already exists
231
235
 
232
- ### 步骤 7:提交部署
236
+ ### Step 7: Submit and Deploy
233
237
 
234
238
  ```
235
- 使用 publish_task 提交任务:
236
- - task_id: 任务 ID
237
- - task_version: 当前版本号(通过 get_task_detail 获取)
239
+ Use publish_task to submit the task:
240
+ - task_id: task ID
241
+ - task_version: current version number (obtainable via get_task_detail)
238
242
 
239
- 提交后任务不会自动启动,需要手动启动。
243
+ The task does not start automatically after submission — manual start is required.
240
244
  ```
241
245
 
242
- > **重要**:多表实时同步任务是持续运行的流式任务,不需要配置调度策略(不要调用 save_task_configuration)。提交后在 Studio UI 中手动启动。
246
+ > **Important**: Multi-table real-time sync tasks are continuously running streaming tasks. No scheduling configuration is needed (do not call save_task_configuration). Start manually in Studio UI after submission.
243
247
 
244
- ### 步骤 8:启动任务
248
+ ### Step 8: Start the Task
245
249
 
246
- 在 Studio UI 中启动任务,选择启动方式:
250
+ Start the task in Studio UI, selecting the start method:
247
251
 
248
- | 启动方式 | 说明 | 适用场景 |
249
- |---------|------|---------|
250
- | 无状态启动 | 完整同步所有数据(全量→增量) | 首次启动 |
251
- | 从上次保存状态恢复 | 从停止位点断点续传 | 停止后重启 |
252
- | 自定义起始位置 | MySQL: 指定 binlog 文件/时间;PG: 指定 LSN | 数据回刷 |
252
+ | Start Method | Description | Use Case |
253
+ |-------------|-------------|----------|
254
+ | Stateless start | Full sync of all data (full load → incremental) | First start |
255
+ | Resume from last saved state | Resume from the stop position | Restart after stop |
256
+ | Custom start position | MySQL: specify binlog file/time; PG: specify LSN | Data re-sync |
253
257
 
254
- 全量同步阶段可配置最大并发数,控制对源端数据库的压力。
258
+ During the full load phase, you can configure maximum concurrency to control pressure on the source database.
255
259
 
256
- ### 步骤 9:运维监控
260
+ ### Step 9: Operations and Monitoring
257
261
 
258
262
  ```
259
- 任务启动后经历三个阶段:初始化 → 全量同步 → 增量同步。
260
-
261
- 监控指标:
262
- - 读取数据 / 写入数据(记录数)
263
- - 平均读取速率 / 平均写入速率
264
- - Failover 次数
265
- - 单表级别:最新读取位置、最新更新时间、数据延迟
266
-
267
- 单表运维操作:
268
- - 优先执行:提高全量同步优先级
269
- - 取消运行 / 强制停止:停止单表同步
270
- - 重新同步:对该表重新全量+增量
271
- - 补数同步:按条件过滤部分数据重新全量同步
272
- - 查看异常:查看 Schema Evolution 异常等
263
+ After starting, the task goes through three phases: Initialization → Full Load → Incremental Sync.
264
+
265
+ Monitoring metrics:
266
+ - Data read / data written (record count)
267
+ - Average read rate / average write rate
268
+ - Failover count
269
+ - Per-table level: latest read position, latest update time, data latency
270
+
271
+ Per-table operations:
272
+ - Priority execution: increase full load priority for a table
273
+ - Cancel run / force stop: stop sync for a single table
274
+ - Re-sync: perform full load + incremental again for that table
275
+ - Backfill sync: re-sync partial data based on filter conditions
276
+ - View exceptions: view Schema Evolution exceptions, etc.
273
277
  ```
274
278
 
275
- ## 三种同步模式详解
279
+ ## Three Sync Modes in Detail
276
280
 
277
- ### 整库镜像
281
+ ### Full Database Mirror
278
282
 
279
- - 以数据库为粒度配置,只选库不选表
280
- - 自动适配库中新增表
281
- - 适合需要完整镜像整个数据库的场景
283
+ - Configured at database granularity — select database only, not individual tables
284
+ - Auto-adapts to new tables added to the database
285
+ - Suitable for scenarios requiring a complete mirror of an entire database
282
286
 
283
- ### 多表镜像
287
+ ### Multi-table Mirror
284
288
 
285
- - 以表粒度选择需要同步的表
286
- - 支持自动感知字段个数的新增和删除
287
- - 支持批量配置(上传配置文件)
288
- - PostgreSQL 需要配置 replication slot(decoderbufs 或 pgoutput 插件)
289
+ - Select specific tables to sync at table granularity
290
+ - Supports automatic detection of column additions and deletions
291
+ - Supports batch configuration (upload configuration file)
292
+ - PostgreSQL requires replication slot configuration (decoderbufs or pgoutput plugin)
289
293
 
290
- ### 多表合并
294
+ ### Sharded Table Merge
291
295
 
292
- - 将分库分表数据合并写入同一张目标表
293
- - 使用"虚拟表"作为中间承接:新建虚拟表时,基于数据源/Schema/Table 名称给定筛选条件,将匹配的源端表定义为写入同一张虚拟表
294
- - 两种配置方式:
295
- - 基于规则:正则匹配筛选表(如以 `abc` 开头的所有表)
296
- - 基于文件:上传配置文件批量指定
297
- - 扩展字段功能:可在目标表中额外新增字段记录来源信息(server/database/schema/table 名称)
298
- - 分库分表主键冲突解决:开启扩展字段并将其设为联合主键,避免不同分库分表中主键相同记录的写入冲突
299
- - 异构字段合并:当分库分表字段结构不完全一致时,系统自动校验并提示差异,可选择异构字段合并功能处理
296
+ - Merges sharded table data into a single target table
297
+ - Uses "virtual tables" as an intermediate layer: when creating a virtual table, define filter conditions based on data source/schema/table names to map matching source tables to the same virtual table
298
+ - Two configuration methods:
299
+ - Rule-based: regex matching to filter tables (e.g., all tables starting with `abc`)
300
+ - File-based: upload configuration file for batch specification
301
+ - Extended fields feature: add extra columns to the target table to record source information (server/database/schema/table names)
302
+ - Sharded table primary key conflict resolution: enable extended fields and set them as composite primary key to avoid write conflicts from records with the same primary key across different shards
303
+ - Heterogeneous column merge: when sharded tables have inconsistent column structures, the system automatically validates and reports differences — use the heterogeneous column merge feature to handle this
300
304
 
301
- ## 高阶参数
305
+ ## Advanced Parameters
302
306
 
303
- 在任务「参数」区域可设定以下高阶参数(默认不建议调整,调整前请联系技术支持):
307
+ The following advanced parameters can be set in the task "Parameters" area (not recommended to adjust by default — contact technical support before adjusting):
304
308
 
305
- | 参数 | 含义 | 默认值 | 调优建议 |
306
- |------|------|--------|---------|
307
- | `step1.taskmanager.memory.process.size` | 增量同步进程总内存 | 1600m | 全量数据特别大时可调至 4000m |
308
- | `step2.taskmanager.memory.process.size` | 全量同步进程总内存 | 2000m | - |
309
- | `step1.taskmanager.memory.task.off-heap.size` | 增量同步堆外内存 | 256m | 全量数据特别大时可调至 500M |
310
- | `lh.table.cz.common.output.file.max.size` | 全量同步单文件切分大小 | 33554432 | - |
311
- | `pod.limit.memory` | 提交客户端内存上限 | 1Gi | - |
309
+ | Parameter | Description | Default | Tuning Advice |
310
+ |-----------|-------------|---------|---------------|
311
+ | `step1.taskmanager.memory.process.size` | Incremental sync process total memory | 1600m | Increase to 4000m for very large full loads |
312
+ | `step2.taskmanager.memory.process.size` | Full load process total memory | 2000m | - |
313
+ | `step1.taskmanager.memory.task.off-heap.size` | Incremental sync off-heap memory | 256m | Increase to 500m for very large full loads |
314
+ | `lh.table.cz.common.output.file.max.size` | Full load single file split size | 33554432 | - |
315
+ | `pod.limit.memory` | Submit client memory limit | 1Gi | - |
312
316
 
313
- ## 停止与下线
317
+ ## Stop and Undeploy
314
318
 
315
- ### 停止任务
319
+ ### Stop Task
316
320
 
317
- - 停止会自动保存增量同步位点
318
- - 全量阶段停止:重启后未完成的表会重新全量同步
319
- - 增量阶段停止:重启后从停止位点继续
320
- - 恢复方式:点击"启动",选择"从上次保存状态恢复"即可断点续传
321
- - 如需回溯数据:选择"自定义起始位置",指定 binlog 文件/位点(MySQL)或 LSN(PostgreSQL),确保指定位点未过期
321
+ - Stopping automatically saves the incremental sync position
322
+ - Stop during full load phase: incomplete tables will re-sync from full load on restart
323
+ - Stop during incremental phase: resumes from stop position on restart
324
+ - Recovery: click "Start", select "Resume from last saved state" for checkpoint recovery
325
+ - To backtrack data: select "Custom start position", specify binlog file/position (MySQL) or LSN (PostgreSQL) — ensure the specified position has not expired
322
326
 
323
- ### 下线任务(高危)
327
+ ### Undeploy Task (High Risk)
324
328
 
325
- - 不保存同步位点,再次上线需重新同步
326
- - 不清理已同步到目标端的数据,不删除目标表
327
- - 重新同步不会重建表:全量覆盖写入(insert overwrite),增量 merge into 更新
328
- - 仅在以下情况使用:任务确定不再需要、任务状态异常需修复
329
+ - Does not save sync position — re-deployment requires full re-sync
330
+ - Does not clean up data already synced to target, does not delete target tables
331
+ - Re-sync does not recreate tables: full load uses insert overwrite, incremental uses merge into
332
+ - Use only when: the task is definitively no longer needed, or task state is abnormal and needs repair
329
333
 
330
- ## 运维 SOP
334
+ ## Operations SOP
331
335
 
332
- ### 后续补充全量同步
336
+ ### Supplementary Full Load After Initial Start
333
337
 
334
- 首次启动未选择全量同步,后续需要补充全量数据的 3 种方案:
338
+ Three approaches when full load was not selected at first start but historical data is needed later:
335
339
 
336
- | 方案 | 操作 | 影响 |
337
- |------|------|------|
338
- | 方案一:单表重新同步 | 对指定表执行"重新同步" | 源端数据同步到临时表,insert overwrite 写入目标表,不影响查询 |
339
- | 方案二:单表补数同步 | 对指定表执行"补数同步",过滤条件设为 `where 1=1` | 按条件从源端拉取数据到临时表,delete + merge into 写入目标表 |
340
- | 方案三:下线重上线 | 停止→下线→上线→启动(选择全量同步) | 清空位点信息,重新全量+增量同步,不删除目标表 |
340
+ | Approach | Operation | Impact |
341
+ |----------|-----------|--------|
342
+ | Approach 1: Single-table re-sync | Execute "Re-sync" for the specified table | Source data synced to temp table, insert overwrite to target table, no query impact |
343
+ | Approach 2: Single-table backfill | Execute "Backfill sync" for the specified table, filter condition set to `where 1=1` | Data pulled from source to temp table based on condition, delete + merge into target table |
344
+ | Approach 3: Undeploy and redeploy | Stop → Undeploy → Deploy → Start (select full load) | Clears position info, full load + incremental re-sync, does not delete target tables |
341
345
 
342
- ### 新增同步表
346
+ ### Add Sync Tables
343
347
 
344
- 1. 编辑任务,添加需要新增的表,保存
345
- 2. 提交任务发布
346
- 3. 在运维中心停止任务,再启动任务
347
- 4. 重启后自动同步新增表数据(如设定全量同步则执行全量,否则仅增量)
348
- 5. 不影响存量表的同步进度
348
+ 1. Edit the task, add the tables to sync, save
349
+ 2. Submit task for deployment
350
+ 3. Stop the task in Operations Center, then restart
351
+ 4. After restart, new tables are automatically synced (full load if configured, otherwise incremental only)
352
+ 5. Does not affect sync progress of existing tables
349
353
 
350
- ### 分库分表加减数据源/Schema/Table
354
+ ### Add/Remove Data Sources/Schemas/Tables for Sharded Tables
351
355
 
352
- - 在任务开发界面直接编辑
353
- - 保存→提交→重启任务后生效
354
- - 新增对象如设定全量同步会自动执行全量
355
- - 不影响存量表同步进度
356
+ - Edit directly in the task development interface
357
+ - Save → Submit → Restart task to take effect
358
+ - New objects will automatically execute full load if configured
359
+ - Does not affect sync progress of existing tables
356
360
 
357
- ### 优先同步重要表
361
+ ### Priority Sync for Important Tables
358
362
 
359
- - 全量同步阶段,对重要表使用"优先执行"操作
360
- - 在资源队列中插队,优先处理该表的全量同步
363
+ - During full load phase, use "Priority execution" for important tables
364
+ - Jumps ahead in the resource queue so that table's full load is processed first
361
365
 
362
- ### 暂停/恢复单表增量同步
366
+ ### Pause/Resume Single-table Incremental Sync
363
367
 
364
- - 暂停:对单表执行"停止增量同步",暂停该表变更消息消费
365
- - 恢复:执行"恢复增量同步",为保证数据连续性会从源端重新拉取一次全量数据
366
- - 适用场景:源端突发大流量时,暂停不重要表为重要表让出处理资源
368
+ - Pause: execute "Stop incremental sync" for a table to pause change message consumption
369
+ - Resume: execute "Resume incremental sync" — to ensure data continuity, a full load from source is performed
370
+ - Use case: during sudden high traffic from source, pause less important tables to free processing resources for important ones
367
371
 
368
- ### 单表数据修复
372
+ ### Single-table Data Repair
369
373
 
370
- | 操作 | 说明 | 写入方式 |
371
- |------|------|---------|
372
- | 重新同步 | 重新同步源端表全量数据 | 同步到临时表 → insert overwrite 写入目标表 |
373
- | 补数同步 | 按过滤条件从源端拉取部分/全部数据 | 同步到临时表 → delete 目标表相关数据 → merge into 写入 |
374
+ | Operation | Description | Write Method |
375
+ |-----------|-------------|--------------|
376
+ | Re-sync | Re-sync full source table data | Sync to temp table → insert overwrite to target table |
377
+ | Backfill sync | Pull partial/full data from source based on filter conditions | Sync to temp table → delete related data from target → merge into target |
374
378
 
375
- ## 监控告警配置
379
+ ## Monitoring and Alerting Configuration
376
380
 
377
- ### 推荐告警规则
381
+ ### Recommended Alert Rules
378
382
 
379
- 建议配置以下 5 种告警规则,全方位监控任务健康度:
383
+ Configure the following 5 alert rules for comprehensive task health monitoring:
380
384
 
381
- | 告警类型 | 监控事项 | 说明 |
382
- |---------|---------|------|
383
- | 任务 Failover | 多表实时同步作业 failover | 监控任务运行稳定性 |
384
- | 任务停止 | 多表实时同步任务运行失败 | 任务异常停止告警 |
385
- | 单表异常 | 多表实时同步任务目标表变更失败 | Schema Evolution 失败、单字段超 10M 限制等 |
386
- | 端到端延迟 | 多表实时同步延迟 | 数据从源端到目标端的时间间隔 |
387
- | 读取位点延迟 | 多表实时同步读取点位延迟 | 读取位点与源端最新位点的差距 |
385
+ | Alert Type | Monitored Item | Description |
386
+ |-----------|---------------|-------------|
387
+ | Task Failover | Multi-table real-time sync job failover | Monitors task runtime stability |
388
+ | Task Stopped | Multi-table real-time sync task run failure | Alerts on unexpected task stop |
389
+ | Single-table Exception | Multi-table real-time sync target table change failure | Schema Evolution failure, single field exceeding 10M limit, etc. |
390
+ | End-to-end Latency | Multi-table real-time sync latency | Time interval from source to target |
391
+ | Read Position Lag | Multi-table real-time sync read position lag | Gap between read position and source latest position |
388
392
 
389
- 每种告警可额外增加过滤属性(工作空间、任务名称等),不增加过滤则默认监控实例下所有多表实时任务。
393
+ Each alert can have additional filter attributes (workspace, task name, etc.). Without filters, all multi-table real-time tasks under the instance are monitored by default.
390
394
 
391
- ### IM 告警机器人配置
395
+ ### IM Alert Bot Configuration
392
396
 
393
- 1. 在飞书/企业微信中配置群机器人,获取 webhook 地址
394
- 2. 在产品中新增 webhook 配置,渠道选择飞书/企业微信,填写 webhook 地址
395
- 3. 在通知策略中启用 webhook
396
- 4. 在监控规则中选择启用了 webhook 的通知策略
397
+ 1. Configure a group bot in Feishu/WeCom, obtain the webhook URL
398
+ 2. Add a webhook configuration in the product, select Feishu/WeCom as channel, enter the webhook URL
399
+ 3. Enable webhook in the notification policy
400
+ 4. Select the notification policy with webhook enabled in the monitoring rule
397
401
 
398
- ## 示例
402
+ ## Examples
399
403
 
400
- ### 示例 1:MySQL 整库实时同步到 Lakehouse
404
+ ### Example 1: MySQL Full Database Real-time Sync to Lakehouse
401
405
 
402
- 用户说:"把 MySQL ecommerce 数据库整库实时同步到 Lakehouse"
406
+ User says: "Sync the MySQL ecommerce database to Lakehouse in real time"
403
407
 
404
- 操作:
405
- 1. 源端准备:确认 MySQL 已开启 Binlog(`binlog_format=ROW`),创建同步账号并授权 REPLICATION SLAVE、SELECT
406
- 2. `list_data_sources` 找到 MySQL 数据源(ds_type=5)和 Lakehouse 数据源
407
- 3. `create_task(task_type=281, task_name="realtime_sync_ecommerce")` → 获取 studio_url
408
- 4. Studio UI 中:选择整库镜像 → 选择 ecommerce 数据库 → 配置目标 workspace → sync_mode 选全量+增量
409
- 5. `publish_task(...)` 提交,任务立即开始全量初始化,完成后自动切换增量 CDC
408
+ Steps:
409
+ 1. Source preparation: confirm MySQL has Binlog enabled (`binlog_format=ROW`), create sync account with REPLICATION SLAVE and SELECT permissions
410
+ 2. `list_data_sources` to find MySQL data source (ds_type=5) and Lakehouse data source
411
+ 3. `create_task(task_type=281, task_name="realtime_sync_ecommerce")` → get studio_url
412
+ 4. In Studio UI: select full database mirror → select ecommerce database → configure target workspace → sync_mode select full load + incremental
413
+ 5. `publish_task(...)` to submit — task immediately begins full load initialization, then automatically switches to incremental CDC
410
414
 
411
- ### 示例 2:分库分表合并同步
415
+ ### Example 2: Sharded Table Merge Sync
412
416
 
413
- 用户说:"我有 order_0、order_1、order_2 三张分表,要合并同步到一张 orders 表"
417
+ User says: "I have three sharded tables order_0, order_1, order_2 that need to be merged into one orders table"
414
418
 
415
- 操作:
419
+ Steps:
416
420
  1. `create_task(task_type=281, task_name="sync_sharding_orders")`
417
- 2. Studio UI 中:选择多表合并 → 选择 order_0/order_1/order_2 → 目标表设为 orders → 配置扩展字段(如 `__source_table__`)区分来源
418
- 3. `publish_task(...)` 提交
421
+ 2. In Studio UI: select sharded table merge → select order_0/order_1/order_2 → set target table as orders → configure extended fields (e.g., `__source_table__`) to identify source
422
+ 3. `publish_task(...)` to submit
419
423
 
420
- ## 故障排除
424
+ ## Troubleshooting
421
425
 
422
- ### 快速排查表
426
+ ### Quick Reference Table
423
427
 
424
- | 问题 | 排查方向 |
425
- |------|---------|
426
- | `CREATE STORAGE CONNECTION TYPE MYSQL` 报错 | ❌ ClickZetta 不支持 MySQL/PostgreSQL 类型的 Storage Connection。CDC 数据源通过 **Studio UI 数据源管理** 配置,不是 SQL 命令 |
427
- | 任务创建失败 | 检查是否有可用 Sync VCluster |
428
- | 源端连接失败 | 检查 Studio 中数据源配置、网络可达性、账号权限 |
429
- | Binlog 读取失败 | 确认 MySQL `log_bin=ON`、`binlog_format=ROW`、`binlog_row_image=FULL` |
430
- | WAL 读取失败 | 确认 PostgreSQL `wal_level=logical`,slot 未被其他任务占用 |
431
- | Slot 启动冲突 | 不同任务不要复用同一个 slot,检查是否有其他运行中任务占用 |
432
- | 全量同步慢 | 调整最大并发数,检查源端数据库负载,调大内存参数 |
433
- | 增量延迟增大 | 检查 Sync VCluster 资源、源端数据量是否突增 |
434
- | Schema Evolution 异常 | 通过"查看异常"操作查看详情,注意不支持变更字段类型 |
435
- | 分库分表主键冲突 | 开启扩展字段并设为联合主键 |
428
+ | Issue | Investigation |
429
+ |-------|--------------|
430
+ | `CREATE STORAGE CONNECTION TYPE MYSQL` error | ❌ ClickZetta does not support MySQL/PostgreSQL type Storage Connections. CDC data sources are configured via **Studio UI Data Source Management**, not SQL commands |
431
+ | Task creation failed | Check if a Sync VCluster is available |
432
+ | Source connection failed | Check Studio data source configuration, network reachability, account permissions |
433
+ | Binlog read failed | Confirm MySQL `log_bin=ON`, `binlog_format=ROW`, `binlog_row_image=FULL` |
434
+ | WAL read failed | Confirm PostgreSQL `wal_level=logical`, slot not occupied by another task |
435
+ | Slot startup conflict | Different tasks should not reuse the same slot — check if another running task is occupying it |
436
+ | Slow full load | Adjust maximum concurrency, check source database load, increase memory parameters |
437
+ | Increasing incremental latency | Check Sync VCluster resources, whether source data volume has spiked |
438
+ | Schema Evolution exception | Use "View exceptions" to see details — note that column type changes are not supported |
439
+ | Sharded table primary key conflict | Enable extended fields and set as composite primary key |
436
440
 
437
- ### 增量同步失败
441
+ ### Incremental Sync Failures
438
442
 
439
- #### Binlog 位点过期
443
+ #### Binlog Position Expired
440
444
 
441
- - 现象:报错 `The connector is trying to read binlog starting at ... but this is no longer available on the server`
442
- - 原因:指定的 binlog 文件已被 MySQL 定期回收清理,或任务停止时间过长导致位点过期
443
- - 解决:
444
- 1. 在源端执行 `SHOW MASTER STATUS` 查询当前最新 binlog 文件和位点
445
- 2. 使用最新的 file 和 position 重启同步任务(选择"自定义起始位置")
446
- 3. 如需补回丢失数据,对相应表执行"重新同步"
445
+ - Symptom: error `The connector is trying to read binlog starting at ... but this is no longer available on the server`
446
+ - Cause: the specified binlog file has been purged by MySQL periodic cleanup, or the task was stopped too long causing position expiration
447
+ - Resolution:
448
+ 1. Execute `SHOW MASTER STATUS` on source to query current latest binlog file and position
449
+ 2. Restart sync task with the latest file and position (select "Custom start position")
450
+ 3. If lost data needs recovery, execute "Re-sync" for the affected tables
447
451
 
448
- #### Server-id 冲突
452
+ #### Server-id Conflict
449
453
 
450
- - 现象:报错 `A slave with the same server_uuid/server_id as this slave has connected to the master`
451
- - 原因:任务分配的 server-id(范围 5400-6400)与同一数据库上的其他同步工具/任务冲突
452
- - 解决:检查同一数据库实例下是否有其他同步任务或工具正在同步 binlog,重启同步任务
454
+ - Symptom: error `A slave with the same server_uuid/server_id as this slave has connected to the master`
455
+ - Cause: the task's assigned server-id (range 5400-6400) conflicts with another sync tool/task on the same database
456
+ - Resolution: check if other sync tasks or tools are syncing binlog on the same database instance, restart the sync task
453
457
 
454
- #### 数据源时区配置错误
458
+ #### Data Source Timezone Configuration Error
455
459
 
456
- - 现象:报错 `The MySQL server has a timezone offset ... which does not match the configured timezone`
457
- - 原因:数据源中配置的时区(默认 Asia/Shanghai)与数据库实际时区不一致
458
- - 解决:确认数据库配置的时区,修改数据源中的时区配置
460
+ - Symptom: error `The MySQL server has a timezone offset ... which does not match the configured timezone`
461
+ - Cause: the timezone configured in the data source (default Asia/Shanghai) does not match the actual database timezone
462
+ - Resolution: confirm the database's configured timezone, modify the timezone in the data source configuration
459
463
 
460
- #### Binlog 事件 size 超限
464
+ #### Binlog Event Size Exceeded
461
465
 
462
- - 现象:报错 `log event entry exceeded max_allowed_packet`
463
- - 原因:数据库 `max_allowed_packet` 小于 Binlog 中某个事件的 size,或 binlog 文件损坏
464
- - 解决:
465
- 1. 联系 DBA 调大 `max_allowed_packet`(上限 1G),生效后重新同步
466
- 2. 如调整后仍失败(binlog 可能损坏),重启任务选择更新的位点跳过问题位点
467
- 3. 对可能缺少数据的表执行"重新同步"补全
466
+ - Symptom: error `log event entry exceeded max_allowed_packet`
467
+ - Cause: database `max_allowed_packet` is smaller than a binlog event size, or binlog file is corrupted
468
+ - Resolution:
469
+ 1. Contact DBA to increase `max_allowed_packet` (max 1G), re-sync after it takes effect
470
+ 2. If still failing after adjustment (binlog may be corrupted), restart task with a newer position to skip the problematic position
471
+ 3. Execute "Re-sync" for tables that may have missing data
468
472
 
469
- ### 全量同步失败
473
+ ### Full Load Failures
470
474
 
471
- #### PK 长度超限
475
+ #### PK Length Exceeded
472
476
 
473
- - 现象:报错 `Encoded key size 191 exceeds max size 128`
474
- - 原因:源表主键字段总长度超过 128 字节,或多表合并场景中扩展字段联合主键过长
475
- - 解决:在同步任务配置中增加参数调大 PK 长度限制
477
+ - Symptom: error `Encoded key size 191 exceeds max size 128`
478
+ - Cause: source table primary key total field length exceeds 128 bytes, or extended field composite primary key is too long in sharded table merge scenarios
479
+ - Resolution: add a parameter in the sync task configuration to increase the PK length limit
476
480
 
477
- ### 同步任务 Failover
481
+ ### Sync Task Failover
478
482
 
479
- #### Lakehouse Ingestion Service 断连
483
+ #### Disconnected from Lakehouse Ingestion Service
480
484
 
481
- - 现象:Failover 详情中包含 `Async commit for instance ... failed. rpcProxy call hit final failed after max retry reached`
482
- - 原因:通常发生在 Lakehouse 服务端升级期间,连接中断
483
- - 解决:
484
- 1. 服务升级完成后任务通常自动恢复
485
- 2. 如持续 Failover,手动重启任务
486
- 3. 如仍无法恢复,检查 Lakehouse Ingestion Service 健康状态
485
+ - Symptom: failover details contain `Async commit for instance ... failed. rpcProxy call hit final failed after max retry reached`
486
+ - Cause: typically occurs during Lakehouse service upgrades, connection interrupted
487
+ - Resolution:
488
+ 1. Task usually auto-recovers after service upgrade completes
489
+ 2. If failover persists, manually restart the task
490
+ 3. If still unrecoverable, check Lakehouse Ingestion Service health status
487
491
 
488
- #### Binlog 事件反序列化失败
492
+ #### Binlog Event Deserialization Failed
489
493
 
490
- - 现象:Failover 详情中包含 `Failed to deserialize data of EventHeaderV4`
491
- - 原因:源端 binlog 突发大量事件(大量更新/批量删除),写入端反压导致读取端停止消费,binlog client 连接超时中断
492
- - 解决:
493
- 1. 短时间流量增长:任务通常在有限 Failover 次数内自动恢复
494
- 2. 持续出现:调大 MySQL 参数 `slave_net_timeout` 和 `thread_pool_idle_timeout`
495
- 3. 临时调整(重启失效):`SET GLOBAL slave_net_timeout = 120; SET GLOBAL thread_pool_idle_timeout = 120;`
496
- 4. 永久调整:修改 MySQL 配置文件
494
+ - Symptom: failover details contain `Failed to deserialize data of EventHeaderV4`
495
+ - Cause: sudden burst of binlog events from source (mass updates/bulk deletes), write-side backpressure causes read-side to stop consuming, binlog client connection times out
496
+ - Resolution:
497
+ 1. Short-term traffic spike: task usually auto-recovers within limited failover attempts
498
+ 2. Persistent occurrence: increase MySQL parameters `slave_net_timeout` and `thread_pool_idle_timeout`
499
+ 3. Temporary adjustment (lost on restart): `SET GLOBAL slave_net_timeout = 120; SET GLOBAL thread_pool_idle_timeout = 120;`
500
+ 4. Permanent adjustment: modify MySQL configuration file
497
501
 
498
- ### 表进入黑名单
502
+ ### Table Enters Blocklist
499
503
 
500
- #### Schema Evolution 失败
504
+ #### Schema Evolution Failed
501
505
 
502
- - 现象:表状态自动变为停止同步,提示 `pk column different`、`pk column type mismatch`、`invalid modify column`
503
- - 原因:源端表结构发生 Lakehouse 不支持的变更(PK 字段列表变更、PK 字段类型变更、字段类型不兼容修改)
504
- - 解决:
505
- 1. 检查源端表结构,修改为正确的结构
506
- 2. 对停止同步的表执行"重新同步",全量同步完成后增量数据会继续同步
506
+ - Symptom: table status automatically changes to sync stopped, with messages like `pk column different`, `pk column type mismatch`, `invalid modify column`
507
+ - Cause: source table structure changed in a way not supported by Lakehouse (PK column list change, PK column type change, incompatible column type modification)
508
+ - Resolution:
509
+ 1. Check source table structure, correct it to the proper structure
510
+ 2. Execute "Re-sync" for the stopped table — after full load completes, incremental data will continue syncing
507
511
 
508
- ## 已知局限
512
+ ## Known Limitations
509
513
 
510
- - **不支持 SQL 创建 MySQL/PostgreSQL Connection**:`CREATE STORAGE CONNECTION TYPE MYSQL/POSTGRESQL` 会报错 `no connection info factory for connection kind 'STORAGE', type 'mysql'`。CDC 数据源必须通过 Studio UI 数据源管理配置
511
- - Schema Evolution 暂不支持变更字段类型、不支持自动新增表
512
- - 仅支持带主键(PK)字段的表,非 PK 表不支持同步
513
- - 源端不同库表中若存在主键相同的数据,同步结果会异常
514
- - 无特别必要不要手动创建/修改/删除目标表(系统自动管理目标表结构)
515
- - MySQL 不支持的字段类型:`year`(取值不对应)
516
- - PostgreSQL 不支持的字段类型:`varbit`、`bytea`、`TIMETZ`、`interval`、`NAME`(取值不对应),`NUMERIC`、`decimal`(精度不对应,目标端精度更高)
514
+ - **Cannot create MySQL/PostgreSQL Connection via SQL**: `CREATE STORAGE CONNECTION TYPE MYSQL/POSTGRESQL` will error with `no connection info factory for connection kind 'STORAGE', type 'mysql'`. CDC data sources must be configured via Studio UI Data Source Management
515
+ - Schema Evolution does not support column type changes or automatic new table detection
516
+ - Only tables with primary key (PK) fields are supported — non-PK tables cannot be synced
517
+ - If different source databases/tables contain records with the same primary key, sync results will be abnormal
518
+ - Do not manually create/modify/delete target tables unless necessary (the system auto-manages target table structure)
519
+ - MySQL unsupported column types: `year` (value mismatch)
520
+ - PostgreSQL unsupported column types: `varbit`, `bytea`, `TIMETZ`, `interval`, `NAME` (value mismatch), `NUMERIC`, `decimal` (precision mismatch — target has higher precision)
517
521
 
518
522
  ---
519
523
 
520
- ## cz-cli 替代路径
524
+ ## cz-cli Alternative Path
521
525
 
522
- > 仅在 cz-cli 可用且 MCP 不可用时使用本节。步骤编号与上方 MCP 路径对应。
523
- > 所有操作通过 `cz-cli agent run` 委托给内置 agent 完成,agent 内置完整的 Studio MCP 工具访问能力。
526
+ > Use this section only when cz-cli is available and MCP is not. Step numbers correspond to the MCP path above.
527
+ > All operations are delegated to the built-in agent via `cz-cli agent run`, which has full Studio MCP tool access.
524
528
 
525
- ### 快速路径:直接创建任务 + Studio UI 配置
529
+ ### Quick Path: Create Task + Studio UI Configuration
526
530
 
527
531
  ```bash
528
- # 创建 CDC 多表实时同步任务(task_type=281,即 MULTI_REALTIME)
532
+ # Create CDC multi-table real-time sync task (task_type=281, i.e., MULTI_REALTIME)
529
533
  cz-cli task create "cdc_<database>" --type MULTI_REALTIME --folder <folder_name>
530
- # 返回 task_id 和 studio_url,在 studio_url 中完成数据源选择、表映射等配置
534
+ # Returns task_id and studio_url — complete data source selection, table mapping, etc. at studio_url
531
535
 
532
- # 配置完成后发布(CDC 任务无需调度,提交即持续运行)
536
+ # After configuration, deploy (CDC tasks need no scheduling; they run continuously once submitted)
533
537
  cz-cli task deploy "cdc_<database>" -y
534
538
  ```
535
539
 
536
- ### 模式一:整库镜像同步(cz-cli agent 版)
540
+ ### Mode 1: Full Database Mirror Sync (cz-cli agent version)
537
541
 
538
542
  ```bash
539
- # 步骤 1-9 合并:让 agent 完成完整的 CDC 整库同步任务创建
540
- cz-cli agent run "创建 CDC 多表实时同步任务,将 MySQL 数据源 <source_ds_name> 的 <database> 库整库镜像同步到 Lakehouse,使用 Sync VCluster,任务名 cdc_<database>,放在 <folder_name> 文件夹下" \
543
+ # Steps 1-9 combined: let the agent complete the full CDC database sync task creation
544
+ cz-cli agent run "Create a CDC multi-table real-time sync task, mirror the entire <database> database from MySQL data source <source_ds_name> to Lakehouse, use Sync VCluster, task name cdc_<database>, place in <folder_name> folder" \
541
545
  --format a2a --dangerously-skip-permissions
542
546
  ```
543
547
 
544
- 对于需要精细控制的场景,可拆分步骤:
548
+ For scenarios requiring fine-grained control, split into steps:
545
549
 
546
550
  ```bash
547
- # 步骤 1:确认 Sync VCluster 可用
548
- cz-cli agent run "列出所有可用的 VCluster,筛选 vcluster_type 包含 SYNC 的集群,确认有可用的 Sync VCluster" \
551
+ # Step 1: Confirm Sync VCluster availability
552
+ cz-cli agent run "List all available VClusters, filter for clusters where vcluster_type contains SYNC, confirm a Sync VCluster is available" \
549
553
  --format a2a --dangerously-skip-permissions
550
554
 
551
- # 步骤 2:查找数据源
552
- cz-cli agent run "列出所有已配置的数据源,包括 MySQL 类型(ds_type=5)的,记录源端和目标端 Lakehouse 数据源名称" \
555
+ # Step 2: Find data sources
556
+ cz-cli agent run "List all configured data sources, including MySQL type (ds_type=5), record source and target Lakehouse data source names" \
553
557
  --format a2a --dangerously-skip-permissions
554
558
 
555
- # 步骤 3-4:创建并配置 CDC 任务(整库镜像)
556
- cz-cli agent run "创建 CDC 多表实时同步任务(task_type=281),pipeline_type 为整库镜像(3),源端 datasource=<source_ds_name>,同步 <database> 库的所有表,目标 Lakehouse,任务名 cdc_<database>" \
559
+ # Steps 3-4: Create and configure CDC task (full database mirror)
560
+ cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type full database mirror (3), source datasource=<source_ds_name>, sync all tables in <database>, target Lakehouse, task name cdc_<database>" \
557
561
  --format a2a --dangerously-skip-permissions
558
562
 
559
- # 步骤 5:提交部署
560
- cz-cli agent run "提交 CDC 任务 cdc_<database>,使其开始持续运行" \
563
+ # Step 5: Submit and deploy
564
+ cz-cli agent run "Submit CDC task cdc_<database> to start continuous running" \
561
565
  --format a2a --dangerously-skip-permissions
562
566
  ```
563
567
 
564
568
  ---
565
569
 
566
- ### 模式二:多表镜像同步(cz-cli agent 版)
570
+ ### Mode 2: Multi-table Mirror Sync (cz-cli agent version)
567
571
 
568
572
  ```bash
569
- # 创建多表镜像 CDC 任务(指定具体表)
570
- cz-cli agent run "创建 CDC 多表实时同步任务(task_type=281),pipeline_type 为多表镜像(1),源端 datasource=<source_ds_name>,同步 <database> 库中的表 <table1>, <table2>, <table3>,目标 Lakehouse,任务名 cdc_<database>_selected" \
573
+ # Create multi-table mirror CDC task (specify specific tables)
574
+ cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type multi-table mirror (1), source datasource=<source_ds_name>, sync tables <table1>, <table2>, <table3> from <database>, target Lakehouse, task name cdc_<database>_selected" \
571
575
  --format a2a --dangerously-skip-permissions
572
576
  ```
573
577
 
574
578
  ---
575
579
 
576
- ### 模式三:多表合并同步(cz-cli agent 版)
580
+ ### Mode 3: Sharded Table Merge Sync (cz-cli agent version)
577
581
 
578
582
  ```bash
579
- # 创建多表合并 CDC 任务(多源表合并到单目标表)
580
- cz-cli agent run "创建 CDC 多表实时同步任务(task_type=281),pipeline_type 为多表合并(2),源端 datasource=<source_ds_name>,将 <database> 库中的多张表合并同步到 Lakehouse 目标表,任务名 cdc_<database>_merged" \
583
+ # Create sharded table merge CDC task (multiple source tables merged to single target)
584
+ cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type sharded table merge (2), source datasource=<source_ds_name>, merge multiple tables from <database> to Lakehouse target table, task name cdc_<database>_merged" \
581
585
  --format a2a --dangerously-skip-permissions
582
586
  ```
583
587
 
584
588
  ---
585
589
 
586
- ### 运维监控(cz-cli 版)
590
+ ### Operations and Monitoring (cz-cli version)
587
591
 
588
592
  ```bash
589
- # 查看最近运行记录
593
+ # View recent run history
590
594
  cz-cli runs list --task <task_name>
591
595
 
592
- # 查看运行详情
596
+ # View run details
593
597
  cz-cli runs detail <run_id>
594
598
 
595
- # 查看执行日志
599
+ # View execution logs
596
600
  cz-cli attempts log <run_id>
597
601
 
598
- # 下线任务(停止持续运行)
602
+ # Undeploy task (stop continuous running)
599
603
  cz-cli task undeploy <task_name> -y
600
604
  ```
601
605
 
602
606
  ---
603
607
 
604
- ## 交付验收 Checklist
608
+ ## Delivery Acceptance Checklist
605
609
 
606
- CDC 同步任务发布运行后,**必须逐项验证**:
610
+ After the CDC sync task is deployed and running, **verify each item**:
607
611
 
608
612
  ```sql
609
- -- 1. 行数比对:全量阶段完成后,ODS 层行数与源端一致
613
+ -- 1. Row count comparison: after full load phase, ODS layer row count matches source
610
614
  SELECT COUNT(*) FROM <ods_schema>.<table>;
611
615
 
612
- -- 2. 增量验证:写入一条测试数据到源端,确认 Lakehouse 侧同步到位
613
- -- 在源端 MySQL 执行 INSERT,等待 10~30 秒后在 Lakehouse 查询
616
+ -- 2. Incremental verification: insert a test record to source, confirm it syncs to Lakehouse
617
+ -- Execute INSERT on source MySQL, wait 10-30 seconds, then query in Lakehouse
614
618
 
615
- -- 3. 关键字段非空率
619
+ -- 3. Key field non-null rate
616
620
  SELECT
617
621
  COUNT(*) AS total,
618
622
  COUNT(key_field) AS non_null,
619
623
  ROUND(COUNT(key_field) * 100.0 / COUNT(*), 2) AS non_null_pct
620
624
  FROM <ods_schema>.<table>;
621
625
 
622
- -- 4. 检查 _op 字段分布(CDC 接入时)
626
+ -- 4. Check _op field distribution (for CDC ingestion)
623
627
  SELECT _op, COUNT(*) FROM <ods_schema>.<table> GROUP BY _op;
624
- -- 正常应有 I(INSERT)记录,UPDATE/DELETE 场景下有 U/D
628
+ -- Normally there should be I (INSERT) records; UPDATE/DELETE scenarios will also produce U/D records
625
629
  ```
626
630
 
627
- **验收标准:**
628
- - [ ] 全量阶段完成,ODS 层行数与源端一致
629
- - [ ] 增量写入测试数据,Lakehouse 30 秒内同步到位
630
- - [ ] 关键字段非空率符合预期
631
- - [ ] _op 字段分布合理(无异常大量 D 记录)
632
- - [ ] 任务状态为持续运行(RUNNING),无频繁重启
633
- - [ ] 字段类型映射正确(重点检查 BIT/ENUM/TEXT 等异构类型)
631
+ **Acceptance Criteria:**
632
+ - [ ] Full load phase complete, ODS layer row count matches source
633
+ - [ ] Incremental test data written, synced to Lakehouse within 30 seconds
634
+ - [ ] Key field non-null rate meets expectations
635
+ - [ ] _op field distribution is reasonable (no abnormally large number of D records)
636
+ - [ ] Task status is continuously running (RUNNING), no frequent restarts
637
+ - [ ] Column type mapping is correct (pay attention to BIT/ENUM/TEXT and other heterogeneous types)