@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
- package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
- package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
- package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
- package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
- package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
- package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
- package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
- package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
- package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
- package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
- package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
- package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
- package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
- package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
- package/package.json +1 -1
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
|
@@ -1,633 +1,637 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: clickzetta-cdc-sync-pipeline
|
|
3
3
|
description: |
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
"database
|
|
10
|
-
"
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
4
|
+
Create and manage ClickZetta Lakehouse multi-table real-time sync (CDC) tasks, syncing entire MySQL / PostgreSQL
|
|
5
|
+
databases or multiple tables to Lakehouse in real time.
|
|
6
|
+
Supports three sync modes: full database mirror, multi-table mirror, and sharded table merge.
|
|
7
|
+
Based on Binlog (MySQL) or WALs (PostgreSQL) for second-level end-to-end latency, with full load + incremental two-phase sync.
|
|
8
|
+
Triggered when the user says "multi-table real-time sync", "full database sync", "database mirror",
|
|
9
|
+
"CDC full database", "multi-table CDC", "sharded table merge", "MySQL full database sync to Lakehouse",
|
|
10
|
+
"PostgreSQL full database sync", "multi-table realtime sync", "database migration",
|
|
11
|
+
"full load + incremental sync", "sync operations", "sync SOP", "sync alert configuration",
|
|
12
|
+
"Binlog position expired", "server-id conflict", "full re-sync", "add sync table".
|
|
13
|
+
Covers source database preparation (parameter configuration + permissions), three sync mode selection,
|
|
14
|
+
task creation and deployment, operations SOP (full re-sync/add table/data repair),
|
|
15
|
+
monitoring and alerting (5 alert rules + IM webhook), and detailed troubleshooting —
|
|
16
|
+
all ClickZetta Studio specific logic.
|
|
17
|
+
Keywords: CDC, real-time sync, MySQL, PostgreSQL, change data capture, mirror, merge, multi-table
|
|
14
18
|
---
|
|
15
19
|
|
|
16
|
-
#
|
|
20
|
+
# Multi-table Real-time Sync (CDC) Pipeline Workflow
|
|
17
21
|
|
|
18
|
-
##
|
|
22
|
+
## Wizard: Collect Required Information
|
|
19
23
|
|
|
20
|
-
|
|
24
|
+
Before creating a CDC sync task, use an interactive question tool (e.g., `question`) to collect the following information via option menus. If no such tool is available, list all questions in text at once:
|
|
21
25
|
|
|
22
26
|
```
|
|
23
27
|
question({
|
|
24
28
|
questions: [
|
|
25
29
|
{
|
|
26
|
-
question: "
|
|
30
|
+
question: "Source database type?",
|
|
27
31
|
options: [
|
|
28
|
-
{ label: "MySQL", description: "
|
|
29
|
-
{ label: "PostgreSQL", description: "
|
|
32
|
+
{ label: "MySQL", description: "Including Aurora MySQL, PolarDB MySQL — based on Binlog" },
|
|
33
|
+
{ label: "PostgreSQL", description: "Including Aurora PG, PolarDB PG — based on WALs, requires 14+" }
|
|
30
34
|
]
|
|
31
35
|
},
|
|
32
36
|
{
|
|
33
|
-
question: "
|
|
37
|
+
question: "Sync mode?",
|
|
34
38
|
options: [
|
|
35
|
-
{ label: "
|
|
36
|
-
{ label: "
|
|
37
|
-
{ label: "
|
|
39
|
+
{ label: "Full database mirror", description: "Sync entire database, auto-adapts to new tables" },
|
|
40
|
+
{ label: "Multi-table mirror", description: "Specify which tables to sync" },
|
|
41
|
+
{ label: "Sharded table merge", description: "Merge sharded tables into one target table" }
|
|
38
42
|
]
|
|
39
43
|
},
|
|
40
44
|
{
|
|
41
|
-
question: "
|
|
45
|
+
question: "Is the source database already prepared?",
|
|
42
46
|
options: [
|
|
43
|
-
{ label: "
|
|
44
|
-
{ label: "
|
|
47
|
+
{ label: "Ready", description: "MySQL: Binlog enabled, account has REPLICATION permission; PG: wal_level=logical" },
|
|
48
|
+
{ label: "Not sure, help me check", description: "I'll help verify source configuration" }
|
|
45
49
|
]
|
|
46
50
|
}
|
|
47
51
|
]
|
|
48
52
|
})
|
|
49
53
|
```
|
|
50
54
|
|
|
51
|
-
|
|
55
|
+
After collecting the above, also confirm the target schema (e.g., `ods`).
|
|
52
56
|
|
|
53
|
-
|
|
57
|
+
**If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
|
|
54
58
|
|
|
55
|
-
##
|
|
59
|
+
## Applicable Scenarios
|
|
56
60
|
|
|
57
|
-
-
|
|
58
|
-
-
|
|
59
|
-
-
|
|
60
|
-
-
|
|
61
|
-
-
|
|
62
|
-
-
|
|
61
|
+
- Sync entire MySQL / PostgreSQL databases or multiple tables to Lakehouse in real time (CDC change capture)
|
|
62
|
+
- Full database mirror: database-level granularity, auto-adapts to new tables
|
|
63
|
+
- Multi-table mirror: table-level selection, supports automatic schema change detection
|
|
64
|
+
- Sharded table merge: merge sharded table data into a single target table
|
|
65
|
+
- Full load + incremental two-phase sync, second-level end-to-end latency
|
|
66
|
+
- Keywords: multi-table real-time sync, full database sync, CDC, sharded table merge, database migration
|
|
63
67
|
|
|
64
|
-
##
|
|
68
|
+
## Comparison with Other Sync Methods
|
|
65
69
|
|
|
66
|
-
|
|
|
67
|
-
|
|
68
|
-
|
|
|
69
|
-
|
|
|
70
|
-
|
|
|
71
|
-
|
|
|
72
|
-
|
|
|
73
|
-
|
|
|
70
|
+
| Dimension | Multi-table Real-time Sync (This Skill) | Single-table Real-time Sync | Batch Sync |
|
|
71
|
+
|-----------|----------------------------------------|---------------------------|------------|
|
|
72
|
+
| Task Type ID | `281` (multi-table real-time sync) | `28` | `10` / `291` |
|
|
73
|
+
| Sync Granularity | Full database/multi-table/sharded merge | Single table/topic | Single/multi-table |
|
|
74
|
+
| Run Mode | Continuously running (streaming CDC) | Continuously running (streaming) | Scheduled (batch) |
|
|
75
|
+
| Data Sources | MySQL / PostgreSQL | Kafka/MySQL/PG/SQL Server | Multiple |
|
|
76
|
+
| Scheduling | Not required; started manually after submission | Not required | Cron required |
|
|
77
|
+
| Applicable Skill | `clickzetta-cdc-sync-pipeline` | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` |
|
|
74
78
|
|
|
75
|
-
##
|
|
79
|
+
## Supported Data Sources
|
|
76
80
|
|
|
77
|
-
###
|
|
81
|
+
### Source
|
|
78
82
|
|
|
79
|
-
|
|
|
80
|
-
|
|
81
|
-
| MySQL
|
|
82
|
-
| PostgreSQL
|
|
83
|
+
| Data Source Type | Incremental Read Mode | Database Version | ds_type |
|
|
84
|
+
|-----------------|----------------------|-----------------|---------|
|
|
85
|
+
| MySQL (including Aurora MySQL, PolarDB MySQL) | Binlog | 5.6+, 8.x | 5, 39, 19 |
|
|
86
|
+
| PostgreSQL (including Aurora PG, PolarDB PG) | WALs | 14+ | 7, 40, 48 |
|
|
83
87
|
| SQL Server | CDC | - | 8 |
|
|
84
88
|
| TiDB | - | - | 17 |
|
|
85
89
|
|
|
86
|
-
###
|
|
90
|
+
### Target
|
|
87
91
|
|
|
88
|
-
|
|
|
89
|
-
|
|
92
|
+
| Data Source | ds_type |
|
|
93
|
+
|------------|---------|
|
|
90
94
|
| Lakehouse | 1 |
|
|
91
95
|
| Kafka | 2 |
|
|
92
96
|
|
|
93
|
-
##
|
|
97
|
+
## Prerequisites
|
|
94
98
|
|
|
95
|
-
- ClickZetta Lakehouse Studio
|
|
96
|
-
-
|
|
97
|
-
- Sync VCluster
|
|
98
|
-
-
|
|
99
|
-
- **cz-cli
|
|
100
|
-
- **MCP
|
|
99
|
+
- ClickZetta Lakehouse Studio account with permissions to create sync tasks
|
|
100
|
+
- Source data source already configured in Studio (via Studio UI, not SQL Storage Connection), with CDC-required permissions
|
|
101
|
+
- Sync VCluster available (multi-table real-time sync task_type=281 must use a Sync VCluster)
|
|
102
|
+
- **Execution environment (one of the following, cz-cli preferred)**:
|
|
103
|
+
- **cz-cli path**: cz-cli installed (`brew install cz-cli`, or refer to the official docs) and `cz-cli setup` completed
|
|
104
|
+
- **MCP path**: clickzetta-studio-mcp tools available (`create_task`, `save_cdc_realtime_task`, `publish_task`, `list_data_sources`, `LH_show_object_list`, etc.)
|
|
101
105
|
|
|
102
|
-
##
|
|
106
|
+
## Environment Detection (Read Before Execution)
|
|
103
107
|
|
|
104
|
-
|
|
108
|
+
Before starting any operation, determine the current execution environment:
|
|
105
109
|
|
|
106
|
-
|
|
110
|
+
**Step 1: Check if cz-cli is available**
|
|
107
111
|
```bash
|
|
108
112
|
cz-cli --version
|
|
109
113
|
```
|
|
110
|
-
-
|
|
111
|
-
-
|
|
114
|
+
- If command exists → **use cz-cli path** (see "cz-cli Alternative Path" section at the end of this document)
|
|
115
|
+
- If command not found → continue to check MCP
|
|
112
116
|
|
|
113
|
-
|
|
117
|
+
**Step 2: Check if MCP is available (only when cz-cli is unavailable)**
|
|
114
118
|
|
|
115
|
-
|
|
116
|
-
-
|
|
117
|
-
-
|
|
118
|
-
> "
|
|
119
|
-
> cz-cli
|
|
120
|
-
> MCP
|
|
119
|
+
Try calling the `list_data_sources` tool to query the data source list.
|
|
120
|
+
- If tool exists in tool list → **use MCP path** (default path in this document)
|
|
121
|
+
- If tool not found → stop execution and prompt the user:
|
|
122
|
+
> "Neither cz-cli nor MCP tools are available in the current environment. Please install one of them before retrying.
|
|
123
|
+
> cz-cli installation: `brew install cz-cli` (or refer to the official docs), then run `cz-cli setup`
|
|
124
|
+
> MCP installation: refer to clickzetta-studio-mcp configuration docs"
|
|
121
125
|
|
|
122
|
-
> ⚠️
|
|
123
|
-
> - `CREATE STORAGE CONNECTION`
|
|
124
|
-
> - MySQL / PostgreSQL
|
|
125
|
-
> -
|
|
126
|
+
> ⚠️ **Important distinction**: CDC multi-table sync uses **Studio data sources** (configured via Studio UI or API), not SQL `CREATE STORAGE CONNECTION`.
|
|
127
|
+
> - `CREATE STORAGE CONNECTION` only supports object storage types (OSS/COS/S3) and Kafka
|
|
128
|
+
> - MySQL / PostgreSQL relational database connections are configured via **Studio Data Source Management**
|
|
129
|
+
> - Use `list_data_sources` API to view configured data sources
|
|
126
130
|
|
|
127
|
-
##
|
|
131
|
+
## Source Database Preparation
|
|
128
132
|
|
|
129
|
-
### MySQL
|
|
133
|
+
### MySQL Parameter Requirements
|
|
130
134
|
|
|
131
|
-
|
|
135
|
+
Verify the following parameters on the source MySQL database:
|
|
132
136
|
|
|
133
|
-
|
|
|
134
|
-
|
|
137
|
+
| Parameter | Required Value | Query Method |
|
|
138
|
+
|-----------|---------------|--------------|
|
|
135
139
|
| `log_bin` | ON | `SHOW GLOBAL VARIABLES LIKE 'log_bin'` |
|
|
136
140
|
| `binlog_format` | ROW | `SHOW GLOBAL VARIABLES LIKE 'binlog_format'` |
|
|
137
141
|
| `binlog_row_image` | FULL | `SHOW GLOBAL VARIABLES LIKE 'binlog_row_image'` |
|
|
138
|
-
| `binlog_expire_logs_seconds` | ≥86400
|
|
142
|
+
| `binlog_expire_logs_seconds` | ≥86400 (recommended) | - |
|
|
139
143
|
|
|
140
|
-
MySQL
|
|
141
|
-
-
|
|
142
|
-
- Binlog
|
|
143
|
-
-
|
|
144
|
+
MySQL permission requirements (recommend executing as root):
|
|
145
|
+
- Metadata read: `SELECT` on information_schema + target database tables
|
|
146
|
+
- Binlog sync: `REPLICATION SLAVE`, `REPLICATION CLIENT`
|
|
147
|
+
- Full load: `SELECT` on target tables
|
|
144
148
|
|
|
145
|
-
### PostgreSQL
|
|
149
|
+
### PostgreSQL Parameter Requirements
|
|
146
150
|
|
|
147
|
-
|
|
151
|
+
The following parameters require a PostgreSQL server restart after modification:
|
|
148
152
|
|
|
149
|
-
|
|
|
150
|
-
|
|
151
|
-
| `wal_level` | logical |
|
|
152
|
-
| `max_replication_slots` | ≥10 |
|
|
153
|
-
| `max_wal_senders` | ≥10 |
|
|
153
|
+
| Parameter | Required Value | Description |
|
|
154
|
+
|-----------|---------------|-------------|
|
|
155
|
+
| `wal_level` | logical | Enables logical decoding |
|
|
156
|
+
| `max_replication_slots` | ≥10 | Maximum number of slots allowed |
|
|
157
|
+
| `max_wal_senders` | ≥10 | Maximum concurrent WAL sender processes |
|
|
154
158
|
|
|
155
|
-
PostgreSQL
|
|
156
|
-
-
|
|
157
|
-
- WAL
|
|
158
|
-
-
|
|
159
|
-
-
|
|
159
|
+
PostgreSQL permission requirements (recommend executing as admin):
|
|
160
|
+
- Metadata read: `SELECT` on information_schema
|
|
161
|
+
- WAL sync: `REPLICATION` permission
|
|
162
|
+
- Full load: `SELECT` on target tables
|
|
163
|
+
- Create publication: `CREATE` permission
|
|
160
164
|
|
|
161
|
-
> **PostgreSQL
|
|
165
|
+
> **PostgreSQL special note**: A replication slot must be configured, and different tasks must not share the same slot. If the configured slot is already occupied when a task starts, the task will fail to start.
|
|
162
166
|
|
|
163
|
-
##
|
|
167
|
+
## Workflow
|
|
164
168
|
|
|
165
|
-
###
|
|
169
|
+
### Step 1: Confirm Sync VCluster Availability
|
|
166
170
|
|
|
167
171
|
```
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
172
|
+
Use LH_show_object_list (object_type='VCLUSTERS') to view available virtual clusters.
|
|
173
|
+
Filter for clusters where vcluster_type contains SYNC.
|
|
174
|
+
If no Sync VCluster is available, prompt the user to create one before proceeding.
|
|
171
175
|
```
|
|
172
176
|
|
|
173
|
-
###
|
|
177
|
+
### Step 2: Find Source Data Source
|
|
174
178
|
|
|
175
179
|
```
|
|
176
|
-
|
|
177
|
-
|
|
180
|
+
Use list_data_sources to view configured data sources.
|
|
181
|
+
Filter by type:
|
|
178
182
|
- MySQL: ds_type=5
|
|
179
183
|
- PostgreSQL: ds_type=7
|
|
180
|
-
|
|
184
|
+
Record the source datasource_id and datasource_type.
|
|
181
185
|
```
|
|
182
186
|
|
|
183
|
-
###
|
|
187
|
+
### Step 3: Explore Source Data Structure
|
|
184
188
|
|
|
185
189
|
```
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
190
|
+
Use list_namespaces to view the source database list.
|
|
191
|
+
Use list_metadata_objects to view tables under a database.
|
|
192
|
+
Confirm the sync scope (full database / specific tables / sharded tables).
|
|
189
193
|
```
|
|
190
194
|
|
|
191
|
-
###
|
|
195
|
+
### Step 4: Select Sync Mode
|
|
192
196
|
|
|
193
|
-
|
|
197
|
+
Choose one of three modes based on user requirements:
|
|
194
198
|
|
|
195
|
-
|
|
|
196
|
-
|
|
197
|
-
|
|
|
198
|
-
|
|
|
199
|
-
|
|
|
199
|
+
| Mode | pipeline_type | Use Case |
|
|
200
|
+
|------|--------------|----------|
|
|
201
|
+
| Full database mirror | 3 | Sync all tables in a database, auto-adapts to new tables |
|
|
202
|
+
| Multi-table mirror | 1 | Sync selected specific tables, supports automatic schema change detection |
|
|
203
|
+
| Sharded table merge | 2 | Merge sharded table data into a single target table |
|
|
200
204
|
|
|
201
|
-
###
|
|
205
|
+
### Step 5: Create Multi-table Real-time Sync Task
|
|
202
206
|
|
|
203
207
|
```
|
|
204
|
-
|
|
205
|
-
- task_type: 281
|
|
206
|
-
- task_name:
|
|
207
|
-
- data_folder_id:
|
|
208
|
+
Use create_task to create the task:
|
|
209
|
+
- task_type: 281 (multi-table real-time sync)
|
|
210
|
+
- task_name: custom name (e.g., "cdc_sync_mysql_orders_db")
|
|
211
|
+
- data_folder_id: target folder ID (obtainable via list_folders)
|
|
208
212
|
|
|
209
|
-
|
|
213
|
+
Record the returned task_id (i.e., data_file_id).
|
|
210
214
|
```
|
|
211
215
|
|
|
212
|
-
###
|
|
216
|
+
### Step 6: Configure Sync Content
|
|
213
217
|
|
|
214
218
|
```
|
|
215
|
-
|
|
216
|
-
- data_file_id:
|
|
217
|
-
- pipeline_type:
|
|
219
|
+
Use save_cdc_realtime_task to configure sync:
|
|
220
|
+
- data_file_id: task_id returned in Step 5
|
|
221
|
+
- pipeline_type: mode selected in Step 4 (1=multi-table mirror, 2=sharded table merge, 3=full database mirror)
|
|
218
222
|
- source_datasource_list: [{"datasourceId": <id>, "datasourceType": <type>}]
|
|
219
223
|
- sync_object_list:
|
|
220
|
-
-
|
|
221
|
-
-
|
|
222
|
-
-
|
|
224
|
+
- Full database mirror: [{"schemaName": "<database_name>"}] (specify database name only)
|
|
225
|
+
- Multi-table mirror: [{"schemaName": "<db>", "tableName": "<table>"}, ...]
|
|
226
|
+
- Sharded table merge: configure via regex or batch file
|
|
223
227
|
- target_datasource: {"datasourceId": <lakehouse_id>, "datasourceType": 1}
|
|
224
|
-
- sync_mode: 1
|
|
225
|
-
- save_mode: 2
|
|
228
|
+
- sync_mode: 1 (full load + incremental, recommended) or 2 (incremental only)
|
|
229
|
+
- save_mode: 2 (append, recommended for new tasks)
|
|
226
230
|
```
|
|
227
231
|
|
|
228
|
-
> **sync_mode
|
|
229
|
-
> - `1
|
|
230
|
-
> - `2
|
|
232
|
+
> **sync_mode explanation**:
|
|
233
|
+
> - `1` (full load + incremental): full load of historical data first, then starts incremental CDC — recommended for first use
|
|
234
|
+
> - `2` (incremental only): captures changes from current position only — suitable when historical data already exists
|
|
231
235
|
|
|
232
|
-
###
|
|
236
|
+
### Step 7: Submit and Deploy
|
|
233
237
|
|
|
234
238
|
```
|
|
235
|
-
|
|
236
|
-
- task_id:
|
|
237
|
-
- task_version:
|
|
239
|
+
Use publish_task to submit the task:
|
|
240
|
+
- task_id: task ID
|
|
241
|
+
- task_version: current version number (obtainable via get_task_detail)
|
|
238
242
|
|
|
239
|
-
|
|
243
|
+
The task does not start automatically after submission — manual start is required.
|
|
240
244
|
```
|
|
241
245
|
|
|
242
|
-
>
|
|
246
|
+
> **Important**: Multi-table real-time sync tasks are continuously running streaming tasks. No scheduling configuration is needed (do not call save_task_configuration). Start manually in Studio UI after submission.
|
|
243
247
|
|
|
244
|
-
###
|
|
248
|
+
### Step 8: Start the Task
|
|
245
249
|
|
|
246
|
-
|
|
250
|
+
Start the task in Studio UI, selecting the start method:
|
|
247
251
|
|
|
248
|
-
|
|
|
249
|
-
|
|
250
|
-
|
|
|
251
|
-
|
|
|
252
|
-
|
|
|
252
|
+
| Start Method | Description | Use Case |
|
|
253
|
+
|-------------|-------------|----------|
|
|
254
|
+
| Stateless start | Full sync of all data (full load → incremental) | First start |
|
|
255
|
+
| Resume from last saved state | Resume from the stop position | Restart after stop |
|
|
256
|
+
| Custom start position | MySQL: specify binlog file/time; PG: specify LSN | Data re-sync |
|
|
253
257
|
|
|
254
|
-
|
|
258
|
+
During the full load phase, you can configure maximum concurrency to control pressure on the source database.
|
|
255
259
|
|
|
256
|
-
###
|
|
260
|
+
### Step 9: Operations and Monitoring
|
|
257
261
|
|
|
258
262
|
```
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
-
|
|
263
|
-
-
|
|
264
|
-
- Failover
|
|
265
|
-
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
-
|
|
269
|
-
-
|
|
270
|
-
-
|
|
271
|
-
-
|
|
272
|
-
-
|
|
263
|
+
After starting, the task goes through three phases: Initialization → Full Load → Incremental Sync.
|
|
264
|
+
|
|
265
|
+
Monitoring metrics:
|
|
266
|
+
- Data read / data written (record count)
|
|
267
|
+
- Average read rate / average write rate
|
|
268
|
+
- Failover count
|
|
269
|
+
- Per-table level: latest read position, latest update time, data latency
|
|
270
|
+
|
|
271
|
+
Per-table operations:
|
|
272
|
+
- Priority execution: increase full load priority for a table
|
|
273
|
+
- Cancel run / force stop: stop sync for a single table
|
|
274
|
+
- Re-sync: perform full load + incremental again for that table
|
|
275
|
+
- Backfill sync: re-sync partial data based on filter conditions
|
|
276
|
+
- View exceptions: view Schema Evolution exceptions, etc.
|
|
273
277
|
```
|
|
274
278
|
|
|
275
|
-
##
|
|
279
|
+
## Three Sync Modes in Detail
|
|
276
280
|
|
|
277
|
-
###
|
|
281
|
+
### Full Database Mirror
|
|
278
282
|
|
|
279
|
-
-
|
|
280
|
-
-
|
|
281
|
-
-
|
|
283
|
+
- Configured at database granularity — select database only, not individual tables
|
|
284
|
+
- Auto-adapts to new tables added to the database
|
|
285
|
+
- Suitable for scenarios requiring a complete mirror of an entire database
|
|
282
286
|
|
|
283
|
-
###
|
|
287
|
+
### Multi-table Mirror
|
|
284
288
|
|
|
285
|
-
-
|
|
286
|
-
-
|
|
287
|
-
-
|
|
288
|
-
- PostgreSQL
|
|
289
|
+
- Select specific tables to sync at table granularity
|
|
290
|
+
- Supports automatic detection of column additions and deletions
|
|
291
|
+
- Supports batch configuration (upload configuration file)
|
|
292
|
+
- PostgreSQL requires replication slot configuration (decoderbufs or pgoutput plugin)
|
|
289
293
|
|
|
290
|
-
###
|
|
294
|
+
### Sharded Table Merge
|
|
291
295
|
|
|
292
|
-
-
|
|
293
|
-
-
|
|
294
|
-
-
|
|
295
|
-
-
|
|
296
|
-
-
|
|
297
|
-
-
|
|
298
|
-
-
|
|
299
|
-
-
|
|
296
|
+
- Merges sharded table data into a single target table
|
|
297
|
+
- Uses "virtual tables" as an intermediate layer: when creating a virtual table, define filter conditions based on data source/schema/table names to map matching source tables to the same virtual table
|
|
298
|
+
- Two configuration methods:
|
|
299
|
+
- Rule-based: regex matching to filter tables (e.g., all tables starting with `abc`)
|
|
300
|
+
- File-based: upload configuration file for batch specification
|
|
301
|
+
- Extended fields feature: add extra columns to the target table to record source information (server/database/schema/table names)
|
|
302
|
+
- Sharded table primary key conflict resolution: enable extended fields and set them as composite primary key to avoid write conflicts from records with the same primary key across different shards
|
|
303
|
+
- Heterogeneous column merge: when sharded tables have inconsistent column structures, the system automatically validates and reports differences — use the heterogeneous column merge feature to handle this
|
|
300
304
|
|
|
301
|
-
##
|
|
305
|
+
## Advanced Parameters
|
|
302
306
|
|
|
303
|
-
|
|
307
|
+
The following advanced parameters can be set in the task "Parameters" area (not recommended to adjust by default — contact technical support before adjusting):
|
|
304
308
|
|
|
305
|
-
|
|
|
306
|
-
|
|
307
|
-
| `step1.taskmanager.memory.process.size` |
|
|
308
|
-
| `step2.taskmanager.memory.process.size` |
|
|
309
|
-
| `step1.taskmanager.memory.task.off-heap.size` |
|
|
310
|
-
| `lh.table.cz.common.output.file.max.size` |
|
|
311
|
-
| `pod.limit.memory` |
|
|
309
|
+
| Parameter | Description | Default | Tuning Advice |
|
|
310
|
+
|-----------|-------------|---------|---------------|
|
|
311
|
+
| `step1.taskmanager.memory.process.size` | Incremental sync process total memory | 1600m | Increase to 4000m for very large full loads |
|
|
312
|
+
| `step2.taskmanager.memory.process.size` | Full load process total memory | 2000m | - |
|
|
313
|
+
| `step1.taskmanager.memory.task.off-heap.size` | Incremental sync off-heap memory | 256m | Increase to 500m for very large full loads |
|
|
314
|
+
| `lh.table.cz.common.output.file.max.size` | Full load single file split size | 33554432 | - |
|
|
315
|
+
| `pod.limit.memory` | Submit client memory limit | 1Gi | - |
|
|
312
316
|
|
|
313
|
-
##
|
|
317
|
+
## Stop and Undeploy
|
|
314
318
|
|
|
315
|
-
###
|
|
319
|
+
### Stop Task
|
|
316
320
|
|
|
317
|
-
-
|
|
318
|
-
-
|
|
319
|
-
-
|
|
320
|
-
-
|
|
321
|
-
-
|
|
321
|
+
- Stopping automatically saves the incremental sync position
|
|
322
|
+
- Stop during full load phase: incomplete tables will re-sync from full load on restart
|
|
323
|
+
- Stop during incremental phase: resumes from stop position on restart
|
|
324
|
+
- Recovery: click "Start", select "Resume from last saved state" for checkpoint recovery
|
|
325
|
+
- To backtrack data: select "Custom start position", specify binlog file/position (MySQL) or LSN (PostgreSQL) — ensure the specified position has not expired
|
|
322
326
|
|
|
323
|
-
###
|
|
327
|
+
### Undeploy Task (High Risk)
|
|
324
328
|
|
|
325
|
-
-
|
|
326
|
-
-
|
|
327
|
-
-
|
|
328
|
-
-
|
|
329
|
+
- Does not save sync position — re-deployment requires full re-sync
|
|
330
|
+
- Does not clean up data already synced to target, does not delete target tables
|
|
331
|
+
- Re-sync does not recreate tables: full load uses insert overwrite, incremental uses merge into
|
|
332
|
+
- Use only when: the task is definitively no longer needed, or task state is abnormal and needs repair
|
|
329
333
|
|
|
330
|
-
##
|
|
334
|
+
## Operations SOP
|
|
331
335
|
|
|
332
|
-
###
|
|
336
|
+
### Supplementary Full Load After Initial Start
|
|
333
337
|
|
|
334
|
-
|
|
338
|
+
Three approaches when full load was not selected at first start but historical data is needed later:
|
|
335
339
|
|
|
336
|
-
|
|
|
337
|
-
|
|
338
|
-
|
|
|
339
|
-
|
|
|
340
|
-
|
|
|
340
|
+
| Approach | Operation | Impact |
|
|
341
|
+
|----------|-----------|--------|
|
|
342
|
+
| Approach 1: Single-table re-sync | Execute "Re-sync" for the specified table | Source data synced to temp table, insert overwrite to target table, no query impact |
|
|
343
|
+
| Approach 2: Single-table backfill | Execute "Backfill sync" for the specified table, filter condition set to `where 1=1` | Data pulled from source to temp table based on condition, delete + merge into target table |
|
|
344
|
+
| Approach 3: Undeploy and redeploy | Stop → Undeploy → Deploy → Start (select full load) | Clears position info, full load + incremental re-sync, does not delete target tables |
|
|
341
345
|
|
|
342
|
-
###
|
|
346
|
+
### Add Sync Tables
|
|
343
347
|
|
|
344
|
-
1.
|
|
345
|
-
2.
|
|
346
|
-
3.
|
|
347
|
-
4.
|
|
348
|
-
5.
|
|
348
|
+
1. Edit the task, add the tables to sync, save
|
|
349
|
+
2. Submit task for deployment
|
|
350
|
+
3. Stop the task in Operations Center, then restart
|
|
351
|
+
4. After restart, new tables are automatically synced (full load if configured, otherwise incremental only)
|
|
352
|
+
5. Does not affect sync progress of existing tables
|
|
349
353
|
|
|
350
|
-
###
|
|
354
|
+
### Add/Remove Data Sources/Schemas/Tables for Sharded Tables
|
|
351
355
|
|
|
352
|
-
-
|
|
353
|
-
-
|
|
354
|
-
-
|
|
355
|
-
-
|
|
356
|
+
- Edit directly in the task development interface
|
|
357
|
+
- Save → Submit → Restart task to take effect
|
|
358
|
+
- New objects will automatically execute full load if configured
|
|
359
|
+
- Does not affect sync progress of existing tables
|
|
356
360
|
|
|
357
|
-
###
|
|
361
|
+
### Priority Sync for Important Tables
|
|
358
362
|
|
|
359
|
-
-
|
|
360
|
-
-
|
|
363
|
+
- During full load phase, use "Priority execution" for important tables
|
|
364
|
+
- Jumps the queue in the resource pool to prioritize full load for that table
|
|
361
365
|
|
|
362
|
-
###
|
|
366
|
+
### Pause/Resume Single-table Incremental Sync
|
|
363
367
|
|
|
364
|
-
-
|
|
365
|
-
-
|
|
366
|
-
-
|
|
368
|
+
- Pause: execute "Stop incremental sync" for a table to pause change message consumption
|
|
369
|
+
- Resume: execute "Resume incremental sync" — to ensure data continuity, a full load from source is performed
|
|
370
|
+
- Use case: during sudden high traffic from source, pause less important tables to free processing resources for important ones
|
|
367
371
|
|
|
368
|
-
###
|
|
372
|
+
### Single-table Data Repair
|
|
369
373
|
|
|
370
|
-
|
|
|
371
|
-
|
|
372
|
-
|
|
|
373
|
-
|
|
|
374
|
+
| Operation | Description | Write Method |
|
|
375
|
+
|-----------|-------------|--------------|
|
|
376
|
+
| Re-sync | Re-sync full source table data | Sync to temp table → insert overwrite to target table |
|
|
377
|
+
| Backfill sync | Pull partial/full data from source based on filter conditions | Sync to temp table → delete related data from target → merge into target |
|
|
374
378
|
|
|
375
|
-
##
|
|
379
|
+
## Monitoring and Alerting Configuration
|
|
376
380
|
|
|
377
|
-
###
|
|
381
|
+
### Recommended Alert Rules
|
|
378
382
|
|
|
379
|
-
|
|
383
|
+
Configure the following 5 alert rules for comprehensive task health monitoring:
|
|
380
384
|
|
|
381
|
-
|
|
|
382
|
-
|
|
383
|
-
|
|
|
384
|
-
|
|
|
385
|
-
|
|
|
386
|
-
|
|
|
387
|
-
|
|
|
385
|
+
| Alert Type | Monitored Item | Description |
|
|
386
|
+
|-----------|---------------|-------------|
|
|
387
|
+
| Task Failover | Multi-table real-time sync job failover | Monitors task runtime stability |
|
|
388
|
+
| Task Stopped | Multi-table real-time sync task run failure | Alerts on unexpected task stop |
|
|
389
|
+
| Single-table Exception | Multi-table real-time sync target table change failure | Schema Evolution failure, single field exceeding 10M limit, etc. |
|
|
390
|
+
| End-to-end Latency | Multi-table real-time sync latency | Time interval from source to target |
|
|
391
|
+
| Read Position Lag | Multi-table real-time sync read position lag | Gap between read position and source latest position |
|
|
388
392
|
|
|
389
|
-
|
|
393
|
+
Each alert can have additional filter attributes (workspace, task name, etc.). Without filters, all multi-table real-time tasks under the instance are monitored by default.
|
|
390
394
|
|
|
391
|
-
### IM
|
|
395
|
+
### IM Alert Bot Configuration
|
|
392
396
|
|
|
393
|
-
1.
|
|
394
|
-
2.
|
|
395
|
-
3.
|
|
396
|
-
4.
|
|
397
|
+
1. Configure a group bot in Feishu/WeCom, obtain the webhook URL
|
|
398
|
+
2. Add a webhook configuration in the product, select Feishu/WeCom as channel, enter the webhook URL
|
|
399
|
+
3. Enable webhook in the notification policy
|
|
400
|
+
4. Select the notification policy with webhook enabled in the monitoring rule
|
|
397
401
|
|
|
398
|
-
##
|
|
402
|
+
## Examples
|
|
399
403
|
|
|
400
|
-
###
|
|
404
|
+
### Example 1: MySQL Full Database Real-time Sync to Lakehouse
|
|
401
405
|
|
|
402
|
-
|
|
406
|
+
User says: "Sync the MySQL ecommerce database to Lakehouse in real time"
|
|
403
407
|
|
|
404
|
-
|
|
405
|
-
1.
|
|
406
|
-
2. `list_data_sources`
|
|
407
|
-
3. `create_task(task_type=281, task_name="realtime_sync_ecommerce")` →
|
|
408
|
-
4.
|
|
409
|
-
5. `publish_task(...)`
|
|
408
|
+
Steps:
|
|
409
|
+
1. Source preparation: confirm MySQL has Binlog enabled (`binlog_format=ROW`), create sync account with REPLICATION SLAVE and SELECT permissions
|
|
410
|
+
2. `list_data_sources` to find MySQL data source (ds_type=5) and Lakehouse data source
|
|
411
|
+
3. `create_task(task_type=281, task_name="realtime_sync_ecommerce")` → get studio_url
|
|
412
|
+
4. In Studio UI: select full database mirror → select the ecommerce database → configure the target workspace → set sync_mode to full load + incremental
|
|
413
|
+
5. `publish_task(...)` to submit — task immediately begins full load initialization, then automatically switches to incremental CDC
|
|
410
414
|
|
|
411
|
-
###
|
|
415
|
+
### Example 2: Sharded Table Merge Sync
|
|
412
416
|
|
|
413
|
-
|
|
417
|
+
User says: "I have three sharded tables order_0, order_1, order_2 that need to be merged into one orders table"
|
|
414
418
|
|
|
415
|
-
|
|
419
|
+
Steps:
|
|
416
420
|
1. `create_task(task_type=281, task_name="sync_sharding_orders")`
|
|
417
|
-
2.
|
|
418
|
-
3. `publish_task(...)`
|
|
421
|
+
2. In Studio UI: select sharded table merge → select order_0/order_1/order_2 → set target table as orders → configure extended fields (e.g., `__source_table__`) to identify source
|
|
422
|
+
3. `publish_task(...)` to submit
|
|
419
423
|
|
|
420
|
-
##
|
|
424
|
+
## Troubleshooting
|
|
421
425
|
|
|
422
|
-
###
|
|
426
|
+
### Quick Reference Table
|
|
423
427
|
|
|
424
|
-
|
|
|
425
|
-
|
|
426
|
-
| `CREATE STORAGE CONNECTION TYPE MYSQL`
|
|
427
|
-
|
|
|
428
|
-
|
|
|
429
|
-
| Binlog
|
|
430
|
-
| WAL
|
|
431
|
-
| Slot
|
|
432
|
-
|
|
|
433
|
-
|
|
|
434
|
-
| Schema Evolution
|
|
435
|
-
|
|
|
428
|
+
| Issue | Investigation |
|
|
429
|
+
|-------|--------------|
|
|
430
|
+
| `CREATE STORAGE CONNECTION TYPE MYSQL` error | ❌ ClickZetta does not support MySQL/PostgreSQL type Storage Connections. CDC data sources are configured via **Studio UI Data Source Management**, not SQL commands |
|
|
431
|
+
| Task creation failed | Check if a Sync VCluster is available |
|
|
432
|
+
| Source connection failed | Check Studio data source configuration, network reachability, account permissions |
|
|
433
|
+
| Binlog read failed | Confirm MySQL `log_bin=ON`, `binlog_format=ROW`, `binlog_row_image=FULL` |
|
|
434
|
+
| WAL read failed | Confirm PostgreSQL `wal_level=logical`, slot not occupied by another task |
|
|
435
|
+
| Slot startup conflict | Different tasks should not reuse the same slot — check if another running task is occupying it |
|
|
436
|
+
| Slow full load | Adjust maximum concurrency, check source database load, increase memory parameters |
|
|
437
|
+
| Increasing incremental latency | Check Sync VCluster resources, whether source data volume has spiked |
|
|
438
|
+
| Schema Evolution exception | Use "View exceptions" to see details — note that column type changes are not supported |
|
|
439
|
+
| Sharded table primary key conflict | Enable extended fields and set as composite primary key |
|
|
436
440
|
|
|
437
|
-
###
|
|
441
|
+
### Incremental Sync Failures
|
|
438
442
|
|
|
439
|
-
#### Binlog
|
|
443
|
+
#### Binlog Position Expired
|
|
440
444
|
|
|
441
|
-
-
|
|
442
|
-
-
|
|
443
|
-
-
|
|
444
|
-
1.
|
|
445
|
-
2.
|
|
446
|
-
3.
|
|
445
|
+
- Symptom: error `The connector is trying to read binlog starting at ... but this is no longer available on the server`
|
|
446
|
+
- Cause: the specified binlog file has been purged by MySQL's periodic cleanup, or the task was stopped for so long that its saved position expired
|
|
447
|
+
- Resolution:
|
|
448
|
+
1. Execute `SHOW MASTER STATUS` on the source (on MySQL 8.4+, where that statement was removed, use `SHOW BINARY LOG STATUS`) to query the current latest binlog file and position
|
|
449
|
+
2. Restart sync task with the latest file and position (select "Custom start position")
|
|
450
|
+
3. If lost data needs recovery, execute "Re-sync" for the affected tables
|
|
447
451
|
|
|
448
|
-
#### Server-id
|
|
452
|
+
#### Server-id Conflict
|
|
449
453
|
|
|
450
|
-
-
|
|
451
|
-
-
|
|
452
|
-
-
|
|
454
|
+
- Symptom: error `A slave with the same server_uuid/server_id as this slave has connected to the master`
|
|
455
|
+
- Cause: the task's assigned server-id (range 5400-6400) conflicts with another sync tool/task on the same database
|
|
456
|
+
- Resolution: check whether other sync tasks or tools are reading binlog from the same database instance, then restart the sync task
|
|
453
457
|
|
|
454
|
-
####
|
|
458
|
+
#### Data Source Timezone Configuration Error
|
|
455
459
|
|
|
456
|
-
-
|
|
457
|
-
-
|
|
458
|
-
-
|
|
460
|
+
- Symptom: error `The MySQL server has a timezone offset ... which does not match the configured timezone`
|
|
461
|
+
- Cause: the timezone configured in the data source (default Asia/Shanghai) does not match the actual database timezone
|
|
462
|
+
- Resolution: confirm the database's configured timezone, modify the timezone in the data source configuration
|
|
459
463
|
|
|
460
|
-
#### Binlog
|
|
464
|
+
#### Binlog Event Size Exceeded
|
|
461
465
|
|
|
462
|
-
-
|
|
463
|
-
-
|
|
464
|
-
-
|
|
465
|
-
1.
|
|
466
|
-
2.
|
|
467
|
-
3.
|
|
466
|
+
- Symptom: error `log event entry exceeded max_allowed_packet`
|
|
467
|
+
- Cause: database `max_allowed_packet` is smaller than a binlog event size, or binlog file is corrupted
|
|
468
|
+
- Resolution:
|
|
469
|
+
1. Contact the DBA to increase `max_allowed_packet` (the maximum allowed value is 1 GB), then re-sync after the change takes effect
|
|
470
|
+
2. If still failing after adjustment (binlog may be corrupted), restart task with a newer position to skip the problematic position
|
|
471
|
+
3. Execute "Re-sync" for tables that may have missing data
|
|
468
472
|
|
|
469
|
-
###
|
|
473
|
+
### Full Load Failures
|
|
470
474
|
|
|
471
|
-
#### PK
|
|
475
|
+
#### PK Length Exceeded
|
|
472
476
|
|
|
473
|
-
-
|
|
474
|
-
-
|
|
475
|
-
-
|
|
477
|
+
- Symptom: error `Encoded key size 191 exceeds max size 128`
|
|
478
|
+
- Cause: source table primary key total field length exceeds 128 bytes, or extended field composite primary key is too long in sharded table merge scenarios
|
|
479
|
+
- Resolution: add a parameter in the sync task configuration to increase the PK length limit
|
|
476
480
|
|
|
477
|
-
###
|
|
481
|
+
### Sync Task Failover
|
|
478
482
|
|
|
479
|
-
####
|
|
483
|
+
#### Disconnected from Lakehouse Ingestion Service
|
|
480
484
|
|
|
481
|
-
-
|
|
482
|
-
-
|
|
483
|
-
-
|
|
484
|
-
1.
|
|
485
|
-
2.
|
|
486
|
-
3.
|
|
485
|
+
- Symptom: failover details contain `Async commit for instance ... failed. rpcProxy call hit final failed after max retry reached`
|
|
486
|
+
- Cause: typically occurs during Lakehouse service upgrades, connection interrupted
|
|
487
|
+
- Resolution:
|
|
488
|
+
1. Task usually auto-recovers after service upgrade completes
|
|
489
|
+
2. If failover persists, manually restart the task
|
|
490
|
+
3. If still unrecoverable, check Lakehouse Ingestion Service health status
|
|
487
491
|
|
|
488
|
-
#### Binlog
|
|
492
|
+
#### Binlog Event Deserialization Failed
|
|
489
493
|
|
|
490
|
-
-
|
|
491
|
-
-
|
|
492
|
-
-
|
|
493
|
-
1.
|
|
494
|
-
2.
|
|
495
|
-
3.
|
|
496
|
-
4.
|
|
494
|
+
- Symptom: failover details contain `Failed to deserialize data of EventHeaderV4`
|
|
495
|
+
- Cause: a sudden burst of binlog events from the source (mass updates/bulk deletes) creates write-side backpressure, so the read side stops consuming and the binlog client connection times out
|
|
496
|
+
- Resolution:
|
|
497
|
+
1. Short-term traffic spike: task usually auto-recovers within limited failover attempts
|
|
498
|
+
2. Persistent occurrence: increase MySQL parameters `slave_net_timeout` and `thread_pool_idle_timeout`
|
|
499
|
+
3. Temporary adjustment (lost on restart): `SET GLOBAL slave_net_timeout = 120; SET GLOBAL thread_pool_idle_timeout = 120;`
|
|
500
|
+
4. Permanent adjustment: modify MySQL configuration file
|
|
497
501
|
|
|
498
|
-
###
|
|
502
|
+
### Table Enters Blocklist
|
|
499
503
|
|
|
500
|
-
#### Schema Evolution
|
|
504
|
+
#### Schema Evolution Failed
|
|
501
505
|
|
|
502
|
-
-
|
|
503
|
-
-
|
|
504
|
-
-
|
|
505
|
-
1.
|
|
506
|
-
2.
|
|
506
|
+
- Symptom: table status automatically changes to sync stopped, with messages like `pk column different`, `pk column type mismatch`, `invalid modify column`
|
|
507
|
+
- Cause: source table structure changed in a way not supported by Lakehouse (PK column list change, PK column type change, incompatible column type modification)
|
|
508
|
+
- Resolution:
|
|
509
|
+
1. Check source table structure, correct it to the proper structure
|
|
510
|
+
2. Execute "Re-sync" for the stopped table — after full load completes, incremental data will continue syncing
|
|
507
511
|
|
|
508
|
-
##
|
|
512
|
+
## Known Limitations
|
|
509
513
|
|
|
510
|
-
-
|
|
511
|
-
- Schema Evolution
|
|
512
|
-
-
|
|
513
|
-
-
|
|
514
|
-
-
|
|
515
|
-
- MySQL
|
|
516
|
-
- PostgreSQL
|
|
514
|
+
- **Cannot create MySQL/PostgreSQL Connection via SQL**: `CREATE STORAGE CONNECTION TYPE MYSQL/POSTGRESQL` will error with `no connection info factory for connection kind 'STORAGE', type 'mysql'`. CDC data sources must be configured via Studio UI Data Source Management
|
|
515
|
+
- Schema Evolution does not support column type changes or automatic new table detection
|
|
516
|
+
- Only tables with primary key (PK) fields are supported — non-PK tables cannot be synced
|
|
517
|
+
- If different source databases/tables contain records with the same primary key, sync results will be abnormal
|
|
518
|
+
- Do not manually create/modify/delete target tables unless necessary (the system auto-manages target table structure)
|
|
519
|
+
- MySQL unsupported column types: `year` (value mismatch)
|
|
520
|
+
- PostgreSQL unsupported column types: `varbit`, `bytea`, `TIMETZ`, `interval`, `NAME` (value mismatch), `NUMERIC`, `decimal` (precision mismatch — target has higher precision)
|
|
517
521
|
|
|
518
522
|
---
|
|
519
523
|
|
|
520
|
-
## cz-cli
|
|
524
|
+
## cz-cli Alternative Path
|
|
521
525
|
|
|
522
|
-
>
|
|
523
|
-
>
|
|
526
|
+
> Use this section only when cz-cli is available and MCP is not. Step numbers correspond to the MCP path above.
|
|
527
|
+
> All operations are delegated to the built-in agent via `cz-cli agent run`, which has full Studio MCP tool access.
|
|
524
528
|
|
|
525
|
-
###
|
|
529
|
+
### Quick Path: Create Task + Studio UI Configuration
|
|
526
530
|
|
|
527
531
|
```bash
|
|
528
|
-
#
|
|
532
|
+
# Create CDC multi-table real-time sync task (task_type=281, i.e., MULTI_REALTIME)
|
|
529
533
|
cz-cli task create "cdc_<database>" --type MULTI_REALTIME --folder <folder_name>
|
|
530
|
-
#
|
|
534
|
+
# Returns task_id and studio_url — complete data source selection, table mapping, etc. at studio_url
|
|
531
535
|
|
|
532
|
-
#
|
|
536
|
+
# After configuration, deploy the task (CDC tasks need no scheduling; they run continuously once submitted)
|
|
533
537
|
cz-cli task deploy "cdc_<database>" -y
|
|
534
538
|
```
|
|
535
539
|
|
|
536
|
-
###
|
|
540
|
+
### Mode 1: Full Database Mirror Sync (cz-cli agent version)
|
|
537
541
|
|
|
538
542
|
```bash
|
|
539
|
-
#
|
|
540
|
-
cz-cli agent run "
|
|
543
|
+
# Steps 1-9 combined: let the agent complete the full CDC database sync task creation
|
|
544
|
+
cz-cli agent run "Create a CDC multi-table real-time sync task, mirror the entire <database> database from MySQL data source <source_ds_name> to Lakehouse, use Sync VCluster, task name cdc_<database>, place in <folder_name> folder" \
|
|
541
545
|
--format a2a --dangerously-skip-permissions
|
|
542
546
|
```
|
|
543
547
|
|
|
544
|
-
|
|
548
|
+
For scenarios requiring fine-grained control, split into steps:
|
|
545
549
|
|
|
546
550
|
```bash
|
|
547
|
-
#
|
|
548
|
-
cz-cli agent run "
|
|
551
|
+
# Step 1: Confirm Sync VCluster availability
|
|
552
|
+
cz-cli agent run "List all available VClusters, filter for clusters where vcluster_type contains SYNC, confirm a Sync VCluster is available" \
|
|
549
553
|
--format a2a --dangerously-skip-permissions
|
|
550
554
|
|
|
551
|
-
#
|
|
552
|
-
cz-cli agent run "
|
|
555
|
+
# Step 2: Find data sources
|
|
556
|
+
cz-cli agent run "List all configured data sources, including MySQL type (ds_type=5), record source and target Lakehouse data source names" \
|
|
553
557
|
--format a2a --dangerously-skip-permissions
|
|
554
558
|
|
|
555
|
-
#
|
|
556
|
-
cz-cli agent run "
|
|
559
|
+
# Steps 3-4: Create and configure CDC task (full database mirror)
|
|
560
|
+
cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type full database mirror (3), source datasource=<source_ds_name>, sync all tables in <database>, target Lakehouse, task name cdc_<database>" \
|
|
557
561
|
--format a2a --dangerously-skip-permissions
|
|
558
562
|
|
|
559
|
-
#
|
|
560
|
-
cz-cli agent run "
|
|
563
|
+
# Step 5: Submit and deploy
|
|
564
|
+
cz-cli agent run "Submit CDC task cdc_<database> to start continuous running" \
|
|
561
565
|
--format a2a --dangerously-skip-permissions
|
|
562
566
|
```
|
|
563
567
|
|
|
564
568
|
---
|
|
565
569
|
|
|
566
|
-
###
|
|
570
|
+
### Mode 2: Multi-table Mirror Sync (cz-cli agent version)
|
|
567
571
|
|
|
568
572
|
```bash
|
|
569
|
-
#
|
|
570
|
-
cz-cli agent run "
|
|
573
|
+
# Create multi-table mirror CDC task (specify specific tables)
|
|
574
|
+
cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type multi-table mirror (1), source datasource=<source_ds_name>, sync tables <table1>, <table2>, <table3> from <database>, target Lakehouse, task name cdc_<database>_selected" \
|
|
571
575
|
--format a2a --dangerously-skip-permissions
|
|
572
576
|
```
|
|
573
577
|
|
|
574
578
|
---
|
|
575
579
|
|
|
576
|
-
###
|
|
580
|
+
### Mode 3: Sharded Table Merge Sync (cz-cli agent version)
|
|
577
581
|
|
|
578
582
|
```bash
|
|
579
|
-
#
|
|
580
|
-
cz-cli agent run "
|
|
583
|
+
# Create sharded table merge CDC task (multiple source tables merged to single target)
|
|
584
|
+
cz-cli agent run "Create a CDC multi-table real-time sync task (task_type=281), pipeline_type sharded table merge (2), source datasource=<source_ds_name>, merge multiple tables from <database> to Lakehouse target table, task name cdc_<database>_merged" \
|
|
581
585
|
--format a2a --dangerously-skip-permissions
|
|
582
586
|
```
|
|
583
587
|
|
|
584
588
|
---
|
|
585
589
|
|
|
586
|
-
###
|
|
590
|
+
### Operations and Monitoring (cz-cli version)
|
|
587
591
|
|
|
588
592
|
```bash
|
|
589
|
-
#
|
|
593
|
+
# View recent run history
|
|
590
594
|
cz-cli runs list --task <task_name>
|
|
591
595
|
|
|
592
|
-
#
|
|
596
|
+
# View run details
|
|
593
597
|
cz-cli runs detail <run_id>
|
|
594
598
|
|
|
595
|
-
#
|
|
599
|
+
# View execution logs
|
|
596
600
|
cz-cli attempts log <run_id>
|
|
597
601
|
|
|
598
|
-
#
|
|
602
|
+
# Undeploy task (stop continuous running)
|
|
599
603
|
cz-cli task undeploy <task_name> -y
|
|
600
604
|
```
|
|
601
605
|
|
|
602
606
|
---
|
|
603
607
|
|
|
604
|
-
##
|
|
608
|
+
## Delivery Acceptance Checklist
|
|
605
609
|
|
|
606
|
-
CDC
|
|
610
|
+
After the CDC sync task is deployed and running, **verify each item**:
|
|
607
611
|
|
|
608
612
|
```sql
|
|
609
|
-
-- 1.
|
|
613
|
+
-- 1. Row count comparison: after full load phase, ODS layer row count matches source
|
|
610
614
|
SELECT COUNT(*) FROM <ods_schema>.<table>;
|
|
611
615
|
|
|
612
|
-
-- 2.
|
|
613
|
-
--
|
|
616
|
+
-- 2. Incremental verification: insert a test record into the source, confirm it syncs to Lakehouse
|
|
617
|
+
-- Execute INSERT on source MySQL, wait 10-30 seconds, then query in Lakehouse
|
|
614
618
|
|
|
615
|
-
-- 3.
|
|
619
|
+
-- 3. Key field non-null rate
|
|
616
620
|
SELECT
|
|
617
621
|
COUNT(*) AS total,
|
|
618
622
|
COUNT(key_field) AS non_null,
|
|
619
623
|
ROUND(COUNT(key_field) * 100.0 / COUNT(*), 2) AS non_null_pct
|
|
620
624
|
FROM <ods_schema>.<table>;
|
|
621
625
|
|
|
622
|
-
-- 4.
|
|
626
|
+
-- 4. Check _op field distribution (for CDC ingestion)
|
|
623
627
|
SELECT _op, COUNT(*) FROM <ods_schema>.<table> GROUP BY _op;
|
|
624
|
-
--
|
|
628
|
+
-- Normally there should be I (INSERT) records; tables with UPDATE/DELETE activity will also show U/D records
|
|
625
629
|
```
|
|
626
630
|
|
|
627
|
-
|
|
628
|
-
- [ ]
|
|
629
|
-
- [ ]
|
|
630
|
-
- [ ]
|
|
631
|
-
- [ ] _op
|
|
632
|
-
- [ ]
|
|
633
|
-
- [ ]
|
|
631
|
+
**Acceptance Criteria:**
|
|
632
|
+
- [ ] Full load phase complete, ODS layer row count matches source
|
|
633
|
+
- [ ] Incremental test data written, synced to Lakehouse within 30 seconds
|
|
634
|
+
- [ ] Key field non-null rate meets expectations
|
|
635
|
+
- [ ] _op field distribution is reasonable (no abnormally large number of D records)
|
|
636
|
+
- [ ] Task status is continuously running (RUNNING), no frequent restarts
|
|
637
|
+
- [ ] Column type mapping is correct (pay attention to BIT/ENUM/TEXT and other heterogeneous types)
|