@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
- package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
- package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
- package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
- package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
- package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
- package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
- package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
- package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
- package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
- package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
- package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
- package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
- package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
- package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
- package/package.json +1 -1
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
|
@@ -1,323 +1,324 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: clickzetta-realtime-sync-pipeline
|
|
3
3
|
description: |
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
4
|
+
Create and manage ClickZetta Lakehouse real-time sync tasks (single-table), syncing data from external sources
|
|
5
|
+
to Lakehouse in real time.
|
|
6
|
+
Supports Kafka, MySQL, PostgreSQL, and other data sources as the source, with Lakehouse as the target.
|
|
7
|
+
Real-time sync tasks are continuously running streaming tasks — no scheduling required; they start running upon submission.
|
|
8
|
+
Triggered when the user says "Studio real-time sync", "realtime sync", "single-table CDC sync",
|
|
9
|
+
"real-time data sync", "Kafka real-time sync to Lakehouse", "MySQL single-table real-time sync",
|
|
10
|
+
"single-table real-time sync", "real-time data migration".
|
|
11
|
+
Covers real-time sync task creation, data source configuration, column mapping (including JSONPath computed columns),
|
|
12
|
+
deployment, and operations — all ClickZetta Studio specific logic.
|
|
13
|
+
Keywords: real-time sync, single table, Kafka source, MySQL source, streaming, CDC
|
|
12
14
|
---
|
|
13
15
|
|
|
14
|
-
#
|
|
16
|
+
# Single-table Real-time Sync Pipeline Workflow
|
|
15
17
|
|
|
16
|
-
##
|
|
18
|
+
## Wizard: Collect Required Information
|
|
17
19
|
|
|
18
|
-
|
|
20
|
+
Before creating a real-time sync task, use an interactive question tool (e.g., `question`) to collect the following information via option menus. If no such tool is available, list all questions in text at once:
|
|
19
21
|
|
|
20
22
|
```
|
|
21
23
|
question({
|
|
22
24
|
questions: [
|
|
23
25
|
{
|
|
24
|
-
question: "
|
|
26
|
+
question: "Data source type?",
|
|
25
27
|
options: [
|
|
26
|
-
{ label: "Kafka", description: "Kafka Topic
|
|
27
|
-
{ label: "MySQL / Aurora MySQL", description: "
|
|
28
|
-
{ label: "PostgreSQL / Aurora PG", description: "
|
|
29
|
-
{ label: "SQL Server", description: "
|
|
28
|
+
{ label: "Kafka", description: "Kafka Topic real-time ingestion, supports JSON message parsing" },
|
|
29
|
+
{ label: "MySQL / Aurora MySQL", description: "Single-table CDC real-time sync" },
|
|
30
|
+
{ label: "PostgreSQL / Aurora PG", description: "Single-table CDC real-time sync" },
|
|
31
|
+
{ label: "SQL Server", description: "Single-table CDC real-time sync" }
|
|
30
32
|
]
|
|
31
33
|
},
|
|
32
34
|
{
|
|
33
|
-
question: "
|
|
35
|
+
question: "Sync granularity?",
|
|
34
36
|
options: [
|
|
35
|
-
{ label: "
|
|
36
|
-
{ label: "
|
|
37
|
+
{ label: "Single table/topic", description: "Supported by this skill, fine-grained configuration" },
|
|
38
|
+
{ label: "Full database/multi-table", description: "Use clickzetta-cdc-sync-pipeline instead" }
|
|
37
39
|
]
|
|
38
40
|
}
|
|
39
41
|
]
|
|
40
42
|
})
|
|
41
43
|
```
|
|
42
44
|
|
|
43
|
-
|
|
45
|
+
**If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
|
|
44
46
|
|
|
45
47
|
---
|
|
46
48
|
|
|
47
|
-
##
|
|
49
|
+
## Applicable Scenarios
|
|
48
50
|
|
|
49
|
-
-
|
|
50
|
-
- Kafka Topic → Lakehouse
|
|
51
|
-
- MySQL / PostgreSQL / SQL Server
|
|
52
|
-
-
|
|
53
|
-
-
|
|
54
|
-
-
|
|
51
|
+
- Sync data from external sources to Lakehouse in real time (low latency, continuously running)
|
|
52
|
+
- Kafka Topic → Lakehouse table (supports JSON message parsing)
|
|
53
|
+
- MySQL / PostgreSQL / SQL Server databases → Lakehouse table (CDC change capture)
|
|
54
|
+
- High data freshness requirements — second-level or minute-level latency
|
|
55
|
+
- Single source table/topic to single target table real-time sync
|
|
56
|
+
- Keywords: real-time sync, CDC, streaming sync, Kafka real-time sync
|
|
55
57
|
|
|
56
|
-
##
|
|
58
|
+
## Comparison with Other Sync Methods
|
|
57
59
|
|
|
58
|
-
|
|
|
59
|
-
|
|
60
|
-
|
|
|
61
|
-
|
|
|
62
|
-
|
|
|
63
|
-
|
|
|
64
|
-
|
|
|
65
|
-
|
|
|
60
|
+
| Dimension | Real-time Sync (This Skill) | Batch Sync | Multi-table Real-time Sync |
|
|
61
|
+
|-----------|---------------------------|------------|--------------------------|
|
|
62
|
+
| Task Type ID | `14` (REALTIME/CDC) | `10` / `291` | `281` |
|
|
63
|
+
| Sync Granularity | Single table/topic | Single/multi-table | Full database/multi-table |
|
|
64
|
+
| Run Mode | Continuously running (streaming) | Scheduled (batch) | Continuously running (streaming) |
|
|
65
|
+
| Scheduling | Not required, runs upon submission | Cron expression required | Not required, runs upon submission |
|
|
66
|
+
| Latency | Seconds to minutes | Depends on schedule interval | Seconds to minutes |
|
|
67
|
+
| Applicable Skill | `clickzetta-realtime-sync-pipeline` | `clickzetta-batch-sync-pipeline` | `clickzetta-cdc-sync-pipeline` |
|
|
66
68
|
|
|
67
|
-
##
|
|
69
|
+
## Prerequisites
|
|
68
70
|
|
|
69
|
-
- ClickZetta Lakehouse Studio
|
|
70
|
-
-
|
|
71
|
-
-
|
|
72
|
-
- Sync VCluster
|
|
73
|
-
-
|
|
74
|
-
- **cz-cli
|
|
75
|
-
- **MCP
|
|
71
|
+
- ClickZetta Lakehouse Studio account with permissions to create sync tasks and target tables
|
|
72
|
+
- Source data source already configured in Studio (Kafka / MySQL / PostgreSQL / SQL Server, etc.)
|
|
73
|
+
- Target Lakehouse data source available
|
|
74
|
+
- Sync VCluster available (real-time sync task_type=14 requires a Sync VCluster)
|
|
75
|
+
- **Execution environment (one of the following, cz-cli preferred)**:
|
|
76
|
+
- **cz-cli path**: cz-cli installed (`brew install cz-cli`, or refer to the official docs) and `cz-cli setup` completed
|
|
77
|
+
- **MCP path**: clickzetta-studio-mcp tools available (`create_task`, `save_integration_task`, `publish_task`, `list_data_sources`, `LH_show_object_list`, etc.)
|
|
76
78
|
|
|
77
|
-
##
|
|
79
|
+
## Environment Detection (Read Before Execution)
|
|
78
80
|
|
|
79
|
-
|
|
81
|
+
Before starting any operation, determine the current execution environment:
|
|
80
82
|
|
|
81
|
-
|
|
83
|
+
**Step 1: Check if cz-cli is available**
|
|
82
84
|
```bash
|
|
83
85
|
cz-cli --version
|
|
84
86
|
```
|
|
85
|
-
-
|
|
86
|
-
-
|
|
87
|
+
- If command exists → **use cz-cli path** (see "cz-cli Alternative Path" section at the end of this document)
|
|
88
|
+
- If command not found → continue to check MCP
|
|
87
89
|
|
|
88
|
-
|
|
90
|
+
**Step 2: Check if MCP is available (only when cz-cli is unavailable)**
|
|
89
91
|
|
|
90
|
-
|
|
91
|
-
-
|
|
92
|
-
-
|
|
93
|
-
> "
|
|
94
|
-
> cz-cli
|
|
95
|
-
> MCP
|
|
92
|
+
Try calling the `list_data_sources` tool to query the data source list.
|
|
93
|
+
- If tool exists in tool list → **use MCP path** (default path in this document)
|
|
94
|
+
- If tool not found → stop execution and prompt the user:
|
|
95
|
+
> "Neither cz-cli nor MCP tools are available in the current environment. Please install one of them before retrying.
|
|
96
|
+
> cz-cli installation: `brew install cz-cli` (or refer to the official docs), then run `cz-cli setup`
|
|
97
|
+
> MCP installation: refer to clickzetta-studio-mcp configuration docs"
|
|
96
98
|
|
|
97
|
-
##
|
|
99
|
+
## Workflow
|
|
98
100
|
|
|
99
|
-
###
|
|
101
|
+
### Step 1: Confirm Sync VCluster Availability
|
|
100
102
|
|
|
101
103
|
```
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
104
|
+
Use LH_show_object_list (object_type='VCLUSTERS') to view available virtual clusters.
|
|
105
|
+
Filter for clusters where vcluster_type contains SYNC.
|
|
106
|
+
If no Sync VCluster is available, create one before proceeding.
|
|
105
107
|
```
|
|
106
108
|
|
|
107
|
-
###
|
|
109
|
+
### Step 2: Find Available Data Sources
|
|
108
110
|
|
|
109
111
|
```
|
|
110
|
-
|
|
111
|
-
|
|
112
|
+
Use list_data_sources to view the list of configured data sources.
|
|
113
|
+
Filter by type:
|
|
112
114
|
- Kafka: ds_type=2
|
|
113
115
|
- MySQL: ds_type=5
|
|
114
116
|
- PostgreSQL: ds_type=7
|
|
115
117
|
- SQL Server: ds_type=8
|
|
116
|
-
|
|
118
|
+
Record the source datasource_name and target Lakehouse datasource_name.
|
|
117
119
|
```
|
|
118
120
|
|
|
119
|
-
###
|
|
121
|
+
### Step 3: Explore Source Data Structure (Optional)
|
|
120
122
|
|
|
121
123
|
```
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
124
|
+
Use list_namespaces to view the source data source's namespaces (databases/schemas).
|
|
125
|
+
Use list_metadata_objects to view tables/topics under a namespace.
|
|
126
|
+
Use get_metadata_detail to view the column structure of a specific table/topic.
|
|
125
127
|
```
|
|
126
128
|
|
|
127
|
-
###
|
|
129
|
+
### Step 4: Create Real-time Sync Task
|
|
128
130
|
|
|
129
131
|
```
|
|
130
|
-
|
|
131
|
-
- task_type: 14
|
|
132
|
-
- task_name:
|
|
133
|
-
- data_folder_id:
|
|
132
|
+
Use create_task to create the task:
|
|
133
|
+
- task_type: 14 (real-time sync)
|
|
134
|
+
- task_name: custom task name (recommend including source and target info, e.g., "rt_sync_kafka_orders")
|
|
135
|
+
- data_folder_id: target folder ID (obtainable via list_folders)
|
|
134
136
|
|
|
135
|
-
|
|
137
|
+
Record the returned task_id and studio_url.
|
|
136
138
|
```
|
|
137
139
|
|
|
138
|
-
###
|
|
140
|
+
### Step 5: Configure Sync Content
|
|
139
141
|
|
|
140
142
|
```
|
|
141
|
-
|
|
142
|
-
- task_id:
|
|
143
|
-
- source_datasource_name:
|
|
144
|
-
- source_schema:
|
|
145
|
-
- source_table:
|
|
146
|
-
- source_ds_type:
|
|
147
|
-
- sink_datasource_name:
|
|
148
|
-
- sink_schema:
|
|
149
|
-
- sink_table:
|
|
150
|
-
- sink_ds_type: 1
|
|
143
|
+
Use save_integration_task to configure sync:
|
|
144
|
+
- task_id: task ID returned in Step 4
|
|
145
|
+
- source_datasource_name: source data source name
|
|
146
|
+
- source_schema: source database/schema (for Kafka, the namespace containing the topic)
|
|
147
|
+
- source_table: source table name or Kafka topic name
|
|
148
|
+
- source_ds_type: source type (2=Kafka, 5=MySQL, 7=PostgreSQL, 8=SQL Server)
|
|
149
|
+
- sink_datasource_name: target Lakehouse data source name
|
|
150
|
+
- sink_schema: target schema (default: public)
|
|
151
|
+
- sink_table: target table name (optional, defaults to same as source table)
|
|
152
|
+
- sink_ds_type: 1 (Lakehouse)
|
|
151
153
|
```
|
|
152
154
|
|
|
153
|
-
>
|
|
155
|
+
> **Note**: The system automatically retrieves source and target metadata to generate column mappings. If the target table does not exist, it will be auto-created.
|
|
154
156
|
|
|
155
|
-
###
|
|
157
|
+
### Step 6: Kafka JSON Message Parsing (Kafka Source Only)
|
|
156
158
|
|
|
157
|
-
|
|
159
|
+
If the Kafka topic message format is JSON, you can add computed columns in Studio UI to parse nested fields:
|
|
158
160
|
|
|
159
|
-
-
|
|
160
|
-
-
|
|
161
|
-
-
|
|
162
|
-
-
|
|
161
|
+
- Use JSONPath rules to parse content from the value field
|
|
162
|
+
- Examples: `$.id` extracts the top-level id field, `$.data.code` extracts a nested field
|
|
163
|
+
- By default, Kafka topic built-in fields (key, value, timestamp, partition, offset) are used for mapping
|
|
164
|
+
- Computed column configuration must be done in Studio UI (open via studio_url)
|
|
163
165
|
|
|
164
|
-
###
|
|
166
|
+
### Step 7: Submit and Deploy
|
|
165
167
|
|
|
166
168
|
```
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
- task_id:
|
|
170
|
-
- task_version:
|
|
169
|
+
Real-time sync tasks do not require scheduling configuration (no need to call save_task_configuration).
|
|
170
|
+
Use publish_task to submit the task directly:
|
|
171
|
+
- task_id: task ID
|
|
172
|
+
- task_version: current version number (obtainable via get_task_detail)
|
|
171
173
|
|
|
172
|
-
|
|
174
|
+
The task starts running continuously upon submission.
|
|
173
175
|
```
|
|
174
176
|
|
|
175
|
-
>
|
|
177
|
+
> **Important**: Real-time sync tasks do not support test runs in development state — submission is production deployment.
|
|
176
178
|
|
|
177
|
-
###
|
|
179
|
+
### Step 8: Operations and Monitoring
|
|
178
180
|
|
|
179
181
|
```
|
|
180
|
-
|
|
182
|
+
After submission, manage real-time sync tasks in the Operations Center:
|
|
181
183
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
+
View task status: get_task_detail
|
|
185
|
+
View run history: list_task_run (note: real-time tasks run continuously, unlike batch tasks with periodic instances)
|
|
184
186
|
|
|
185
|
-
Studio UI
|
|
186
|
-
-
|
|
187
|
-
-
|
|
188
|
-
-
|
|
187
|
+
In Studio UI you can:
|
|
188
|
+
- Start/stop the task
|
|
189
|
+
- View sync latency and throughput
|
|
190
|
+
- View error logs
|
|
189
191
|
```
|
|
190
192
|
|
|
191
193
|
---
|
|
192
194
|
|
|
193
|
-
##
|
|
195
|
+
## Supported Data Sources
|
|
194
196
|
|
|
195
|
-
###
|
|
197
|
+
### Source
|
|
196
198
|
|
|
197
|
-
|
|
|
198
|
-
|
|
199
|
-
| Kafka | 2 |
|
|
200
|
-
| MySQL | 5 | CDC
|
|
201
|
-
| PostgreSQL | 7 | CDC
|
|
202
|
-
| SQL Server | 8 | CDC
|
|
203
|
-
| Aurora MySQL | 39 | CDC
|
|
204
|
-
| Aurora PostgreSQL | 40 | CDC
|
|
205
|
-
| PolarDB MySQL | 19 | CDC
|
|
206
|
-
| PolarDB PostgreSQL | 48 | CDC
|
|
199
|
+
| Data Source | ds_type | Description |
|
|
200
|
+
|------------|---------|-------------|
|
|
201
|
+
| Kafka | 2 | Supports JSON message parsing (JSONPath computed columns) |
|
|
202
|
+
| MySQL | 5 | CDC change capture |
|
|
203
|
+
| PostgreSQL | 7 | CDC change capture |
|
|
204
|
+
| SQL Server | 8 | CDC change capture |
|
|
205
|
+
| Aurora MySQL | 39 | CDC change capture |
|
|
206
|
+
| Aurora PostgreSQL | 40 | CDC change capture |
|
|
207
|
+
| PolarDB MySQL | 19 | CDC change capture |
|
|
208
|
+
| PolarDB PostgreSQL | 48 | CDC change capture |
|
|
207
209
|
|
|
208
|
-
###
|
|
210
|
+
### Target
|
|
209
211
|
|
|
210
|
-
|
|
|
211
|
-
|
|
212
|
+
| Data Source | ds_type |
|
|
213
|
+
|------------|---------|
|
|
212
214
|
| Lakehouse | 1 |
|
|
213
215
|
|
|
214
|
-
##
|
|
216
|
+
## Troubleshooting
|
|
215
217
|
|
|
216
|
-
|
|
|
217
|
-
|
|
218
|
-
|
|
|
219
|
-
|
|
|
220
|
-
| Kafka
|
|
221
|
-
| JSON
|
|
222
|
-
|
|
|
223
|
-
|
|
|
224
|
-
|
|
|
218
|
+
| Issue | Investigation |
|
|
219
|
+
|-------|--------------|
|
|
220
|
+
| Task creation failed | Check if a Sync VCluster is available (`LH_show_object_list` to view VCLUSTERS, filter for SYNC type) |
|
|
221
|
+
| Source connection failed | Check data source connection info, network reachability, account permissions |
|
|
222
|
+
| No data consumed from Kafka | Check topic name, consumer offset settings, Kafka cluster connectivity |
|
|
223
|
+
| JSON parsing failed | Check JSONPath expression correctness, verify message format is valid JSON |
|
|
224
|
+
| Increasing sync latency | Check if Sync VCluster resources are sufficient, whether source data volume has spiked |
|
|
225
|
+
| Target table write failed | Check if target table exists, column type compatibility, sufficient permissions |
|
|
226
|
+
| Task stopped unexpectedly | Check execution logs (`list_executions` + `get_execution_log`) for specific errors |
|
|
225
227
|
|
|
226
|
-
##
|
|
228
|
+
## Notes
|
|
227
229
|
|
|
228
|
-
###
|
|
230
|
+
### Run Mode
|
|
229
231
|
|
|
230
|
-
-
|
|
231
|
-
-
|
|
232
|
-
-
|
|
232
|
+
- Real-time sync tasks are continuously running streaming tasks — they start running upon submission without scheduling
|
|
233
|
+
- Test runs in development state are not supported
|
|
234
|
+
- After stopping, manual restart is required
|
|
233
235
|
|
|
234
|
-
### Sync VCluster
|
|
236
|
+
### Sync VCluster Requirements
|
|
235
237
|
|
|
236
|
-
-
|
|
237
|
-
-
|
|
238
|
-
-
|
|
238
|
+
- Real-time sync tasks (task_type=14) must use a Sync VCluster
|
|
239
|
+
- Confirm a Sync VCluster is available before creating the task
|
|
240
|
+
- Check via `LH_show_object_list` (object_type='VCLUSTERS'), filter for clusters where vcluster_type contains SYNC
|
|
239
241
|
|
|
240
|
-
### Kafka
|
|
242
|
+
### Kafka Source Special Notes
|
|
241
243
|
|
|
242
|
-
-
|
|
243
|
-
- JSON
|
|
244
|
-
-
|
|
244
|
+
- Supports specifying consumer start offset (earliest / latest / specific offset)
|
|
245
|
+
- JSON messages can be parsed via JSONPath computed columns for nested fields
|
|
246
|
+
- Default fields include: key, value, timestamp, partition, offset
|
|
245
247
|
|
|
246
|
-
###
|
|
248
|
+
### Choosing Between Single-table and Multi-table Real-time Sync
|
|
247
249
|
|
|
248
|
-
-
|
|
249
|
-
-
|
|
250
|
-
-
|
|
250
|
+
- Single-table real-time sync (this skill): suitable for fine-grained sync of a single table/topic
|
|
251
|
+
- Multi-table real-time sync (`clickzetta-cdc-sync-pipeline`): suitable for full-database CDC and bulk real-time sync of multiple tables
|
|
252
|
+
- If you need to sync all tables in a database, use multi-table real-time sync
|
|
251
253
|
|
|
252
254
|
---
|
|
253
255
|
|
|
254
|
-
## cz-cli
|
|
256
|
+
## cz-cli Alternative Path
|
|
255
257
|
|
|
256
|
-
>
|
|
257
|
-
>
|
|
258
|
+
> Use this section when cz-cli is available (cz-cli is the preferred path; MCP is used only when cz-cli is unavailable — see "Environment Detection"). Step numbers correspond to the MCP path above.
|
|
259
|
+
> Besides direct commands such as `cz-cli task create` and `cz-cli task deploy`, operations can be delegated to the built-in agent via `cz-cli agent run`, which has full Studio MCP tool access.
|
|
258
260
|
|
|
259
|
-
###
|
|
261
|
+
### Single-table Real-time Sync (cz-cli Version)
|
|
260
262
|
|
|
261
|
-
|
|
263
|
+
**Quick path**: Create the task directly, then configure the data source in the Studio UI
|
|
262
264
|
|
|
263
265
|
```bash
|
|
264
|
-
#
|
|
266
|
+
# Step 1: Create real-time sync task (task_type=14, i.e., REALTIME/CDC)
|
|
265
267
|
cz-cli task create "rt_sync_<table>" --type REALTIME --folder <folder_name>
|
|
266
|
-
#
|
|
268
|
+
# Returns task_id and studio_url — complete data source configuration and column mapping at studio_url
|
|
267
269
|
|
|
268
|
-
#
|
|
270
|
+
# Step 2: After configuration, deploy the task (real-time sync needs no scheduling, runs continuously upon submission)
|
|
269
271
|
cz-cli task deploy "rt_sync_<table>" -y
|
|
270
272
|
```
|
|
271
273
|
|
|
272
|
-
|
|
274
|
+
**Full agent path** (when agent is needed for data source exploration and configuration):
|
|
273
275
|
|
|
274
276
|
```bash
|
|
275
|
-
#
|
|
276
|
-
cz-cli agent run "
|
|
277
|
+
# One-shot: let the agent complete the full real-time sync task creation
|
|
278
|
+
cz-cli agent run "Create a real-time sync task (task_type=14), sync data source <source_ds_name> <schema>.<table> (or Kafka topic <topic>) to Lakehouse public schema in real time, use Sync VCluster, task name rt_sync_<table>, place in <folder_name> folder" \
|
|
277
279
|
--format a2a --dangerously-skip-permissions
|
|
278
280
|
```
|
|
279
281
|
|
|
280
|
-
|
|
282
|
+
For scenarios requiring fine-grained control, split into steps:
|
|
281
283
|
|
|
282
284
|
```bash
|
|
283
|
-
#
|
|
284
|
-
cz-cli agent run "
|
|
285
|
+
# Step 1: Confirm Sync VCluster availability
|
|
286
|
+
cz-cli agent run "List all available VClusters, filter for clusters where vcluster_type contains SYNC, confirm a Sync VCluster is available" \
|
|
285
287
|
--format a2a --dangerously-skip-permissions
|
|
286
288
|
|
|
287
|
-
#
|
|
288
|
-
cz-cli agent run "
|
|
289
|
+
# Step 2: Find data sources
|
|
290
|
+
cz-cli agent run "List all configured data sources, filter by type (Kafka: ds_type=2, MySQL: ds_type=5, PostgreSQL: ds_type=7, SQL Server: ds_type=8), record source and target Lakehouse data source names" \
|
|
289
291
|
--format a2a --dangerously-skip-permissions
|
|
290
292
|
|
|
291
|
-
#
|
|
292
|
-
cz-cli agent run "
|
|
293
|
+
# Step 3 (Optional): Explore source data structure
|
|
294
|
+
cz-cli agent run "View namespace list for data source <source_ds_name>, and the table/topic list and column structure under <schema>" \
|
|
293
295
|
--format a2a --dangerously-skip-permissions
|
|
294
296
|
|
|
295
|
-
#
|
|
296
|
-
cz-cli agent run "
|
|
297
|
+
# Steps 4-5: Create and configure real-time sync task
|
|
298
|
+
cz-cli agent run "Create a real-time sync task (task_type=14), source datasource=<source_ds_name>, schema=<schema>, table=<table> (source_ds_type=<type>), target Lakehouse public.<table>, task name rt_sync_<table>" \
|
|
297
299
|
--format a2a --dangerously-skip-permissions
|
|
298
300
|
|
|
299
|
-
#
|
|
300
|
-
cz-cli agent run "
|
|
301
|
+
# Step 7: Submit and deploy
|
|
302
|
+
cz-cli agent run "Submit real-time sync task rt_sync_<table> to start continuous running" \
|
|
301
303
|
--format a2a --dangerously-skip-permissions
|
|
302
304
|
```
|
|
303
305
|
|
|
304
|
-
>
|
|
306
|
+
> **Note**: Real-time sync tasks do not require scheduling configuration — they start running continuously upon submission. Kafka JSON message computed column configuration must be done in Studio UI.
|
|
305
307
|
|
|
306
308
|
---
|
|
307
309
|
|
|
308
|
-
###
|
|
310
|
+
### Operations and Monitoring (cz-cli Version)
|
|
309
311
|
|
|
310
312
|
```bash
|
|
311
|
-
#
|
|
313
|
+
# View recent run history
|
|
312
314
|
cz-cli runs list --task <task_name>
|
|
313
315
|
|
|
314
|
-
#
|
|
316
|
+
# View run details
|
|
315
317
|
cz-cli runs detail <run_id>
|
|
316
318
|
|
|
317
|
-
#
|
|
319
|
+
# View execution logs
|
|
318
320
|
cz-cli attempts log <run_id>
|
|
319
321
|
|
|
320
|
-
#
|
|
322
|
+
# Undeploy task (stop continuous running)
|
|
321
323
|
cz-cli task undeploy <task_name> -y
|
|
322
324
|
```
|
|
323
|
-
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
{"case_id":"001","type":"should_call","user_input":"
|
|
2
|
-
{"case_id":"002","type":"should_call","user_input":"Kafka
|
|
3
|
-
{"case_id":"003","type":"should_call","user_input":"
|
|
4
|
-
{"case_id":"004","type":"should_call","user_input":"MySQL
|
|
5
|
-
{"case_id":"005","type":"should_call","user_input":"
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"How do I create a single-table real-time sync task in Studio?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["real-time sync","task_type","14"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"How do I configure a single Kafka topic to sync to a Lakehouse table in real time?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["Kafka","real-time sync"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"What is the difference between single-table real-time sync and multi-table real-time sync?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["single","multi","14","281"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"How do I do MySQL single-table CDC real-time sync to Lakehouse?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["MySQL","real-time sync","CDC"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"Does a real-time sync task need scheduling configuration?","expected_skill":"clickzetta-realtime-sync-pipeline","expected_output_contains":["no scheduling","continuously running"]}
|