@clickzetta/cz-cli-darwin-x64 0.3.91 → 0.3.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
- package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
- package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
- package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
- package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
- package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
- package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
- package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
- package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
- package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
- package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
- package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
- package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
- package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
- package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
- package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
- package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
- package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
- package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
- package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
- package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
- package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
- package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
- package/package.json +1 -1
- package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
- package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
- package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
|
@@ -1,117 +1,135 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: clickzetta-oss-ingest-pipeline
|
|
3
3
|
description: |
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"
|
|
8
|
-
"
|
|
9
|
-
|
|
10
|
-
|
|
4
|
+
Build ClickZetta object storage (OSS/S3/COS) data ingestion pipelines, covering both continuous
|
|
5
|
+
ingestion (PIPE) and one-time batch import scenarios. Continuous ingestion supports LIST_PURGE
|
|
6
|
+
scan mode and EVENT_NOTIFICATION message notification mode; batch import supports Volume + INSERT
|
|
7
|
+
INTO and Volume + COPY INTO methods. Triggered when user says "object storage import", "OSS data
|
|
8
|
+
pipeline", "S3 data import", "PIPE continuous ingestion", "auto file loading", "bucket data sync",
|
|
9
|
+
"COS import", "batch import from OSS", "load data from OSS", "Volume import".
|
|
10
|
+
Includes PIPE continuous ingestion (two INGEST_MODEs), batch import (Volume + COPY/INSERT),
|
|
11
|
+
Connection/Volume creation, monitoring and management — all ClickZetta-specific logic.
|
|
11
12
|
Keywords: OSS, S3, COS, object storage, PIPE, COPY INTO, file ingestion
|
|
12
13
|
---
|
|
13
14
|
|
|
14
|
-
#
|
|
15
|
+
# Object Storage Data Pipeline Setup Workflow
|
|
15
16
|
|
|
16
|
-
##
|
|
17
|
+
## Wizard: Collect Required Information
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
Before building an object storage pipeline, preferably use an interactive Q&A tool (e.g., `question`) to collect the following information via a selection menu; if no such tool is available, list all questions in text at once:
|
|
19
20
|
|
|
20
21
|
```
|
|
21
22
|
question({
|
|
22
23
|
questions: [
|
|
23
24
|
{
|
|
24
|
-
question: "
|
|
25
|
+
question: "Cloud platform?",
|
|
25
26
|
options: [
|
|
26
|
-
{ label: "
|
|
27
|
-
{ label: "AWS S3", description: "
|
|
28
|
-
{ label: "
|
|
27
|
+
{ label: "Alibaba Cloud OSS", description: "Supports both LIST_PURGE and EVENT_NOTIFICATION modes" },
|
|
28
|
+
{ label: "AWS S3", description: "Supports both LIST_PURGE and EVENT_NOTIFICATION modes" },
|
|
29
|
+
{ label: "Tencent Cloud COS", description: "Only supports LIST_PURGE mode" }
|
|
29
30
|
]
|
|
30
31
|
},
|
|
31
32
|
{
|
|
32
|
-
question: "
|
|
33
|
+
question: "Import mode?",
|
|
33
34
|
options: [
|
|
34
|
-
{ label: "
|
|
35
|
-
{ label: "
|
|
35
|
+
{ label: "Continuous ingestion (PIPE)", description: "New files automatically trigger import, near real-time" },
|
|
36
|
+
{ label: "One-time batch import", description: "Manually or scheduled COPY INTO execution" }
|
|
36
37
|
]
|
|
37
38
|
},
|
|
38
39
|
{
|
|
39
|
-
question: "
|
|
40
|
+
question: "File format?",
|
|
40
41
|
options: [
|
|
41
|
-
{ label: "CSV", description: "
|
|
42
|
-
{ label: "JSON / JSONL", description: "JSON
|
|
43
|
-
{ label: "Parquet", description: "
|
|
44
|
-
{ label: "ORC", description: "
|
|
42
|
+
{ label: "CSV", description: "Comma-separated text" },
|
|
43
|
+
{ label: "JSON / JSONL", description: "JSON or newline-delimited JSON" },
|
|
44
|
+
{ label: "Parquet", description: "Columnar storage format" },
|
|
45
|
+
{ label: "ORC", description: "Columnar storage format" }
|
|
45
46
|
]
|
|
46
47
|
}
|
|
47
48
|
]
|
|
48
49
|
})
|
|
49
50
|
```
|
|
50
51
|
|
|
51
|
-
|
|
52
|
+
**If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
|
|
52
53
|
|
|
53
54
|
---
|
|
54
55
|
|
|
55
|
-
##
|
|
56
|
+
## Decision Tree
|
|
56
57
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
```
|
|
59
|
+
Is data arriving continuously (new files added over time)?
|
|
60
|
+
├─ YES → Use PIPE (continuous ingestion)
|
|
61
|
+
│ ├─ Need low latency (< 1 min) AND on Alibaba Cloud OSS or AWS S3?
|
|
62
|
+
│ │ ├─ YES → Mode B: EVENT_NOTIFICATION
|
|
63
|
+
│ │ └─ NO → Mode A: LIST_PURGE
|
|
64
|
+
│ └─ On Tencent Cloud COS?
|
|
65
|
+
│ └─ Mode A: LIST_PURGE (only option)
|
|
66
|
+
└─ NO → One-time or scheduled load
|
|
67
|
+
└─ Mode C: Batch Import (Volume + COPY INTO / INSERT INTO)
|
|
68
|
+
├─ Need deduplication protection? → Use COPY INTO
|
|
69
|
+
├─ Need filtering/file selection? → Use INSERT INTO
|
|
70
|
+
└─ Need idempotent overwrite? → Use COPY OVERWRITE INTO
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Applicable Scenarios
|
|
63
76
|
|
|
64
|
-
|
|
77
|
+
- Continuous auto-import from OSS/S3/COS to Lakehouse (PIPE)
|
|
78
|
+
- One-time or scheduled batch import (Volume + COPY/INSERT)
|
|
79
|
+
- Near real-time micro-batch file loading
|
|
80
|
+
- Filtering or transforming data during import
|
|
65
81
|
|
|
66
|
-
|
|
67
|
-
- 对象存储桶可达(Endpoint、AccessKey 或 Role ARN)
|
|
68
|
-
- **执行环境**:已安装并配置 cz-cli
|
|
82
|
+
## Prerequisites
|
|
69
83
|
|
|
70
|
-
|
|
84
|
+
- ClickZetta Lakehouse account with permissions to create PIPEs, tables, storage connections, Volumes, etc.
|
|
85
|
+
- Object storage bucket is reachable (Endpoint, AccessKey, or Role ARN)
|
|
86
|
+
- **Execution environment**: cz-cli installed and configured
|
|
71
87
|
|
|
72
|
-
|
|
88
|
+
## Execution Environment
|
|
89
|
+
|
|
90
|
+
All SQL is executed via `cz-cli sql`:
|
|
73
91
|
|
|
74
92
|
```bash
|
|
75
|
-
cz-cli --version #
|
|
76
|
-
cz-cli sql "SELECT 1" --sync #
|
|
93
|
+
cz-cli --version # Confirm cz-cli is available
|
|
94
|
+
cz-cli sql "SELECT 1" --sync # Verify connection
|
|
77
95
|
```
|
|
78
96
|
|
|
79
|
-
|
|
97
|
+
If cz-cli is not available, refer to the official documentation to install and configure it before retrying.
|
|
80
98
|
|
|
81
|
-
##
|
|
99
|
+
## Core Concepts
|
|
82
100
|
|
|
83
|
-
### INGEST_MODE
|
|
101
|
+
### INGEST_MODE Selection Guide
|
|
84
102
|
|
|
85
|
-
|
|
|
86
|
-
|
|
87
|
-
| `LIST_PURGE` |
|
|
88
|
-
| `EVENT_NOTIFICATION` |
|
|
103
|
+
| Mode | Trigger Method | Use Case | Cloud Platform Support | Auth Method |
|
|
104
|
+
|------|---------------|----------|----------------------|-------------|
|
|
105
|
+
| `LIST_PURGE` | Periodic directory scan | General purpose, deletes source files after import | All cloud platforms | Access Key or Role ARN |
|
|
106
|
+
| `EVENT_NOTIFICATION` | Message service notification | Low-latency scenarios, triggered on file upload | Alibaba Cloud OSS + AWS S3 only | Role ARN only |
|
|
89
107
|
|
|
90
|
-
###
|
|
108
|
+
### Key Limitations
|
|
91
109
|
|
|
92
|
-
-
|
|
93
|
-
-
|
|
94
|
-
-
|
|
95
|
-
-
|
|
96
|
-
-
|
|
97
|
-
-
|
|
98
|
-
-
|
|
99
|
-
-
|
|
110
|
+
- Each PIPE requires a dedicated Volume; Volumes cannot be shared across PIPEs
|
|
111
|
+
- **PIPE `VIRTUAL_CLUSTER` should be a General Purpose (GP) cluster** (recommended); AP clusters also work but GP is best suited for ingestion workloads. Integration (Sync) clusters are not supported for PIPE execution.
|
|
112
|
+
- COPY statement logic cannot be modified; delete and recreate the PIPE instead
|
|
113
|
+
- COPY statements in PIPEs do not support `files` / `regexp` / `subdirectory` parameters
|
|
114
|
+
- Data loading order is not strictly guaranteed
|
|
115
|
+
- `load_history` deduplication records are retained for 7 days
|
|
116
|
+
- Modifying `COPY_JOB_HINT` overwrites all existing hints; set all parameters at once
|
|
117
|
+
- **Volume PIPEs do not support Kafka-specific parameters**: `BATCH_INTERVAL_IN_SECONDS`, `BATCH_SIZE_PER_KAFKA_PARTITION`, `MAX_SKIP_BATCH_COUNT_ON_ERROR` apply only to Kafka PIPEs
|
|
118
|
+
- **`COPY_JOB_HINT` must be valid JSON** with double-quoted keys and values: `'{"IGNORE_TMP_FILE": "true"}'`; do not use `KEY=VALUE` format
|
|
100
119
|
|
|
101
|
-
###
|
|
120
|
+
### File Size Recommendations
|
|
102
121
|
|
|
103
|
-
- gzip
|
|
104
|
-
- CSV /
|
|
122
|
+
- gzip compressed files: ~50MB
|
|
123
|
+
- CSV / Parquet uncompressed files: 128MB–256MB
|
|
105
124
|
|
|
106
|
-
##
|
|
125
|
+
## Workflow
|
|
107
126
|
|
|
108
|
-
###
|
|
127
|
+
### Mode A: LIST_PURGE Scan Mode (General Purpose)
|
|
109
128
|
|
|
110
|
-
####
|
|
129
|
+
#### Step 1: Create Storage Connection
|
|
111
130
|
|
|
112
131
|
```sql
|
|
113
|
-
--
|
|
114
|
-
-- 密钥方式(LIST_PURGE 模式支持)
|
|
132
|
+
-- Access Key method (supported by LIST_PURGE mode)
|
|
115
133
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
|
|
116
134
|
TYPE OSS
|
|
117
135
|
access_id = '<your_access_key_id>'
|
|
@@ -119,18 +137,17 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
|
|
|
119
137
|
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com';
|
|
120
138
|
```
|
|
121
139
|
|
|
122
|
-
>
|
|
123
|
-
> - `access_id
|
|
124
|
-
> - `access_key
|
|
125
|
-
> -
|
|
126
|
-
> - ⚠️ `ACCESS_KEY` / `SECRET_KEY`
|
|
140
|
+
> **Parameter notes**:
|
|
141
|
+
> - `access_id`: Corresponds to **AccessKey ID** in the Alibaba Cloud console
|
|
142
|
+
> - `access_key`: Corresponds to **AccessKey Secret** in the Alibaba Cloud console
|
|
143
|
+
> - Uppercase forms `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET` are also accepted
|
|
144
|
+
> - ⚠️ `ACCESS_KEY` / `SECRET_KEY` will error (missing `_ID` / `_SECRET` suffix)
|
|
127
145
|
>
|
|
128
|
-
>
|
|
146
|
+
> **Tip**: For Role ARN method (required for EVENT_NOTIFICATION mode), see the Connection creation syntax in "Mode B" below.
|
|
129
147
|
|
|
130
|
-
####
|
|
148
|
+
#### Step 2: Create External Volume
|
|
131
149
|
|
|
132
150
|
```sql
|
|
133
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
134
151
|
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
|
|
135
152
|
LOCATION 'oss://my-bucket/data-path/'
|
|
136
153
|
USING CONNECTION my_oss_connection
|
|
@@ -139,30 +156,36 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
|
|
|
139
156
|
COMMENT 'Volume for OSS PIPE ingestion';
|
|
140
157
|
```
|
|
141
158
|
|
|
142
|
-
>
|
|
143
|
-
> - `RECURSIVE = true
|
|
144
|
-
> - `DIRECTORY = (enable = true, auto_refresh = true)
|
|
145
|
-
> - ⚠️ COMMENT
|
|
159
|
+
> **Key parameters**:
|
|
160
|
+
> - `RECURSIVE = true`: Recursively scan subdirectories
|
|
161
|
+
> - `DIRECTORY = (enable = true, auto_refresh = true)`: Auto-refresh directory metadata
|
|
162
|
+
> - ⚠️ COMMENT has no equals sign: `COMMENT 'text'` (not `COMMENT = 'text'`)
|
|
146
163
|
|
|
147
|
-
####
|
|
164
|
+
#### Step 3: Verify Schema and Sample Data
|
|
148
165
|
|
|
149
|
-
|
|
166
|
+
Before creating the PIPE, probe the Volume with a SELECT to verify file parsing and schema mapping:
|
|
150
167
|
|
|
151
168
|
```sql
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
169
|
+
SELECT *
|
|
170
|
+
FROM VOLUME pipe_volume (
|
|
171
|
+
id STRING,
|
|
172
|
+
name STRING,
|
|
173
|
+
amount DECIMAL(10,2),
|
|
174
|
+
created_date STRING
|
|
175
|
+
) USING CSV OPTIONS ('header' = 'true')
|
|
176
|
+
LIMIT 20;
|
|
156
177
|
```
|
|
157
178
|
|
|
158
|
-
|
|
159
|
-
> - PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数。确保此处验证时也不使用这些参数。
|
|
160
|
-
> - OPTIONS 放在 PURGE=true **之前**:`USING CSV OPTIONS (...) PURGE=true`
|
|
179
|
+
**→ Show the results to the user and ask for confirmation before proceeding to Step 4.**
|
|
161
180
|
|
|
162
|
-
|
|
181
|
+
> **Notes**:
|
|
182
|
+
> - SELECT FROM VOLUME is read-only — no temp tables, no cleanup needed.
|
|
183
|
+
> - If columns appear misaligned or values are NULL, adjust the schema definition or OPTIONS before proceeding.
|
|
184
|
+
> - This validates the same parsing logic the PIPE's COPY INTO will use.
|
|
185
|
+
|
|
186
|
+
#### Step 4: Create PIPE (LIST_PURGE Mode)
|
|
163
187
|
|
|
164
188
|
```sql
|
|
165
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
166
189
|
CREATE PIPE IF NOT EXISTS my_oss_pipe
|
|
167
190
|
INGEST_MODE = 'LIST_PURGE'
|
|
168
191
|
VIRTUAL_CLUSTER = 'my_vc'
|
|
@@ -173,52 +196,49 @@ FROM VOLUME pipe_volume
|
|
|
173
196
|
USING CSV OPTIONS ('header' = 'true') PURGE=true;
|
|
174
197
|
```
|
|
175
198
|
|
|
176
|
-
> **⚠️
|
|
177
|
-
> - `PURGE=true`
|
|
178
|
-
> - OPTIONS
|
|
179
|
-
> -
|
|
180
|
-
> -
|
|
181
|
-
> -
|
|
182
|
-
> -
|
|
183
|
-
> -
|
|
184
|
-
> - `VIRTUAL_CLUSTER`:指定执行 PIPE 任务的虚拟集群
|
|
199
|
+
> **⚠️ Syntax key points**:
|
|
200
|
+
> - `PURGE=true` goes at the end: `USING <format> [OPTIONS (...)] PURGE=true`
|
|
201
|
+
> - OPTIONS goes **before** PURGE=true (if needed)
|
|
202
|
+
> - Can also omit OPTIONS: `USING CSV PURGE=true` (recommended concise form)
|
|
203
|
+
> - Uppercase `PURGE`, lowercase `true`, connected with `=`, no spaces
|
|
204
|
+
> - **LIST_PURGE mode requires** `PURGE=true`; source files are deleted after successful load (prevents duplicate imports)
|
|
205
|
+
> - Even if you don't want to delete source files, LIST_PURGE mode still requires this parameter; otherwise, the same file will be imported repeatedly
|
|
206
|
+
> - `VIRTUAL_CLUSTER`: Specifies the virtual cluster that executes the PIPE task
|
|
185
207
|
>
|
|
186
|
-
>
|
|
208
|
+
> **Incorrect syntax** (will cause syntax errors):
|
|
187
209
|
> ```sql
|
|
188
|
-
> -- ❌
|
|
210
|
+
> -- ❌ Do not put purge inside OPTIONS
|
|
189
211
|
> OPTIONS ('header' = 'true', 'purge' = 'true')
|
|
190
|
-
> -- ❌ OPTIONS
|
|
212
|
+
> -- ❌ OPTIONS cannot come after PURGE
|
|
191
213
|
> USING CSV PURGE=true OPTIONS ('header' = 'true')
|
|
192
|
-
> -- ❌
|
|
214
|
+
> -- ❌ Do not use lowercase or quotes
|
|
193
215
|
> 'purge'='true'
|
|
194
216
|
> ```
|
|
195
217
|
|
|
196
|
-
####
|
|
218
|
+
#### Step 5: Verify PIPE Status
|
|
197
219
|
|
|
198
220
|
```sql
|
|
199
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
200
221
|
DESC PIPE EXTENDED my_oss_pipe;
|
|
201
222
|
```
|
|
202
223
|
|
|
203
|
-
|
|
224
|
+
Confirm `pipe_execution_paused = false` (PIPE is running).
|
|
204
225
|
|
|
205
226
|
---
|
|
206
227
|
|
|
207
|
-
###
|
|
228
|
+
### Mode B: EVENT_NOTIFICATION Message Notification Mode (Low Latency)
|
|
208
229
|
|
|
209
|
-
>
|
|
230
|
+
> Supported on Alibaba Cloud OSS + AWS S3 only. After files are uploaded to the bucket, Lakehouse is notified via message service (MNS/SQS) to load immediately.
|
|
210
231
|
|
|
211
|
-
####
|
|
232
|
+
#### Prerequisites (Alibaba Cloud OSS Example)
|
|
212
233
|
|
|
213
|
-
1.
|
|
214
|
-
2.
|
|
215
|
-
3.
|
|
216
|
-
4.
|
|
234
|
+
1. **Enable Alibaba Cloud MNS**: Activate Message Service (MNS) in the Alibaba Cloud console
|
|
235
|
+
2. **Configure OSS event notification**: In OSS bucket → Event Notification → Create Rule, select event type `ObjectCreated`, target as MNS queue
|
|
236
|
+
3. **Grant OSS read permissions**: Create a RAM role, grant `oss:GetObject` and `oss:ListBucket` permissions, record the Role ARN
|
|
237
|
+
4. **Authorize MNS to Lakehouse**: Add the Lakehouse service account to the MNS queue's authorization policy
|
|
217
238
|
|
|
218
|
-
####
|
|
239
|
+
#### Step 1: Create Storage Connection (Role ARN Method)
|
|
219
240
|
|
|
220
241
|
```sql
|
|
221
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
222
242
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
|
|
223
243
|
TYPE OSS
|
|
224
244
|
ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com'
|
|
@@ -226,10 +246,9 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
|
|
|
226
246
|
REGION = 'cn-hangzhou';
|
|
227
247
|
```
|
|
228
248
|
|
|
229
|
-
####
|
|
249
|
+
#### Step 2: Create External Volume
|
|
230
250
|
|
|
231
251
|
```sql
|
|
232
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
233
252
|
CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
|
|
234
253
|
LOCATION 'oss://my-bucket/data-path/'
|
|
235
254
|
USING CONNECTION my_oss_role_connection
|
|
@@ -237,10 +256,9 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
|
|
|
237
256
|
RECURSIVE = true;
|
|
238
257
|
```
|
|
239
258
|
|
|
240
|
-
####
|
|
259
|
+
#### Step 3: Create PIPE (EVENT_NOTIFICATION Mode)
|
|
241
260
|
|
|
242
261
|
```sql
|
|
243
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
244
262
|
CREATE PIPE IF NOT EXISTS my_oss_event_pipe
|
|
245
263
|
INGEST_MODE = 'EVENT_NOTIFICATION'
|
|
246
264
|
VIRTUAL_CLUSTER = 'my_vc'
|
|
@@ -252,28 +270,26 @@ FROM VOLUME pipe_event_volume
|
|
|
252
270
|
USING CSV;
|
|
253
271
|
```
|
|
254
272
|
|
|
255
|
-
>
|
|
256
|
-
> - `INGEST_MODE = 'EVENT_NOTIFICATION'
|
|
257
|
-
> - `ALICLOUD_MNS_QUEUE
|
|
258
|
-
> -
|
|
259
|
-
> - COMMENT 不带等号:`COMMENT 'text'`
|
|
273
|
+
> **Parameter notes**:
|
|
274
|
+
> - `INGEST_MODE = 'EVENT_NOTIFICATION'`: Triggers loading via message notification
|
|
275
|
+
> - `ALICLOUD_MNS_QUEUE`: Alibaba Cloud MNS queue name (use `AWS_SQS_QUEUE` for AWS)
|
|
276
|
+
> - This mode does not require `PURGE=true` since it's event-driven rather than scan-based
|
|
260
277
|
|
|
261
278
|
---
|
|
262
279
|
|
|
263
|
-
###
|
|
280
|
+
### Mode C: Batch Import (One-time Volume + COPY/INSERT)
|
|
264
281
|
|
|
265
|
-
>
|
|
266
|
-
>
|
|
282
|
+
> Suitable for one-time or scheduled batch loading of files from object storage; no PIPE creation needed. Supports Alibaba Cloud OSS, Tencent Cloud COS, and AWS S3.
|
|
283
|
+
> Using a GENERAL PURPOSE virtual cluster is recommended for batch loading.
|
|
267
284
|
|
|
268
|
-
####
|
|
285
|
+
#### Usage Limitations
|
|
269
286
|
|
|
270
|
-
-
|
|
271
|
-
-
|
|
287
|
+
- Cross-cloud import is not supported (source storage and Lakehouse environment must be on the same cloud platform)
|
|
288
|
+
- Same-region internal endpoints are recommended (e.g., `oss-cn-shanghai-internal.aliyuncs.com`) for better speed and stability
|
|
272
289
|
|
|
273
|
-
####
|
|
290
|
+
#### Step 1: Create Target Table
|
|
274
291
|
|
|
275
292
|
```sql
|
|
276
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
277
293
|
CREATE TABLE IF NOT EXISTS my_schema.target_table (
|
|
278
294
|
id STRING,
|
|
279
295
|
name STRING,
|
|
@@ -282,10 +298,9 @@ CREATE TABLE IF NOT EXISTS my_schema.target_table (
|
|
|
282
298
|
);
|
|
283
299
|
```
|
|
284
300
|
|
|
285
|
-
####
|
|
301
|
+
#### Step 2: Create Storage Connection (access_id/access_key Syntax)
|
|
286
302
|
|
|
287
303
|
```sql
|
|
288
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
289
304
|
CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
|
|
290
305
|
TYPE OSS
|
|
291
306
|
ENDPOINT = 'oss-cn-shanghai-internal.aliyuncs.com'
|
|
@@ -293,35 +308,30 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
|
|
|
293
308
|
access_key = '<your_access_key_secret>';
|
|
294
309
|
```
|
|
295
310
|
|
|
296
|
-
> **Connection
|
|
297
|
-
> - 小写形式:`access_id` / `access_key`(推荐)
|
|
298
|
-
> - 大写形式:`ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`(也可以)
|
|
299
|
-
> - ⚠️ `ACCESS_KEY` / `SECRET_KEY` 会报错(缺少后缀)
|
|
311
|
+
> **Connection parameter naming**: See Mode A Step 1 for accepted forms. Use `access_id`/`access_key` (lowercase, recommended) or `ACCESS_KEY_ID`/`ACCESS_KEY_SECRET`. Never use `ACCESS_KEY`/`SECRET_KEY`.
|
|
300
312
|
|
|
301
|
-
####
|
|
313
|
+
#### Step 3: Create External Volume (with Directory Auto-refresh)
|
|
302
314
|
|
|
303
315
|
```sql
|
|
304
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
305
316
|
CREATE EXTERNAL VOLUME IF NOT EXISTS my_batch_volume
|
|
306
317
|
LOCATION 'oss://my-bucket/data-path/'
|
|
307
318
|
USING CONNECTION my_batch_conn
|
|
308
319
|
DIRECTORY = (enable=true, auto_refresh=true);
|
|
309
320
|
```
|
|
310
321
|
|
|
311
|
-
>
|
|
312
|
-
> - `LOCATION
|
|
313
|
-
> - `USING CONNECTION
|
|
314
|
-
> - `DIRECTORY = (enable=true, auto_refresh=true)
|
|
322
|
+
> **Key parameters**:
|
|
323
|
+
> - `LOCATION`: Object storage path, format: `oss://bucket/path/`
|
|
324
|
+
> - `USING CONNECTION`: References the previously created storage connection
|
|
325
|
+
> - `DIRECTORY = (enable=true, auto_refresh=true)`: Enables directory metadata with auto-refresh for querying file lists in the Volume
|
|
315
326
|
>
|
|
316
|
-
> **Volume
|
|
317
|
-
> - ✅
|
|
318
|
-
> - ⚠️
|
|
319
|
-
> -
|
|
327
|
+
> **Volume creation syntax notes**:
|
|
328
|
+
> - ✅ Recommended syntax: `LOCATION '...' USING CONNECTION conn_name` (official documentation standard)
|
|
329
|
+
> - ⚠️ Legacy syntax: `STORAGE_CONNECTION = conn_name LOCATION = '...'` (appears in some older docs, still works)
|
|
330
|
+
> - Both syntaxes are functionally equivalent; recommend using `LOCATION ... USING CONNECTION` consistently
|
|
320
331
|
|
|
321
|
-
####
|
|
332
|
+
#### Step 4a: INSERT INTO from Volume (Supports Filtering and Transformation)
|
|
322
333
|
|
|
323
334
|
```sql
|
|
324
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
325
335
|
INSERT INTO my_schema.target_table
|
|
326
336
|
SELECT * FROM VOLUME my_batch_volume (
|
|
327
337
|
id STRING,
|
|
@@ -333,17 +343,16 @@ FILES ('data_file_01.csv')
|
|
|
333
343
|
WHERE amount > 0;
|
|
334
344
|
```
|
|
335
345
|
|
|
336
|
-
>
|
|
337
|
-
> - `VOLUME my_batch_volume (...)
|
|
338
|
-
> - `USING CSV OPTIONS (...)
|
|
339
|
-
> - `FILES ('file1.csv', 'file2.csv')
|
|
340
|
-
> - `WHERE
|
|
341
|
-
> - INSERT INTO
|
|
346
|
+
> **Parameter notes**:
|
|
347
|
+
> - `VOLUME my_batch_volume (...)`: Specifies Volume and column definitions (Schema-on-Read)
|
|
348
|
+
> - `USING CSV OPTIONS (...)`: Specifies file format and parsing options
|
|
349
|
+
> - `FILES ('file1.csv', 'file2.csv')`: Specifies files to load (optional; loads all if omitted)
|
|
350
|
+
> - `WHERE ...`: Filters and transforms data (optional)
|
|
351
|
+
> - INSERT INTO supports `FILES` and `WHERE` parameters, suitable for fine-grained control
|
|
342
352
|
|
|
343
|
-
####
|
|
353
|
+
#### Step 4b: COPY INTO from Volume (Concise Syntax)
|
|
344
354
|
|
|
345
355
|
```sql
|
|
346
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
347
356
|
COPY INTO my_schema.target_table
|
|
348
357
|
FROM VOLUME my_batch_volume (
|
|
349
358
|
id STRING,
|
|
@@ -353,86 +362,110 @@ FROM VOLUME my_batch_volume (
|
|
|
353
362
|
) USING CSV OPTIONS ('header'='true', 'sep'=',');
|
|
354
363
|
```
|
|
355
364
|
|
|
356
|
-
> **INSERT INTO vs COPY INTO
|
|
357
|
-
> - `INSERT INTO
|
|
358
|
-
> - `COPY INTO
|
|
359
|
-
> -
|
|
360
|
-
> -
|
|
365
|
+
> **INSERT INTO vs COPY INTO selection**:
|
|
366
|
+
> - `INSERT INTO`: Supports `FILES()` for specifying files and `WHERE` for filtering/transformation; suitable for fine-grained control
|
|
367
|
+
> - `COPY INTO`: More concise syntax; suitable for full loads
|
|
368
|
+
> - `COPY OVERWRITE INTO`: Replaces all existing data in the target table; use for idempotent full-refresh loads
|
|
369
|
+
> - Both COPY and INSERT support Schema-on-Read (defining columns in FROM VOLUME)
|
|
370
|
+
> - ⚠️ **load_history difference**: Only `COPY INTO` records to `load_history`; `INSERT INTO ... FROM VOLUME` does not. Use `COPY INTO` if deduplication protection is needed
|
|
371
|
+
|
|
372
|
+
#### Step 4c: COPY OVERWRITE INTO (Idempotent Full Refresh)
|
|
373
|
+
|
|
374
|
+
```sql
|
|
375
|
+
COPY OVERWRITE INTO my_schema.target_table
|
|
376
|
+
FROM VOLUME my_batch_volume (
|
|
377
|
+
id STRING,
|
|
378
|
+
name STRING,
|
|
379
|
+
amount DECIMAL(10,2),
|
|
380
|
+
created_date STRING
|
|
381
|
+
) USING CSV OPTIONS ('header'='true', 'sep'=',');
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
> Atomically replaces all rows in the target table. Safe to retry — running twice produces the same result.
|
|
361
385
|
|
|
362
|
-
####
|
|
386
|
+
#### Step 5: Verify Import Results
|
|
363
387
|
|
|
364
388
|
```sql
|
|
365
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
366
389
|
SELECT COUNT(*) AS total_rows FROM my_schema.target_table;
|
|
367
390
|
SELECT * FROM my_schema.target_table LIMIT 10;
|
|
368
391
|
```
|
|
369
392
|
|
|
370
393
|
---
|
|
371
394
|
|
|
372
|
-
##
|
|
395
|
+
## Monitoring & Operations
|
|
373
396
|
|
|
374
|
-
###
|
|
397
|
+
### List Existing PIPEs
|
|
398
|
+
|
|
399
|
+
```sql
|
|
400
|
+
SHOW PIPES;
|
|
401
|
+
SHOW PIPES LIKE '%oss%';
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
### List Files in a Volume
|
|
405
|
+
|
|
406
|
+
```sql
|
|
407
|
+
SELECT * FROM DIRECTORY(@my_batch_volume) LIMIT 20;
|
|
408
|
+
-- If files are missing, refresh directory metadata first:
|
|
409
|
+
ALTER VOLUME my_batch_volume REFRESH;
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
### View PIPE Detailed Status
|
|
375
413
|
|
|
376
414
|
```sql
|
|
377
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
378
415
|
DESC PIPE EXTENDED my_oss_pipe;
|
|
379
416
|
```
|
|
380
417
|
|
|
381
|
-
|
|
382
|
-
- `pipe_execution_paused
|
|
383
|
-
- `ingest_mode
|
|
384
|
-
- `virtual_cluster
|
|
385
|
-
- `definition
|
|
418
|
+
Key fields:
|
|
419
|
+
- `pipe_execution_paused`: Whether the PIPE is paused
|
|
420
|
+
- `ingest_mode`: Import mode
|
|
421
|
+
- `virtual_cluster`: Execution cluster
|
|
422
|
+
- `definition`: COPY statement definition
|
|
386
423
|
|
|
387
|
-
###
|
|
424
|
+
### View Load History
|
|
388
425
|
|
|
389
426
|
```sql
|
|
390
|
-
-- 通过 cz-cli sql "<SQL>" --sync 执行
|
|
391
427
|
SELECT * FROM load_history('my_schema.target_table')
|
|
392
|
-
ORDER BY
|
|
428
|
+
ORDER BY last_copy_time DESC
|
|
393
429
|
LIMIT 20;
|
|
394
430
|
```
|
|
395
431
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
### 通过 query_tag 过滤 PIPE 作业
|
|
432
|
+
### Filter PIPE Jobs via query_tag
|
|
399
433
|
|
|
400
|
-
PIPE
|
|
434
|
+
PIPE-executed jobs are automatically tagged with `query_tag` in the format: `pipe.<workspace_name>.<schema_name>.<pipe_name>`
|
|
401
435
|
|
|
402
436
|
```sql
|
|
403
|
-
--
|
|
404
|
-
-- 在 JOBS 列表中过滤 PIPE 相关作业
|
|
437
|
+
-- Filter PIPE-related jobs in the JOBS list
|
|
405
438
|
SHOW JOBS WHERE query_tag = 'pipe.my_workspace.my_schema.my_oss_pipe';
|
|
406
439
|
```
|
|
407
440
|
|
|
408
441
|
---
|
|
409
442
|
|
|
410
|
-
## PIPE
|
|
443
|
+
## PIPE Management Operations
|
|
411
444
|
|
|
412
|
-
###
|
|
445
|
+
### Pause / Resume PIPE
|
|
413
446
|
|
|
414
447
|
```sql
|
|
415
|
-
--
|
|
448
|
+
-- Pause PIPE
|
|
416
449
|
ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = true;
|
|
417
450
|
|
|
418
|
-
--
|
|
451
|
+
-- Resume PIPE
|
|
419
452
|
ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = false;
|
|
420
453
|
```
|
|
421
454
|
|
|
422
|
-
###
|
|
455
|
+
### Modify PIPE Properties
|
|
423
456
|
|
|
424
457
|
```sql
|
|
425
|
-
--
|
|
458
|
+
-- Change virtual cluster
|
|
426
459
|
ALTER PIPE my_oss_pipe SET VIRTUAL_CLUSTER = 'new_vc';
|
|
427
460
|
|
|
428
|
-
--
|
|
429
|
-
--
|
|
461
|
+
-- Modify COPY_JOB_HINT (note: overwrites all existing hints; set all parameters at once)
|
|
462
|
+
-- Must be valid JSON format with double-quoted keys and values
|
|
430
463
|
ALTER PIPE my_oss_pipe SET COPY_JOB_HINT = '{"max_file_count":"100","force":"false"}';
|
|
431
464
|
```
|
|
432
465
|
|
|
433
|
-
>
|
|
466
|
+
> **Limitation**: Each ALTER PIPE can only modify one property at a time.
|
|
434
467
|
|
|
435
|
-
###
|
|
468
|
+
### Drop PIPE
|
|
436
469
|
|
|
437
470
|
```sql
|
|
438
471
|
DROP PIPE IF EXISTS my_oss_pipe;
|
|
@@ -440,123 +473,17 @@ DROP PIPE IF EXISTS my_oss_pipe;
|
|
|
440
473
|
|
|
441
474
|
---
|
|
442
475
|
|
|
443
|
-
##
|
|
444
|
-
|
|
445
|
-
| 问题 | 排查方向 |
|
|
446
|
-
|------|---------|
|
|
447
|
-
| PIPE 创建后无数据加载 | 1. `DESC PIPE EXTENDED` 检查是否暂停 2. 确认 Volume 路径下有新文件 3. 检查 COPY INTO 是否能独立运行 |
|
|
448
|
-
| LIST_PURGE 模式文件未被删除 | 确认 `PURGE=true` 已设置(紧跟 `USING <format>` 之后);检查 Connection 的 AccessKey 是否有删除权限 |
|
|
449
|
-
| `PURGE=true` 语法错误 | OPTIONS 必须在 PURGE 之前:`USING CSV OPTIONS (...) PURGE=true`。不要写成 `USING CSV PURGE=true OPTIONS(...)` |
|
|
450
|
-
| EVENT_NOTIFICATION 模式无触发 | 1. 检查 MNS/SQS 队列是否收到消息 2. 确认 OSS 事件通知规则配置正确 3. 检查 Role ARN 授权 |
|
|
451
|
-
| 重复加载数据 | `load_history` 去重记录仅保留 7 天,超过 7 天的同名文件会被重新加载 |
|
|
452
|
-
| COPY_JOB_HINT 修改后部分参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需在一次 ALTER 中设置全部参数 |
|
|
453
|
-
| INSERT INTO FROM VOLUME 后 load_history 无记录 | 正常行为:只有 `COPY INTO` 会记录到 load_history,`INSERT INTO` 不会 |
|
|
454
|
-
| COPY INTO 报格式错误 | Volume 中有多种格式文件,使用 `FILES('xxx.json')` 指定文件 |
|
|
455
|
-
|
|
456
|
-
## 注意事项
|
|
457
|
-
|
|
458
|
-
### PIPE 持续导入(模式 A / B)
|
|
459
|
-
|
|
460
|
-
- 每个 PIPE 需对应独立的 Volume,不可多个 PIPE 共用同一 Volume
|
|
461
|
-
- PIPE 中的 COPY 语句不支持 `files` / `regexp` / `subdirectory` 参数
|
|
462
|
-
- 数据加载无法保证严格有序(多文件并行加载)
|
|
463
|
-
- 推荐文件大小:gzip 压缩 ≈ 50MB,CSV/Parquet 未压缩 128MB ~ 256MB
|
|
464
|
-
- `load_history` 去重记录保留 7 天,超期后同名文件可能被重复加载
|
|
465
|
-
- 修改 COPY 逻辑需删除 PIPE 重新创建,ALTER PIPE 不支持修改 COPY 语句
|
|
466
|
-
|
|
467
|
-
### 批量导入(模式 C)
|
|
468
|
-
|
|
469
|
-
- Volume 支持阿里云 OSS、腾讯云 COS 和 AWS S3
|
|
470
|
-
- 不支持跨云导入(源存储与 Lakehouse 环境需在同一云平台)
|
|
471
|
-
- 同地域建议使用内网 Endpoint 以提升传输速度和稳定性
|
|
472
|
-
- 推荐使用 GENERAL PURPOSE 类型虚拟集群执行批量加载任务
|
|
473
|
-
- INSERT INTO 方式支持 `FILES()` 和 `WHERE` 参数,COPY INTO 不支持
|
|
474
|
-
- Connection 参数使用 `access_id`/`access_key`(小写)或 `ACCESS_KEY_ID`/`ACCESS_KEY_SECRET`(大写),不要用 `ACCESS_KEY`/`SECRET_KEY`
|
|
475
|
-
- ⚠️ `INSERT INTO ... FROM VOLUME` 不会记录到 `load_history`,只有 `COPY INTO` 会记录
|
|
476
|
-
- ⚠️ Volume 中有多种格式文件时,不指定 `FILES()` 的 COPY INTO 会尝试读取所有文件,可能因格式不匹配而失败。建议使用 `FILES('xxx.json')` 指定文件或 `SUBDIRECTORY` 指定子目录
|
|
477
|
-
- 上传文件到 OSS 后,`SHOW VOLUME DIRECTORY` 可能需要先执行 `ALTER VOLUME name REFRESH` 刷新目录元数据
|
|
476
|
+
## Troubleshooting
|
|
478
477
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
478
|
+
| Issue | Investigation Steps |
|
|
479
|
+
|-------|-------------------|
|
|
480
|
+
| No data loaded after PIPE creation | 1. Run `DESC PIPE EXTENDED` to check whether the PIPE is paused 2. Confirm new files exist in the Volume path 3. Verify that the COPY INTO statement succeeds when run on its own, outside the PIPE |
|
|
481
|
+
| Files not deleted in LIST_PURGE mode | Confirm `PURGE=true` is set (after `USING <format>` and any `OPTIONS (...)` clause); check if Connection's AccessKey has delete permissions |
|
|
482
|
+
| `PURGE=true` syntax error | OPTIONS must come before PURGE: `USING CSV OPTIONS (...) PURGE=true`. Do not write `USING CSV PURGE=true OPTIONS(...)` |
|
|
483
|
+
| EVENT_NOTIFICATION mode not triggering | 1. Check if MNS/SQS queue is receiving messages 2. Confirm OSS event notification rules are configured correctly 3. Check Role ARN authorization |
|
|
484
|
+
| Duplicate data loading | `load_history` deduplication records are retained for only 7 days; once a file's record has expired, a file with the same name will be loaded again |
|
|
485
|
+
| Some parameters lost after COPY_JOB_HINT modification | `SET COPY_JOB_HINT` overwrites all existing hints; set all parameters in a single ALTER |
|
|
486
|
+
| No load_history record after INSERT INTO FROM VOLUME | Expected behavior: only `COPY INTO` records to load_history; `INSERT INTO` does not |
|
|
487
|
+
| COPY INTO format error | Volume contains files of multiple formats; use `FILES('xxx.json')` to specify files |
|
|
489
488
|
|
|
490
|
-
# 步骤 2:创建外部 Volume
|
|
491
|
-
cz-cli agent run "创建外部 Volume,名称 <pipe_volume>,使用 Connection <my_oss_connection>,路径 oss://<bucket>/<data-path>/" \
|
|
492
|
-
--format a2a --dangerously-skip-permissions
|
|
493
|
-
|
|
494
|
-
# 步骤 3:验证 COPY INTO 可独立运行
|
|
495
|
-
cz-cli agent run "用 COPY INTO 从 Volume <pipe_volume> 加载数据到表 <schema>.<table>,文件格式 CSV,有 header,验证数据能正常加载" \
|
|
496
|
-
--format a2a --dangerously-skip-permissions
|
|
497
|
-
|
|
498
|
-
# 步骤 4:创建 LIST_PURGE 模式 PIPE
|
|
499
|
-
cz-cli agent run "创建 PIPE <my_oss_pipe>,INGEST_MODE 为 LIST_PURGE,使用 VCluster <my_vc>,从 Volume <pipe_volume> 以 CSV 格式(有 header,purge=true)持续导入数据到表 <schema>.<table>" \
|
|
500
|
-
--format a2a --dangerously-skip-permissions
|
|
501
|
-
|
|
502
|
-
# 步骤 5:验证 PIPE 状态
|
|
503
|
-
cz-cli agent run "查看 PIPE <my_oss_pipe> 的详细状态,确认 pipe_execution_paused 为 false" \
|
|
504
|
-
--format a2a --dangerously-skip-permissions
|
|
505
|
-
```
|
|
506
|
-
|
|
507
|
-
---
|
|
508
|
-
|
|
509
|
-
### 模式 B:EVENT_NOTIFICATION 消息通知模式(cz-cli 版)
|
|
510
|
-
|
|
511
|
-
```bash
|
|
512
|
-
# 步骤 1:创建 Role ARN 方式的存储连接
|
|
513
|
-
cz-cli agent run "创建 OSS Storage Connection,名称 <my_oss_role_connection>,endpoint <oss-cn-hangzhou.aliyuncs.com>,使用 Role ARN <acs:ram::xxx:role/clickzetta-oss-role>,region cn-hangzhou" \
|
|
514
|
-
--format a2a --dangerously-skip-permissions
|
|
515
|
-
|
|
516
|
-
# 步骤 2:创建外部 Volume
|
|
517
|
-
cz-cli agent run "创建外部 Volume,名称 <pipe_event_volume>,使用 Connection <my_oss_role_connection>,路径 oss://<bucket>/<data-path>/" \
|
|
518
|
-
--format a2a --dangerously-skip-permissions
|
|
519
|
-
|
|
520
|
-
# 步骤 3:创建 EVENT_NOTIFICATION 模式 PIPE
|
|
521
|
-
cz-cli agent run "创建 PIPE <my_oss_event_pipe>,INGEST_MODE 为 EVENT_NOTIFICATION,使用 VCluster <my_vc>,ALICLOUD_MNS_QUEUE 为 <my-mns-queue-name>,从 Volume <pipe_event_volume> 以 CSV 格式持续导入数据到表 <schema>.<table>" \
|
|
522
|
-
--format a2a --dangerously-skip-permissions
|
|
523
|
-
```
|
|
524
489
|
|
|
525
|
-
---
|
|
526
|
-
|
|
527
|
-
### 模式 C:批量导入(cz-cli 版)
|
|
528
|
-
|
|
529
|
-
```bash
|
|
530
|
-
# 步骤 1:创建目标表
|
|
531
|
-
cz-cli agent run "在 schema <my_schema> 下创建表 <target_table>,字段:id STRING, name STRING, amount DECIMAL(10,2), created_date STRING" \
|
|
532
|
-
--format a2a --dangerously-skip-permissions
|
|
533
|
-
|
|
534
|
-
# 步骤 2-3:创建存储连接和 Volume
|
|
535
|
-
cz-cli agent run "创建 OSS Storage Connection <my_batch_conn>,endpoint <oss-cn-shanghai-internal.aliyuncs.com>,access_id <id>,access_key <key>;然后创建外部 Volume <my_batch_volume>,路径 oss://<bucket>/<data-path>/,启用目录自动刷新" \
|
|
536
|
-
--format a2a --dangerously-skip-permissions
|
|
537
|
-
|
|
538
|
-
# 步骤 4:从 Volume 导入数据
|
|
539
|
-
cz-cli agent run "从 Volume <my_batch_volume> 以 CSV 格式(有 header)将数据导入表 <my_schema>.<target_table>" \
|
|
540
|
-
--format a2a --dangerously-skip-permissions
|
|
541
|
-
|
|
542
|
-
# 步骤 5:验证导入结果
|
|
543
|
-
cz-cli agent run "查询表 <my_schema>.<target_table> 的总行数和前 10 条数据,验证导入结果" \
|
|
544
|
-
--format a2a --dangerously-skip-permissions
|
|
545
|
-
```
|
|
546
|
-
|
|
547
|
-
---
|
|
548
|
-
|
|
549
|
-
### 监控与运维(cz-cli 版)
|
|
550
|
-
|
|
551
|
-
```bash
|
|
552
|
-
# 查看 PIPE 状态
|
|
553
|
-
cz-cli agent run "查看 PIPE <my_oss_pipe> 的详细状态和加载历史" \
|
|
554
|
-
--format a2a --dangerously-skip-permissions
|
|
555
|
-
|
|
556
|
-
# 暂停/恢复 PIPE
|
|
557
|
-
cz-cli agent run "暂停 PIPE <my_oss_pipe>" \
|
|
558
|
-
--format a2a --dangerously-skip-permissions
|
|
559
|
-
|
|
560
|
-
cz-cli agent run "恢复 PIPE <my_oss_pipe>" \
|
|
561
|
-
--format a2a --dangerously-skip-permissions
|
|
562
|
-
```
|