@clickzetta/cz-cli-darwin-x64 0.3.92 → 0.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/cz-cli +0 -0
  2. package/bin/skills/clickzetta-ai-function/SKILL.md +109 -0
  3. package/bin/skills/clickzetta-ai-function/eval_cases.jsonl +4 -0
  4. package/bin/skills/clickzetta-ai-function/references/ai-function-ddl.md +106 -0
  5. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +124 -124
  6. package/bin/skills/clickzetta-batch-sync-pipeline/eval_cases.jsonl +5 -5
  7. package/bin/skills/clickzetta-bi-connect/SKILL.md +79 -78
  8. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +56 -56
  9. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +386 -382
  10. package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -5
  11. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +73 -212
  12. package/bin/skills/clickzetta-data-science/SKILL.md +57 -56
  13. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +38 -38
  14. package/bin/skills/clickzetta-data-science/references/data-patterns.md +16 -16
  15. package/bin/skills/clickzetta-data-science/references/setup.md +28 -28
  16. package/bin/skills/clickzetta-data-science/references/stats-functions.md +44 -44
  17. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +22 -22
  18. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +32 -32
  19. package/bin/skills/clickzetta-dw-modeling/SKILL.md +1 -1
  20. package/bin/skills/clickzetta-external-function/SKILL.md +51 -109
  21. package/bin/skills/clickzetta-external-function/eval_cases.jsonl +4 -4
  22. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +39 -77
  23. package/bin/skills/clickzetta-java-sdk/SKILL.md +49 -48
  24. package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -12
  25. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +34 -34
  26. package/bin/skills/clickzetta-java-sdk/references/realtime.md +44 -44
  27. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +273 -507
  28. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +197 -231
  29. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +231 -304
  30. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +180 -179
  31. package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -5
  32. package/bin/skills/clickzetta-semantic-view/SKILL.md +74 -72
  33. package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -12
  34. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +75 -75
  35. package/bin/skills/clickzetta-sql-migration/SKILL.md +128 -0
  36. package/bin/skills/clickzetta-sql-migration/eval_cases.jsonl +10 -0
  37. package/bin/skills/clickzetta-sql-migration/references/ddl-reference.md +350 -0
  38. package/bin/skills/clickzetta-sql-migration/references/dml-differences.md +192 -0
  39. package/bin/skills/clickzetta-sql-migration/references/dml-reference.md +279 -0
  40. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/dql-reference.md +128 -128
  41. package/bin/skills/clickzetta-sql-migration/references/function-mapping.md +194 -0
  42. package/bin/skills/clickzetta-sql-migration/references/functions-reference.md +372 -0
  43. package/bin/skills/clickzetta-sql-migration/references/implicit-type-conversion.md +143 -0
  44. package/bin/skills/clickzetta-sql-migration/references/migration-databricks.md +260 -0
  45. package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/references/migration-snowflake.md +112 -112
  46. package/bin/skills/clickzetta-sql-migration/references/vs-snowflake.md +346 -0
  47. package/bin/skills/clickzetta-sql-migration/references/vs-spark.md +229 -0
  48. package/bin/skills/clickzetta-studio-task-manager/SKILL.md +326 -329
  49. package/bin/skills/clickzetta-table-lineage/SKILL.md +57 -55
  50. package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -1
  51. package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +5 -5
  52. package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +6 -6
  53. package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +2 -2
  54. package/bin/skills/clickzetta-volume-manager/SKILL.md +186 -100
  55. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +153 -52
  56. package/package.json +1 -1
  57. package/bin/skills/clickzetta-dynamic-table/best-practices/scheduling-guide.md +0 -135
  58. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  59. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -260
  60. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -191
  61. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -249
  62. package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +0 -3
  63. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  64. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  65. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  66. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  67. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  68. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  69. /package/bin/skills/{clickzetta-sql-syntax-guide → clickzetta-sql-migration}/LICENSE +0 -0
@@ -1,117 +1,135 @@
1
1
  ---
2
2
  name: clickzetta-oss-ingest-pipeline
3
3
  description: |
4
- 搭建 ClickZetta 对象存储(OSS/S3/COS)数据导入管道,覆盖持续导入(PIPE)和批量一次性导入
5
- 两大场景。持续导入支持 LIST_PURGE 扫描模式和 EVENT_NOTIFICATION 消息通知模式;批量导入支持
6
- Volume + INSERT INTO 和 Volume + COPY INTO 两种方式。当用户说"对象存储导入"、"OSS 数据管道"、
7
- "S3 数据导入"、"PIPE 持续导入"、"文件自动加载"、"存储桶数据同步"、"COS 导入"、
8
- "批量导入 OSS"、"从 OSS 加载数据"、"Volume 导入"时触发。
9
- 包含 PIPE 持续导入(两种 INGEST_MODE)、批量导入(Volume + COPY/INSERT)、Connection/Volume 创建、
10
- 监控管理等 ClickZetta 特有逻辑。
4
+ Build ClickZetta object storage (OSS/S3/COS) data ingestion pipelines, covering both continuous
5
+ ingestion (PIPE) and one-time batch import scenarios. Continuous ingestion supports LIST_PURGE
6
+ scan mode and EVENT_NOTIFICATION message notification mode; batch import supports Volume + INSERT
7
+ INTO and Volume + COPY INTO methods. Triggered when user says "object storage import", "OSS data
8
+ pipeline", "S3 data import", "PIPE continuous ingestion", "auto file loading", "bucket data sync",
9
+ "COS import", "batch import from OSS", "load data from OSS", "Volume import".
10
+ Includes PIPE continuous ingestion (two INGEST_MODEs), batch import (Volume + COPY/INSERT),
11
+ Connection/Volume creation, monitoring and management — all ClickZetta-specific logic.
11
12
  Keywords: OSS, S3, COS, object storage, PIPE, COPY INTO, file ingestion
12
13
  ---
13
14
 
14
- # 对象存储数据管道搭建工作流
15
+ # Object Storage Data Pipeline Setup Workflow
15
16
 
16
- ## 向导:收集必要信息
17
+ ## Wizard: Collect Required Information
17
18
 
18
- 开始搭建对象存储管道前,优先使用交互式问答工具(如 `question`)收集以下信息并弹出选项菜单;若无此类工具,则用文字一次性列出所有问题:
19
+ Before building an object storage pipeline, preferably use an interactive Q&A tool (e.g., `question`) to collect the following information via a selection menu; if no such tool is available, list all questions in text at once:
19
20
 
20
21
  ```
21
22
  question({
22
23
  questions: [
23
24
  {
24
- question: "云平台?",
25
+ question: "Cloud platform?",
25
26
  options: [
26
- { label: "阿里云 OSS", description: "支持 LIST_PURGE EVENT_NOTIFICATION 两种模式" },
27
- { label: "AWS S3", description: "支持 LIST_PURGE EVENT_NOTIFICATION 两种模式" },
28
- { label: "腾讯云 COS", description: "仅支持 LIST_PURGE 模式" }
27
+ { label: "Alibaba Cloud OSS", description: "Supports both LIST_PURGE and EVENT_NOTIFICATION modes" },
28
+ { label: "AWS S3", description: "Supports both LIST_PURGE and EVENT_NOTIFICATION modes" },
29
+ { label: "Tencent Cloud COS", description: "Only supports LIST_PURGE mode" }
29
30
  ]
30
31
  },
31
32
  {
32
- question: "导入模式?",
33
+ question: "Import mode?",
33
34
  options: [
34
- { label: "持续导入(PIPE", description: "新文件自动触发导入,近实时" },
35
- { label: "批量一次性导入", description: "手动或定时执行 COPY INTO" }
35
+ { label: "Continuous ingestion (PIPE)", description: "New files automatically trigger import, near real-time" },
36
+ { label: "One-time batch import", description: "Manually or scheduled COPY INTO execution" }
36
37
  ]
37
38
  },
38
39
  {
39
- question: "文件格式?",
40
+ question: "File format?",
40
41
  options: [
41
- { label: "CSV", description: "逗号分隔文本" },
42
- { label: "JSON / JSONL", description: "JSON 或换行分隔 JSON" },
43
- { label: "Parquet", description: "列式存储格式" },
44
- { label: "ORC", description: "列式存储格式" }
42
+ { label: "CSV", description: "Comma-separated text" },
43
+ { label: "JSON / JSONL", description: "JSON or newline-delimited JSON" },
44
+ { label: "Parquet", description: "Columnar storage format" },
45
+ { label: "ORC", description: "Columnar storage format" }
45
46
  ]
46
47
  }
47
48
  ]
48
49
  })
49
50
  ```
50
51
 
51
- **如果用户已经提供了足够信息,直接进入工作流,不再弹出菜单。**
52
+ **If the user has already provided sufficient information, proceed directly to the workflow without showing the menu.**
52
53
 
53
54
  ---
54
55
 
55
- ## 适用场景
56
+ ## Decision Tree
56
57
 
57
- - 从对象存储(阿里云 OSS / AWS S3 / 腾讯云 COS)持续自动导入数据到 Lakehouse(PIPE 模式)
58
- - 从对象存储批量一次性导入数据到 Lakehouse(Volume + COPY/INSERT 模式)
59
- - 需要微批处理方式加载新增文件,实现近实时数据同步
60
- - 需要选择扫描模式(LIST_PURGE)或消息通知模式(EVENT_NOTIFICATION)
61
- - 需要对导入数据进行过滤转换(WHERE 条件、指定文件)
62
- - 关键词:OSS PIPE、S3 导入、对象存储管道、文件自动加载、PIPE 持续导入、COS 数据同步、批量导入、Volume 导入
58
+ ```
59
+ Is data arriving continuously (new files added over time)?
60
+ ├─ YES → Use PIPE (continuous ingestion)
61
+ │ ├─ Need low latency (< 1 min) AND on Alibaba Cloud OSS or AWS S3?
62
+ │ │ ├─ YES → Mode B: EVENT_NOTIFICATION
63
+ │ │ └─ NO → Mode A: LIST_PURGE
64
+ │ └─ On Tencent Cloud COS?
65
+ │ └─ Mode A: LIST_PURGE (only option)
66
+ └─ NO → One-time or scheduled load
67
+ └─ Mode C: Batch Import (Volume + COPY INTO / INSERT INTO)
68
+ ├─ Need deduplication protection? → Use COPY INTO
69
+ ├─ Need filtering/file selection? → Use INSERT INTO
70
+ └─ Need idempotent overwrite? → Use COPY OVERWRITE INTO
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Applicable Scenarios
63
76
 
64
- ## 前置依赖
77
+ - Continuous auto-import from OSS/S3/COS to Lakehouse (PIPE)
78
+ - One-time or scheduled batch import (Volume + COPY/INSERT)
79
+ - Near real-time micro-batch file loading
80
+ - Filtering or transforming data during import
65
81
 
66
- - ClickZetta Lakehouse 账户,具备创建 PIPE、表、存储连接、Volume 等权限
67
- - 对象存储桶可达(Endpoint、AccessKey 或 Role ARN)
68
- - **执行环境**:已安装并配置 cz-cli
82
+ ## Prerequisites
69
83
 
70
- ## 执行环境
84
+ - ClickZetta Lakehouse account with permissions to create PIPEs, tables, storage connections, Volumes, etc.
85
+ - Object storage bucket is reachable (Endpoint, AccessKey, or Role ARN)
86
+ - **Execution environment**: cz-cli installed and configured
71
87
 
72
- 所有 SQL 通过 `cz-cli sql` 执行:
88
+ ## Execution Environment
89
+
90
+ All SQL is executed via `cz-cli sql`:
73
91
 
74
92
  ```bash
75
- cz-cli --version # 确认 cz-cli 可用
76
- cz-cli sql "SELECT 1" --sync # 验证连接
93
+ cz-cli --version # Confirm cz-cli is available
94
+ cz-cli sql "SELECT 1" --sync # Verify connection
77
95
  ```
78
96
 
79
- 需要 cz-cli,请参考官方文档安装并完成配置后重试。
97
+ If cz-cli is not installed, refer to the official documentation to install and configure it, then retry.
80
98
 
81
- ## 核心概念
99
+ ## Core Concepts
82
100
 
83
- ### INGEST_MODE 选择指引
101
+ ### INGEST_MODE Selection Guide
84
102
 
85
- | 模式 | 触发方式 | 适用场景 | 云平台支持 | 授权方式 |
86
- |------|---------|---------|-----------|---------|
87
- | `LIST_PURGE` | 定期扫描目录 | 通用场景,导入后删除源文件 | 所有云平台 | 密钥或 Role ARN |
88
- | `EVENT_NOTIFICATION` | 消息服务通知 | 低延迟场景,文件上传即触发 | 仅阿里云 OSS + AWS S3 | Role ARN |
103
+ | Mode | Trigger Method | Use Case | Cloud Platform Support | Auth Method |
104
+ |------|---------------|----------|----------------------|-------------|
105
+ | `LIST_PURGE` | Periodic directory scan | General purpose, deletes source files after import | All cloud platforms | Access Key or Role ARN |
106
+ | `EVENT_NOTIFICATION` | Message service notification | Low-latency scenarios, triggered on file upload | Alibaba Cloud OSS + AWS S3 only | Role ARN only |
89
107
 
90
- ### 关键限制
108
+ ### Key Limitations
91
109
 
92
- - 每个 PIPE 需对应独立的 Volume,不可复用
93
- - 不支持修改 COPY 语句逻辑,需删除 PIPE 重新创建
94
- - PIPE 中的 COPY 语句不支持 `files` / `regexp` / `subdirectory` 参数
95
- - 数据加载无法保证严格有序
96
- - `load_history` 去重记录保留 7 天
97
- - 修改 `COPY_JOB_HINT` 会覆盖所有已有 hints,需一次性设置全部参数
98
- - **Volume PIPE 不支持 Kafka 专用参数**:`BATCH_INTERVAL_IN_SECONDS`、`BATCH_SIZE_PER_KAFKA_PARTITION`、`MAX_SKIP_BATCH_COUNT_ON_ERROR` 仅适用于 Kafka PIPE
99
- - **`COPY_JOB_HINT` 必须是合法 JSON 格式**,键值都要用双引号:`'{"IGNORE_TMP_FILE": "true"}'`,不能用 `KEY=VALUE` 格式
110
+ - Each PIPE requires a dedicated Volume; Volumes cannot be shared across PIPEs
111
+ - **PIPE `VIRTUAL_CLUSTER` should be a General Purpose (GP) cluster** (recommended); AP clusters also work but GP is best suited for ingestion workloads. Integration (Sync) clusters are not supported for PIPE execution.
112
+ - COPY statement logic cannot be modified; delete and recreate the PIPE instead
113
+ - COPY statements in PIPEs do not support `files` / `regexp` / `subdirectory` parameters
114
+ - Data loading order is not strictly guaranteed
115
+ - `load_history` deduplication records are retained for 7 days
116
+ - Modifying `COPY_JOB_HINT` overwrites all existing hints; set all parameters at once
117
+ - **Volume PIPEs do not support Kafka-specific parameters**: `BATCH_INTERVAL_IN_SECONDS`, `BATCH_SIZE_PER_KAFKA_PARTITION`, `MAX_SKIP_BATCH_COUNT_ON_ERROR` apply only to Kafka PIPEs
118
+ - **`COPY_JOB_HINT` must be valid JSON format** with double-quoted keys and values: `'{"IGNORE_TMP_FILE": "true"}'`; do not use `KEY=VALUE` format
100
119
 
101
- ### 文件大小建议
120
+ ### File Size Recommendations
102
121
 
103
- - gzip 压缩文件:≈ 50MB
104
- - CSV / PARQUET 未压缩文件:128MB ~ 256MB
122
+ - gzip compressed files: ~50MB
123
+ - CSV / Parquet uncompressed files: 128MB–256MB
105
124
 
106
- ## 工作流
125
+ ## Workflow
107
126
 
108
- ### 模式 A:LIST_PURGE 扫描模式(通用)
127
+ ### Mode A: LIST_PURGE Scan Mode (General Purpose)
109
128
 
110
- #### 步骤 1:创建存储连接(Storage Connection)
129
+ #### Step 1: Create Storage Connection
111
130
 
112
131
  ```sql
113
- -- 通过 cz-cli sql "<SQL>" --sync 执行
114
- -- 密钥方式(LIST_PURGE 模式支持)
132
+ -- Access Key method (supported by LIST_PURGE mode)
115
133
  CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
116
134
  TYPE OSS
117
135
  access_id = '<your_access_key_id>'
@@ -119,18 +137,17 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_connection
119
137
  ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com';
120
138
  ```
121
139
 
122
- > **参数说明**:
123
- > - `access_id`:对应阿里云控制台的 **AccessKey ID**
124
- > - `access_key`:对应阿里云控制台的 **AccessKey Secret**
125
- > - 也可使用大写形式 `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`
126
- > - ⚠️ `ACCESS_KEY` / `SECRET_KEY` 会报错(缺少 `_ID` / `_SECRET` 后缀)
140
+ > **Parameter notes**:
141
+ > - `access_id`: Corresponds to **AccessKey ID** in the Alibaba Cloud console
142
+ > - `access_key`: Corresponds to **AccessKey Secret** in the Alibaba Cloud console
143
+ > - Uppercase forms `ACCESS_KEY_ID` / `ACCESS_KEY_SECRET` are also accepted
144
+ > - ⚠️ `ACCESS_KEY` / `SECRET_KEY` will error (missing `_ID` / `_SECRET` suffix)
127
145
  >
128
- > **提示**:如果使用 Role ARN 方式(EVENT_NOTIFICATION 模式必须),参见下方"模式 B"中的 Connection 创建语法。
146
+ > **Tip**: For Role ARN method (required for EVENT_NOTIFICATION mode), see the Connection creation syntax in "Mode B" below.
129
147
 
130
- #### 步骤 2:创建外部 Volume
148
+ #### Step 2: Create External Volume
131
149
 
132
150
  ```sql
133
- -- 通过 cz-cli sql "<SQL>" --sync 执行
134
151
  CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
135
152
  LOCATION 'oss://my-bucket/data-path/'
136
153
  USING CONNECTION my_oss_connection
@@ -139,30 +156,36 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_volume
139
156
  COMMENT 'Volume for OSS PIPE ingestion';
140
157
  ```
141
158
 
142
- > **关键参数**:
143
- > - `RECURSIVE = true`:递归扫描子目录
144
- > - `DIRECTORY = (enable = true, auto_refresh = true)`:自动刷新目录元数据
145
- > - ⚠️ COMMENT 不带等号:`COMMENT 'text'`(不是 `COMMENT = 'text'`)
159
+ > **Key parameters**:
160
+ > - `RECURSIVE = true`: Recursively scan subdirectories
161
+ > - `DIRECTORY = (enable = true, auto_refresh = true)`: Auto-refresh directory metadata
162
+ > - ⚠️ COMMENT has no equals sign: `COMMENT 'text'` (not `COMMENT = 'text'`)
146
163
 
147
- #### 步骤 3:验证 COPY INTO 可独立运行
164
+ #### Step 3: Verify Schema and Sample Data
148
165
 
149
- 在创建 PIPE 之前,先用 COPY INTO 验证数据能正常加载:
166
+ Before creating the PIPE, probe the Volume with a SELECT to verify file parsing and schema mapping:
150
167
 
151
168
  ```sql
152
- -- 通过 cz-cli sql "<SQL>" --sync 执行
153
- COPY INTO my_schema.target_table
154
- FROM VOLUME pipe_volume
155
- USING CSV OPTIONS ('header' = 'true', 'delimiter' = ',') PURGE=true;
169
+ SELECT *
170
+ FROM VOLUME pipe_volume (
171
+ id STRING,
172
+ name STRING,
173
+ amount DECIMAL(10,2),
174
+ created_date STRING
175
+ ) USING CSV OPTIONS ('header' = 'true')
176
+ LIMIT 20;
156
177
  ```
157
178
 
158
- > **重要**:
159
- > - PIPE 中的 COPY 语句不支持 `files`、`regexp`、`subdirectory` 参数。确保此处验证时也不使用这些参数。
160
- > - OPTIONS 放在 PURGE=true **之前**:`USING CSV OPTIONS (...) PURGE=true`
179
+ **→ Show the results to the user and ask for confirmation before proceeding to Step 4.**
161
180
 
162
- #### 步骤 4:创建 PIPE(LIST_PURGE 模式)
181
+ > **Notes**:
182
+ > - SELECT FROM VOLUME is read-only — no temp tables, no cleanup needed.
183
+ > - If columns appear misaligned or values are NULL, adjust the schema definition or OPTIONS before proceeding.
184
+ > - This validates the same parsing logic the PIPE's COPY INTO will use.
185
+
186
+ #### Step 4: Create PIPE (LIST_PURGE Mode)
163
187
 
164
188
  ```sql
165
- -- 通过 cz-cli sql "<SQL>" --sync 执行
166
189
  CREATE PIPE IF NOT EXISTS my_oss_pipe
167
190
  INGEST_MODE = 'LIST_PURGE'
168
191
  VIRTUAL_CLUSTER = 'my_vc'
@@ -173,52 +196,49 @@ FROM VOLUME pipe_volume
173
196
  USING CSV OPTIONS ('header' = 'true') PURGE=true;
174
197
  ```
175
198
 
176
- > **⚠️ 语法关键点**:
177
- > - `PURGE=true` 放在最后:`USING <format> [OPTIONS (...)] PURGE=true`
178
- > - OPTIONS 放在 PURGE=true **之前**(如果需要的话)
179
- > - 也可以不带 OPTIONS:`USING CSV PURGE=true`(推荐简洁写法)
180
- > - COMMENT 不带等号:`COMMENT 'text'`
181
- > - 大写 `PURGE`,小写 `true`,中间用 `=` 连接,无空格
182
- > - **LIST_PURGE 模式必须设置** `PURGE=true`,加载成功后删除源文件(避免重复导入)
183
- > - 即使不想删除源文件,LIST_PURGE 模式也需要此参数,否则会重复导入同一文件
184
- > - `VIRTUAL_CLUSTER`:指定执行 PIPE 任务的虚拟集群
199
+ > **⚠️ Syntax key points**:
200
+ > - `PURGE=true` goes at the end: `USING <format> [OPTIONS (...)] PURGE=true`
201
+ > - OPTIONS goes **before** PURGE=true (if needed)
202
+ > - Can also omit OPTIONS: `USING CSV PURGE=true` (recommended concise form)
203
+ > - Uppercase `PURGE`, lowercase `true`, connected with `=`, no spaces
204
+ > - **LIST_PURGE mode requires** `PURGE=true`; source files are deleted after successful load (prevents duplicate imports)
205
+ > - Even if you don't want to delete source files, LIST_PURGE mode still requires this parameter, otherwise the same file will be imported repeatedly
206
+ > - `VIRTUAL_CLUSTER`: Specifies the virtual cluster that executes the PIPE task
185
207
  >
186
- > **错误写法**(会报语法错误):
208
+ > **Incorrect syntax** (will cause syntax errors):
187
209
  > ```sql
188
- > -- ❌ 不要把 purge 放在 OPTIONS
210
+ > -- ❌ Do not put purge inside OPTIONS
189
211
  > OPTIONS ('header' = 'true', 'purge' = 'true')
190
- > -- ❌ OPTIONS 不能在 PURGE 之后
212
+ > -- ❌ OPTIONS cannot come after PURGE
191
213
  > USING CSV PURGE=true OPTIONS ('header' = 'true')
192
- > -- ❌ 不要用小写或加引号
214
+ > -- ❌ Do not use lowercase or quotes
193
215
  > 'purge'='true'
194
216
  > ```
195
217
 
196
- #### 步骤 5:验证 PIPE 状态
218
+ #### Step 5: Verify PIPE Status
197
219
 
198
220
  ```sql
199
- -- 通过 cz-cli sql "<SQL>" --sync 执行
200
221
  DESC PIPE EXTENDED my_oss_pipe;
201
222
  ```
202
223
 
203
- 确认 `pipe_execution_paused = false`(PIPE 已启动运行)。
224
+ Confirm `pipe_execution_paused = false` (PIPE is running).
204
225
 
205
226
  ---
206
227
 
207
- ### 模式 B:EVENT_NOTIFICATION 消息通知模式(低延迟)
228
+ ### Mode B: EVENT_NOTIFICATION Message Notification Mode (Low Latency)
208
229
 
209
- > 仅支持阿里云 OSS + AWS S3。文件上传到桶后,通过消息服务(MNS/SQS)通知 Lakehouse 立即加载。
230
+ > Supported on Alibaba Cloud OSS + AWS S3 only. After files are uploaded to the bucket, Lakehouse is notified via message service (MNS/SQS) to load immediately.
210
231
 
211
- #### 前置准备(阿里云 OSS 示例)
232
+ #### Prerequisites (Alibaba Cloud OSS Example)
212
233
 
213
- 1. **开通阿里云 MNS 消息服务**:在阿里云控制台开通消息服务 MNS
214
- 2. **配置 OSS 事件通知**:在 OSS 桶 → 事件通知 → 创建规则,事件类型选择 `ObjectCreated`,目标选择 MNS 队列
215
- 3. **授权 OSS 读取权限**:创建 RAM 角色,授予 `oss:GetObject`、`oss:ListBucket` 权限,记录 Role ARN
216
- 4. **授权 MNS 给 Lakehouse**:将 Lakehouse 服务账号添加到 MNS 队列的授权策略中
234
+ 1. **Enable Alibaba Cloud MNS**: Activate Message Service (MNS) in the Alibaba Cloud console
235
+ 2. **Configure OSS event notification**: In the OSS bucket console, go to Event Notification → Create Rule, select event type `ObjectCreated`, and set the target to the MNS queue
236
+ 3. **Grant OSS read permissions**: Create a RAM role, grant `oss:GetObject` and `oss:ListBucket` permissions, record the Role ARN
237
+ 4. **Authorize MNS to Lakehouse**: Add the Lakehouse service account to the MNS queue's authorization policy
217
238
 
218
- #### 步骤 1:创建存储连接(Role ARN 方式)
239
+ #### Step 1: Create Storage Connection (Role ARN Method)
219
240
 
220
241
  ```sql
221
- -- 通过 cz-cli sql "<SQL>" --sync 执行
222
242
  CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
223
243
  TYPE OSS
224
244
  ENDPOINT = 'oss-cn-hangzhou.aliyuncs.com'
@@ -226,10 +246,9 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_oss_role_connection
226
246
  REGION = 'cn-hangzhou';
227
247
  ```
228
248
 
229
- #### 步骤 2:创建外部 Volume
249
+ #### Step 2: Create External Volume
230
250
 
231
251
  ```sql
232
- -- 通过 cz-cli sql "<SQL>" --sync 执行
233
252
  CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
234
253
  LOCATION 'oss://my-bucket/data-path/'
235
254
  USING CONNECTION my_oss_role_connection
@@ -237,10 +256,9 @@ CREATE EXTERNAL VOLUME IF NOT EXISTS pipe_event_volume
237
256
  RECURSIVE = true;
238
257
  ```
239
258
 
240
- #### 步骤 3:创建 PIPE(EVENT_NOTIFICATION 模式)
259
+ #### Step 3: Create PIPE (EVENT_NOTIFICATION Mode)
241
260
 
242
261
  ```sql
243
- -- 通过 cz-cli sql "<SQL>" --sync 执行
244
262
  CREATE PIPE IF NOT EXISTS my_oss_event_pipe
245
263
  INGEST_MODE = 'EVENT_NOTIFICATION'
246
264
  VIRTUAL_CLUSTER = 'my_vc'
@@ -252,28 +270,26 @@ FROM VOLUME pipe_event_volume
252
270
  USING CSV;
253
271
  ```
254
272
 
255
- > **参数说明**:
256
- > - `INGEST_MODE = 'EVENT_NOTIFICATION'`:通过消息通知触发加载
257
- > - `ALICLOUD_MNS_QUEUE`:阿里云 MNS 队列名称(AWS 使用 `AWS_SQS_QUEUE`)
258
- > - 此模式下不需要 `PURGE=true`,因为是事件驱动而非扫描
259
- > - COMMENT 不带等号:`COMMENT 'text'`
273
+ > **Parameter notes**:
274
+ > - `INGEST_MODE = 'EVENT_NOTIFICATION'`: Triggers loading via message notification
275
+ > - `ALICLOUD_MNS_QUEUE`: Alibaba Cloud MNS queue name (use `AWS_SQS_QUEUE` for AWS)
276
+ > - This mode does not require `PURGE=true` since it's event-driven rather than scan-based
260
277
 
261
278
  ---
262
279
 
263
- ### 模式 C:批量导入(一次性 Volume + COPY/INSERT)
280
+ ### Mode C: Batch Import (One-time Volume + COPY/INSERT)
264
281
 
265
- > 适用于一次性或定期批量加载对象存储中的文件,无需创建 PIPE。支持阿里云 OSS、腾讯云 COS 和 AWS S3。
266
- > 推荐使用 GENERAL PURPOSE 类型的虚拟集群执行批量加载。
282
+ > Suitable for one-time or scheduled batch loading of files from object storage; no PIPE creation needed. Supports Alibaba Cloud OSS, Tencent Cloud COS, and AWS S3.
283
+ > A GENERAL PURPOSE virtual cluster is recommended for batch loading.
267
284
 
268
- #### 使用限制
285
+ #### Usage Limitations
269
286
 
270
- - 不支持跨云导入(源存储与 Lakehouse 环境需在同一云平台)
271
- - 同地域建议使用内网 Endpoint(如 `oss-cn-shanghai-internal.aliyuncs.com`)以提升速度和稳定性
287
+ - Cross-cloud import is not supported (source storage and Lakehouse environment must be on the same cloud platform)
288
+ - Same-region internal endpoints are recommended (e.g., `oss-cn-shanghai-internal.aliyuncs.com`) for better speed and stability
272
289
 
273
- #### 步骤 1:创建目标表
290
+ #### Step 1: Create Target Table
274
291
 
275
292
  ```sql
276
- -- 通过 cz-cli sql "<SQL>" --sync 执行
277
293
  CREATE TABLE IF NOT EXISTS my_schema.target_table (
278
294
  id STRING,
279
295
  name STRING,
@@ -282,10 +298,9 @@ CREATE TABLE IF NOT EXISTS my_schema.target_table (
282
298
  );
283
299
  ```
284
300
 
285
- #### 步骤 2:创建存储连接(access_id/access_key 语法)
301
+ #### Step 2: Create Storage Connection (access_id/access_key Syntax)
286
302
 
287
303
  ```sql
288
- -- 通过 cz-cli sql "<SQL>" --sync 执行
289
304
  CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
290
305
  TYPE OSS
291
306
  ENDPOINT = 'oss-cn-shanghai-internal.aliyuncs.com'
@@ -293,35 +308,30 @@ CREATE STORAGE CONNECTION IF NOT EXISTS my_batch_conn
293
308
  access_key = '<your_access_key_secret>';
294
309
  ```
295
310
 
296
- > **Connection 参数命名**:
297
- > - 小写形式:`access_id` / `access_key`(推荐)
298
- > - 大写形式:`ACCESS_KEY_ID` / `ACCESS_KEY_SECRET`(也可以)
299
- > - ⚠️ `ACCESS_KEY` / `SECRET_KEY` 会报错(缺少后缀)
311
+ > **Connection parameter naming**: See Mode A Step 1 for accepted forms. Use `access_id`/`access_key` (lowercase, recommended) or `ACCESS_KEY_ID`/`ACCESS_KEY_SECRET`. Never use `ACCESS_KEY`/`SECRET_KEY`.
300
312
 
301
- #### 步骤 3:创建外部 Volume(启用目录自动刷新)
313
+ #### Step 3: Create External Volume (with Directory Auto-refresh)
302
314
 
303
315
  ```sql
304
- -- 通过 cz-cli sql "<SQL>" --sync 执行
305
316
  CREATE EXTERNAL VOLUME IF NOT EXISTS my_batch_volume
306
317
  LOCATION 'oss://my-bucket/data-path/'
307
318
  USING CONNECTION my_batch_conn
308
319
  DIRECTORY = (enable=true, auto_refresh=true);
309
320
  ```
310
321
 
311
- > **关键参数**:
312
- > - `LOCATION`:对象存储路径,格式为 `oss://bucket/path/`
313
- > - `USING CONNECTION`:引用已创建的存储连接
314
- > - `DIRECTORY = (enable=true, auto_refresh=true)`:启用目录元数据并自动刷新,便于查询 Volume 中的文件列表
322
+ > **Key parameters**:
323
+ > - `LOCATION`: Object storage path, format: `oss://bucket/path/`
324
+ > - `USING CONNECTION`: References the previously created storage connection
325
+ > - `DIRECTORY = (enable=true, auto_refresh=true)`: Enables directory metadata with auto-refresh for querying file lists in the Volume
315
326
  >
316
- > **Volume 创建语法统一说明**:
317
- > - ✅ 推荐语法:`LOCATION '...' USING CONNECTION conn_name`(官方文档标准写法)
318
- > - ⚠️ 旧语法:`STORAGE_CONNECTION = conn_name LOCATION = '...'`(部分旧文档中出现,仍可使用)
319
- > - 两种语法功能等价,建议统一使用 `LOCATION ... USING CONNECTION` 形式
327
+ > **Volume creation syntax notes**:
328
+ > - ✅ Recommended syntax: `LOCATION '...' USING CONNECTION conn_name` (official documentation standard)
329
+ > - ⚠️ Legacy syntax: `STORAGE_CONNECTION = conn_name LOCATION = '...'` (appears in some older docs, still works)
330
+ > - Both syntaxes are functionally equivalent; recommend using `LOCATION ... USING CONNECTION` consistently
320
331
 
321
- #### 步骤 4a:INSERT INTO 从 Volume 导入(支持过滤转换)
332
+ #### Step 4a: INSERT INTO from Volume (Supports Filtering and Transformation)
322
333
 
323
334
  ```sql
324
- -- 通过 cz-cli sql "<SQL>" --sync 执行
325
335
  INSERT INTO my_schema.target_table
326
336
  SELECT * FROM VOLUME my_batch_volume (
327
337
  id STRING,
@@ -333,17 +343,16 @@ FILES ('data_file_01.csv')
333
343
  WHERE amount > 0;
334
344
  ```
335
345
 
336
- > **参数说明**:
337
- > - `VOLUME my_batch_volume (...)`:指定 Volume 及列定义(Schema-on-Read)
338
- > - `USING CSV OPTIONS (...)`:指定文件格式和解析选项
339
- > - `FILES ('file1.csv', 'file2.csv')`:指定要加载的文件名(可选,不指定则加载全部)
340
- > - `WHERE ...`:对数据进行过滤转换(可选)
341
- > - INSERT INTO 方式支持 `FILES` 和 `WHERE` 参数,适合需要精细控制的场景
346
+ > **Parameter notes**:
347
+ > - `VOLUME my_batch_volume (...)`: Specifies Volume and column definitions (Schema-on-Read)
348
+ > - `USING CSV OPTIONS (...)`: Specifies file format and parsing options
349
+ > - `FILES ('file1.csv', 'file2.csv')`: Specifies files to load (optional; loads all if omitted)
350
+ > - `WHERE ...`: Filters and transforms data (optional)
351
+ > - INSERT INTO supports `FILES` and `WHERE` parameters, suitable for fine-grained control
342
352
 
343
- #### 步骤 4b:COPY INTO 从 Volume 导入(简洁语法)
353
+ #### Step 4b: COPY INTO from Volume (Concise Syntax)
344
354
 
345
355
  ```sql
346
- -- 通过 cz-cli sql "<SQL>" --sync 执行
347
356
  COPY INTO my_schema.target_table
348
357
  FROM VOLUME my_batch_volume (
349
358
  id STRING,
@@ -353,86 +362,110 @@ FROM VOLUME my_batch_volume (
353
362
  ) USING CSV OPTIONS ('header'='true', 'sep'=',');
354
363
  ```
355
364
 
356
- > **INSERT INTO vs COPY INTO 选择**:
357
- > - `INSERT INTO`:支持 `FILES()` 指定文件、`WHERE` 过滤转换,适合精细控制
358
- > - `COPY INTO`:语法更简洁,适合全量加载
359
- > - 两者都支持 Schema-on-Read(在 FROM VOLUME 中定义列)
360
- > - ⚠️ **load_history 差异**:只有 `COPY INTO` 会记录到 `load_history`,`INSERT INTO ... FROM VOLUME` 不会记录。如需去重保护,请使用 `COPY INTO`
365
+ > **INSERT INTO vs COPY INTO selection**:
366
+ > - `INSERT INTO`: Supports `FILES()` for specifying files and `WHERE` for filtering/transformation; suitable for fine-grained control
367
+ > - `COPY INTO`: More concise syntax; suitable for full loads
368
+ > - `COPY OVERWRITE INTO`: Replaces all existing data in the target table; use for idempotent full-refresh loads
369
+ > - Both COPY and INSERT support Schema-on-Read (defining columns in FROM VOLUME)
370
+ > - ⚠️ **load_history difference**: Only `COPY INTO` records to `load_history`; `INSERT INTO ... FROM VOLUME` does not. Use `COPY INTO` if deduplication protection is needed
371
+
372
+ #### Step 4c: COPY OVERWRITE INTO (Idempotent Full Refresh)
373
+
374
+ ```sql
375
+ COPY OVERWRITE INTO my_schema.target_table
376
+ FROM VOLUME my_batch_volume (
377
+ id STRING,
378
+ name STRING,
379
+ amount DECIMAL(10,2),
380
+ created_date STRING
381
+ ) USING CSV OPTIONS ('header'='true', 'sep'=',');
382
+ ```
383
+
384
+ > Atomically replaces all rows in the target table. Safe to retry — running twice produces the same result.
361
385
 
362
- #### 步骤 5:验证导入结果
386
+ #### Step 5: Verify Import Results
363
387
 
364
388
  ```sql
365
- -- 通过 cz-cli sql "<SQL>" --sync 执行
366
389
  SELECT COUNT(*) AS total_rows FROM my_schema.target_table;
367
390
  SELECT * FROM my_schema.target_table LIMIT 10;
368
391
  ```
369
392
 
370
393
  ---
371
394
 
372
- ## 监控与运维
395
+ ## Monitoring & Operations
373
396
 
374
- ### 查看 PIPE 详细状态
397
+ ### List Existing PIPEs
398
+
399
+ ```sql
400
+ SHOW PIPES;
401
+ SHOW PIPES LIKE '%oss%';
402
+ ```
403
+
404
+ ### List Files in a Volume
405
+
406
+ ```sql
407
+ SELECT * FROM DIRECTORY(@my_batch_volume) LIMIT 20;
408
+ -- If files are missing, refresh directory metadata first:
409
+ ALTER VOLUME my_batch_volume REFRESH;
410
+ ```
411
+
412
+ ### View PIPE Detailed Status
375
413
 
376
414
  ```sql
377
- -- 通过 cz-cli sql "<SQL>" --sync 执行
378
415
  DESC PIPE EXTENDED my_oss_pipe;
379
416
  ```
380
417
 
381
- 关键字段:
382
- - `pipe_execution_paused`:是否暂停
383
- - `ingest_mode`:导入模式
384
- - `virtual_cluster`:执行集群
385
- - `definition`:COPY 语句定义
418
+ Key fields:
419
+ - `pipe_execution_paused`: Whether paused
420
+ - `ingest_mode`: Import mode
421
+ - `virtual_cluster`: Execution cluster
422
+ - `definition`: COPY statement definition
386
423
 
387
- ### 查看加载历史
424
+ ### View Load History
388
425
 
389
426
  ```sql
390
- -- 通过 cz-cli sql "<SQL>" --sync 执行
391
427
  SELECT * FROM load_history('my_schema.target_table')
392
- ORDER BY last_load_time DESC
428
+ ORDER BY last_copy_time DESC
393
429
  LIMIT 20;
394
430
  ```
395
431
 
396
- > `load_history` 去重记录保留 7 天。
397
-
398
- ### 通过 query_tag 过滤 PIPE 作业
432
+ ### Filter PIPE Jobs via query_tag
399
433
 
400
- PIPE 执行的作业会自动打上 `query_tag`,格式为:`pipe.<workspace_name>.<schema_name>.<pipe_name>`
434
+ PIPE-executed jobs are automatically tagged with `query_tag` in the format: `pipe.<workspace_name>.<schema_name>.<pipe_name>`
401
435
 
402
436
  ```sql
403
- -- 通过 cz-cli sql "<SQL>" --sync 执行
404
- -- 在 JOBS 列表中过滤 PIPE 相关作业
437
+ -- Filter PIPE-related jobs in the JOBS list
405
438
  SHOW JOBS WHERE query_tag = 'pipe.my_workspace.my_schema.my_oss_pipe';
406
439
  ```
407
440
 
408
441
  ---
409
442
 
410
- ## PIPE 管理操作
443
+ ## PIPE Management Operations
411
444
 
412
- ### 暂停 / 恢复 PIPE
445
+ ### Pause / Resume PIPE
413
446
 
414
447
  ```sql
415
- -- 暂停 PIPE
448
+ -- Pause PIPE
416
449
  ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = true;
417
450
 
418
- -- 恢复 PIPE
451
+ -- Resume PIPE
419
452
  ALTER PIPE my_oss_pipe SET PIPE_EXECUTION_PAUSED = false;
420
453
  ```
421
454
 
422
- ### 修改 PIPE 属性
455
+ ### Modify PIPE Properties
423
456
 
424
457
  ```sql
425
- -- 修改虚拟集群
458
+ -- Change virtual cluster
426
459
  ALTER PIPE my_oss_pipe SET VIRTUAL_CLUSTER = 'new_vc';
427
460
 
428
- -- 修改 COPY_JOB_HINT(注意:会覆盖所有已有 hints,需一次性设置全部参数)
429
- -- 必须是合法 JSON 格式,键值都要用双引号
461
+ -- Modify COPY_JOB_HINT (note: overwrites all existing hints; set all parameters at once)
462
+ -- Must be valid JSON format with double-quoted keys and values
430
463
  ALTER PIPE my_oss_pipe SET COPY_JOB_HINT = '{"max_file_count":"100","force":"false"}';
431
464
  ```
432
465
 
433
- > **限制**:每次 ALTER PIPE 只能修改一个属性。不支持修改 COPY 语句逻辑,需删除 PIPE 重新创建。
466
+ > **Limitation**: Each ALTER PIPE can only modify one property at a time.
434
467
 
435
- ### 删除 PIPE
468
+ ### Drop PIPE
436
469
 
437
470
  ```sql
438
471
  DROP PIPE IF EXISTS my_oss_pipe;
@@ -440,123 +473,17 @@ DROP PIPE IF EXISTS my_oss_pipe;
440
473
 
441
474
  ---
442
475
 
443
- ## 故障排除
444
-
445
- | 问题 | 排查方向 |
446
- |------|---------|
447
- | PIPE 创建后无数据加载 | 1. `DESC PIPE EXTENDED` 检查是否暂停 2. 确认 Volume 路径下有新文件 3. 检查 COPY INTO 是否能独立运行 |
448
- | LIST_PURGE 模式文件未被删除 | 确认 `PURGE=true` 已设置(紧跟 `USING <format>` 之后);检查 Connection 的 AccessKey 是否有删除权限 |
449
- | `PURGE=true` 语法错误 | OPTIONS 必须在 PURGE 之前:`USING CSV OPTIONS (...) PURGE=true`。不要写成 `USING CSV PURGE=true OPTIONS(...)` |
450
- | EVENT_NOTIFICATION 模式无触发 | 1. 检查 MNS/SQS 队列是否收到消息 2. 确认 OSS 事件通知规则配置正确 3. 检查 Role ARN 授权 |
451
- | 重复加载数据 | `load_history` 去重记录仅保留 7 天,超过 7 天的同名文件会被重新加载 |
452
- | COPY_JOB_HINT 修改后部分参数丢失 | `SET COPY_JOB_HINT` 会覆盖所有已有 hints,需在一次 ALTER 中设置全部参数 |
453
- | INSERT INTO FROM VOLUME 后 load_history 无记录 | 正常行为:只有 `COPY INTO` 会记录到 load_history,`INSERT INTO` 不会 |
454
- | COPY INTO 报格式错误 | Volume 中有多种格式文件,使用 `FILES('xxx.json')` 指定文件 |
455
-
456
- ## 注意事项
457
-
458
- ### PIPE 持续导入(模式 A / B)
459
-
460
- - 每个 PIPE 需对应独立的 Volume,不可多个 PIPE 共用同一 Volume
461
- - PIPE 中的 COPY 语句不支持 `files` / `regexp` / `subdirectory` 参数
462
- - 数据加载无法保证严格有序(多文件并行加载)
463
- - 推荐文件大小:gzip 压缩 ≈ 50MB,CSV/Parquet 未压缩 128MB ~ 256MB
464
- - `load_history` 去重记录保留 7 天,超期后同名文件可能被重复加载
465
- - 修改 COPY 逻辑需删除 PIPE 重新创建,ALTER PIPE 不支持修改 COPY 语句
466
-
467
- ### 批量导入(模式 C)
468
-
469
- - Volume 支持阿里云 OSS、腾讯云 COS 和 AWS S3
470
- - 不支持跨云导入(源存储与 Lakehouse 环境需在同一云平台)
471
- - 同地域建议使用内网 Endpoint 以提升传输速度和稳定性
472
- - 推荐使用 GENERAL PURPOSE 类型虚拟集群执行批量加载任务
473
- - INSERT INTO 方式支持 `FILES()` 和 `WHERE` 参数,COPY INTO 不支持
474
- - Connection 参数使用 `access_id`/`access_key`(小写)或 `ACCESS_KEY_ID`/`ACCESS_KEY_SECRET`(大写),不要用 `ACCESS_KEY`/`SECRET_KEY`
475
- - ⚠️ `INSERT INTO ... FROM VOLUME` 不会记录到 `load_history`,只有 `COPY INTO` 会记录
476
- - ⚠️ Volume 中有多种格式文件时,不指定 `FILES()` 的 COPY INTO 会尝试读取所有文件,可能因格式不匹配而失败。建议使用 `FILES('xxx.json')` 指定文件或 `SUBDIRECTORY` 指定子目录
477
- - 上传文件到 OSS 后,`SHOW VOLUME DIRECTORY` 可能需要先执行 `ALTER VOLUME name REFRESH` 刷新目录元数据
476
+ ## Troubleshooting
478
477
 
479
- ---
480
-
481
- ## cz-cli 执行路径
482
-
483
- ### 模式 A:LIST_PURGE 扫描模式(cz-cli 版)
484
-
485
- ```bash
486
- # 步骤 1:创建存储连接
487
- cz-cli agent run "创建 OSS Storage Connection,名称 <my_oss_connection>,endpoint <oss-cn-hangzhou.aliyuncs.com>,access_key <key>,secret_key <secret>" \
488
- --format a2a --dangerously-skip-permissions
478
+ | Issue | Investigation Steps |
479
+ |-------|-------------------|
480
+ | No data loaded after PIPE creation | 1. Run `DESC PIPE EXTENDED` to check whether the PIPE is paused 2. Confirm that new files exist under the Volume path 3. Verify that the COPY INTO statement succeeds when run on its own |
481
+ | Files not deleted in LIST_PURGE mode | Confirm `PURGE=true` is set (after `USING <format>` and any `OPTIONS (...)` clause); check whether the Connection's AccessKey has delete permissions |
482
+ | `PURGE=true` syntax error | OPTIONS must come before PURGE: `USING CSV OPTIONS (...) PURGE=true`. Do not write `USING CSV PURGE=true OPTIONS(...)` |
483
+ | EVENT_NOTIFICATION mode not triggering | 1. Check if MNS/SQS queue is receiving messages 2. Confirm OSS event notification rules are configured correctly 3. Check Role ARN authorization |
484
+ | Duplicate data loading | `load_history` deduplication records are retained for only 7 days; a file with the same name that arrives after its record has expired will be loaded again |
485
+ | Some parameters lost after COPY_JOB_HINT modification | `SET COPY_JOB_HINT` overwrites all existing hints; set all parameters in a single ALTER |
486
+ | No load_history record after INSERT INTO FROM VOLUME | Expected behavior: only `COPY INTO` records to load_history; `INSERT INTO` does not |
487
+ | COPY INTO format error | Volume contains files of multiple formats; use `FILES('xxx.json')` to specify files |
489
488
 
490
- # 步骤 2:创建外部 Volume
491
- cz-cli agent run "创建外部 Volume,名称 <pipe_volume>,使用 Connection <my_oss_connection>,路径 oss://<bucket>/<data-path>/" \
492
- --format a2a --dangerously-skip-permissions
493
-
494
- # 步骤 3:验证 COPY INTO 可独立运行
495
- cz-cli agent run "用 COPY INTO 从 Volume <pipe_volume> 加载数据到表 <schema>.<table>,文件格式 CSV,有 header,验证数据能正常加载" \
496
- --format a2a --dangerously-skip-permissions
497
-
498
- # 步骤 4:创建 LIST_PURGE 模式 PIPE
499
- cz-cli agent run "创建 PIPE <my_oss_pipe>,INGEST_MODE 为 LIST_PURGE,使用 VCluster <my_vc>,从 Volume <pipe_volume> 以 CSV 格式(有 header,purge=true)持续导入数据到表 <schema>.<table>" \
500
- --format a2a --dangerously-skip-permissions
501
-
502
- # 步骤 5:验证 PIPE 状态
503
- cz-cli agent run "查看 PIPE <my_oss_pipe> 的详细状态,确认 pipe_execution_paused 为 false" \
504
- --format a2a --dangerously-skip-permissions
505
- ```
506
-
507
- ---
508
-
509
- ### 模式 B:EVENT_NOTIFICATION 消息通知模式(cz-cli 版)
510
-
511
- ```bash
512
- # 步骤 1:创建 Role ARN 方式的存储连接
513
- cz-cli agent run "创建 OSS Storage Connection,名称 <my_oss_role_connection>,endpoint <oss-cn-hangzhou.aliyuncs.com>,使用 Role ARN <acs:ram::xxx:role/clickzetta-oss-role>,region cn-hangzhou" \
514
- --format a2a --dangerously-skip-permissions
515
-
516
- # 步骤 2:创建外部 Volume
517
- cz-cli agent run "创建外部 Volume,名称 <pipe_event_volume>,使用 Connection <my_oss_role_connection>,路径 oss://<bucket>/<data-path>/" \
518
- --format a2a --dangerously-skip-permissions
519
-
520
- # 步骤 3:创建 EVENT_NOTIFICATION 模式 PIPE
521
- cz-cli agent run "创建 PIPE <my_oss_event_pipe>,INGEST_MODE 为 EVENT_NOTIFICATION,使用 VCluster <my_vc>,ALICLOUD_MNS_QUEUE 为 <my-mns-queue-name>,从 Volume <pipe_event_volume> 以 CSV 格式持续导入数据到表 <schema>.<table>" \
522
- --format a2a --dangerously-skip-permissions
523
- ```
524
489
 
525
- ---
526
-
527
- ### 模式 C:批量导入(cz-cli 版)
528
-
529
- ```bash
530
- # 步骤 1:创建目标表
531
- cz-cli agent run "在 schema <my_schema> 下创建表 <target_table>,字段:id STRING, name STRING, amount DECIMAL(10,2), created_date STRING" \
532
- --format a2a --dangerously-skip-permissions
533
-
534
- # 步骤 2-3:创建存储连接和 Volume
535
- cz-cli agent run "创建 OSS Storage Connection <my_batch_conn>,endpoint <oss-cn-shanghai-internal.aliyuncs.com>,access_id <id>,access_key <key>;然后创建外部 Volume <my_batch_volume>,路径 oss://<bucket>/<data-path>/,启用目录自动刷新" \
536
- --format a2a --dangerously-skip-permissions
537
-
538
- # 步骤 4:从 Volume 导入数据
539
- cz-cli agent run "从 Volume <my_batch_volume> 以 CSV 格式(有 header)将数据导入表 <my_schema>.<target_table>" \
540
- --format a2a --dangerously-skip-permissions
541
-
542
- # 步骤 5:验证导入结果
543
- cz-cli agent run "查询表 <my_schema>.<target_table> 的总行数和前 10 条数据,验证导入结果" \
544
- --format a2a --dangerously-skip-permissions
545
- ```
546
-
547
- ---
548
-
549
- ### 监控与运维(cz-cli 版)
550
-
551
- ```bash
552
- # 查看 PIPE 状态
553
- cz-cli agent run "查看 PIPE <my_oss_pipe> 的详细状态和加载历史" \
554
- --format a2a --dangerously-skip-permissions
555
-
556
- # 暂停/恢复 PIPE
557
- cz-cli agent run "暂停 PIPE <my_oss_pipe>" \
558
- --format a2a --dangerously-skip-permissions
559
-
560
- cz-cli agent run "恢复 PIPE <my_oss_pipe>" \
561
- --format a2a --dangerously-skip-permissions
562
- ```