@clickzetta/cz-cli-linux-x64 0.3.2 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/bin/cz-cli +0 -0
  2. package/package.json +1 -1
  3. package/bin/skills/clickzetta-access-control/SKILL.md +0 -243
  4. package/bin/skills/clickzetta-access-control/references/dynamic-masking.md +0 -86
  5. package/bin/skills/clickzetta-access-control/references/grant-revoke.md +0 -103
  6. package/bin/skills/clickzetta-access-control/references/role-management.md +0 -66
  7. package/bin/skills/clickzetta-access-control/references/user-management.md +0 -61
  8. package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
  9. package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
  10. package/bin/skills/clickzetta-app-python-sdk/SKILL.md +0 -153
  11. package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +0 -196
  12. package/bin/skills/clickzetta-app-python-sdk/references/connector.md +0 -143
  13. package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +0 -122
  14. package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +0 -293
  15. package/bin/skills/clickzetta-bi-connect/SKILL.md +0 -176
  16. package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +0 -170
  17. package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +0 -450
  18. package/bin/skills/clickzetta-concepts/SKILL.md +0 -282
  19. package/bin/skills/clickzetta-concepts/references/brands-and-endpoints.md +0 -79
  20. package/bin/skills/clickzetta-concepts/references/object-model.md +0 -311
  21. package/bin/skills/clickzetta-data-ingest-pipeline/SKILL.md +0 -165
  22. package/bin/skills/clickzetta-data-lifecycle/SKILL.md +0 -211
  23. package/bin/skills/clickzetta-data-lifecycle/references/lifecycle-reference.md +0 -175
  24. package/bin/skills/clickzetta-data-recovery/SKILL.md +0 -215
  25. package/bin/skills/clickzetta-data-recovery/evals/evals.json +0 -35
  26. package/bin/skills/clickzetta-data-science/SKILL.md +0 -125
  27. package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +0 -146
  28. package/bin/skills/clickzetta-data-science/references/data-patterns.md +0 -110
  29. package/bin/skills/clickzetta-data-science/references/setup.md +0 -160
  30. package/bin/skills/clickzetta-data-science/references/stats-functions.md +0 -195
  31. package/bin/skills/clickzetta-data-science/references/write-and-infer.md +0 -122
  32. package/bin/skills/clickzetta-data-science/references/zettapark-api.md +0 -156
  33. package/bin/skills/clickzetta-data-sharing/SKILL.md +0 -160
  34. package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +0 -134
  35. package/bin/skills/clickzetta-dba-guide/SKILL.md +0 -540
  36. package/bin/skills/clickzetta-dw-modeling/SKILL.md +0 -259
  37. package/bin/skills/clickzetta-dw-modeling/references/modeling-patterns.md +0 -100
  38. package/bin/skills/clickzetta-dynamic-table/SKILL.md +0 -86
  39. package/bin/skills/clickzetta-dynamic-table/best-practices/dimension-table-join-guide.md +0 -257
  40. package/bin/skills/clickzetta-dynamic-table/best-practices/medallion-and-stream-patterns.md +0 -124
  41. package/bin/skills/clickzetta-dynamic-table/best-practices/non-partitioned-merge-into-warning.md +0 -96
  42. package/bin/skills/clickzetta-dynamic-table/best-practices/performance-optimization.md +0 -109
  43. package/bin/skills/clickzetta-dynamic-table/dt-creator/SKILL.md +0 -15
  44. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/dt-declaration-strategy.md +0 -185
  45. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/incremental-config-reference.md +0 -429
  46. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/refresh-history-guide.md +0 -268
  47. package/bin/skills/clickzetta-dynamic-table/dt-creator/references/sql-limitations.md +0 -80
  48. package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +0 -190
  49. package/bin/skills/clickzetta-external-catalog/SKILL.md +0 -120
  50. package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +0 -130
  51. package/bin/skills/clickzetta-external-function/SKILL.md +0 -203
  52. package/bin/skills/clickzetta-external-function/references/external-function-ddl.md +0 -171
  53. package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +0 -117
  54. package/bin/skills/clickzetta-index-manager/SKILL.md +0 -140
  55. package/bin/skills/clickzetta-index-manager/references/bloomfilter-index.md +0 -67
  56. package/bin/skills/clickzetta-index-manager/references/index-management.md +0 -73
  57. package/bin/skills/clickzetta-index-manager/references/inverted-index.md +0 -80
  58. package/bin/skills/clickzetta-index-manager/references/vector-index.md +0 -81
  59. package/bin/skills/clickzetta-information-schema/SKILL.md +0 -367
  60. package/bin/skills/clickzetta-information-schema/references/instance-views-reference.md +0 -276
  61. package/bin/skills/clickzetta-information-schema/references/metering-views-reference.md +0 -137
  62. package/bin/skills/clickzetta-information-schema/references/views-reference.md +0 -271
  63. package/bin/skills/clickzetta-java-sdk/SKILL.md +0 -186
  64. package/bin/skills/clickzetta-java-sdk/references/bulkload.md +0 -163
  65. package/bin/skills/clickzetta-java-sdk/references/realtime.md +0 -212
  66. package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +0 -531
  67. package/bin/skills/clickzetta-kafka-ingest-pipeline/references/kafka-pipe-syntax.md +0 -186
  68. package/bin/skills/clickzetta-lakehouse-connect/SKILL.md +0 -218
  69. package/bin/skills/clickzetta-lakehouse-connect/evals/evals.json +0 -35
  70. package/bin/skills/clickzetta-lakehouse-connect/references/config-file.md +0 -435
  71. package/bin/skills/clickzetta-lakehouse-connect/references/jdbc.md +0 -478
  72. package/bin/skills/clickzetta-lakehouse-connect/references/python-sdk.md +0 -225
  73. package/bin/skills/clickzetta-lakehouse-connect/references/sqlalchemy.md +0 -468
  74. package/bin/skills/clickzetta-lakehouse-connect/references/zettapark-session.md +0 -445
  75. package/bin/skills/clickzetta-manage-comments/SKILL.md +0 -219
  76. package/bin/skills/clickzetta-metadata-query/SKILL.md +0 -298
  77. package/bin/skills/clickzetta-metadata-query/references/show-desc-reference.md +0 -326
  78. package/bin/skills/clickzetta-monitoring/SKILL.md +0 -199
  79. package/bin/skills/clickzetta-monitoring/references/job-history-analysis.md +0 -97
  80. package/bin/skills/clickzetta-monitoring/references/show-jobs.md +0 -48
  81. package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +0 -402
  82. package/bin/skills/clickzetta-query-optimizer/SKILL.md +0 -156
  83. package/bin/skills/clickzetta-query-optimizer/references/explain.md +0 -56
  84. package/bin/skills/clickzetta-query-optimizer/references/hints-and-sortkey.md +0 -78
  85. package/bin/skills/clickzetta-query-optimizer/references/optimize.md +0 -65
  86. package/bin/skills/clickzetta-query-optimizer/references/result-cache.md +0 -49
  87. package/bin/skills/clickzetta-query-optimizer/references/show-jobs.md +0 -42
  88. package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +0 -197
  89. package/bin/skills/clickzetta-semantic-view/SKILL.md +0 -207
  90. package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +0 -167
  91. package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +0 -92
  92. package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +0 -147
  93. package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +0 -132
  94. package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +0 -353
  95. package/bin/skills/clickzetta-sql-pipeline-manager/evals/evals.json +0 -166
  96. package/bin/skills/clickzetta-sql-pipeline-manager/references/dynamic-table.md +0 -173
  97. package/bin/skills/clickzetta-sql-pipeline-manager/references/materialized-view.md +0 -129
  98. package/bin/skills/clickzetta-sql-pipeline-manager/references/pipe.md +0 -160
  99. package/bin/skills/clickzetta-sql-pipeline-manager/references/table-stream.md +0 -123
  100. package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +0 -172
  101. package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +0 -350
  102. package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +0 -279
  103. package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +0 -504
  104. package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +0 -372
  105. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +0 -260
  106. package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +0 -382
  107. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +0 -346
  108. package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +0 -229
  109. package/bin/skills/clickzetta-studio-overview/SKILL.md +0 -170
  110. package/bin/skills/clickzetta-studio-overview/references/studio-modules.md +0 -173
  111. package/bin/skills/clickzetta-table-stream-pipeline/SKILL.md +0 -155
  112. package/bin/skills/clickzetta-vcluster-manager/SKILL.md +0 -212
  113. package/bin/skills/clickzetta-vcluster-manager/references/vc-cache.md +0 -54
  114. package/bin/skills/clickzetta-vcluster-manager/references/vcluster-ddl.md +0 -150
  115. package/bin/skills/clickzetta-volume-manager/SKILL.md +0 -249
  116. package/bin/skills/clickzetta-volume-manager/references/volume-ddl.md +0 -194
  117. package/bin/skills/clickzetta-zettapark/SKILL.md +0 -248
  118. package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +0 -283
@@ -1,160 +0,0 @@
1
- ---
2
- name: clickzetta-ai-vector-search
3
- description: |
4
- 在 ClickZetta Lakehouse 中实现向量存储、向量索引(HNSW)和向量检索,
5
- 构建 RAG、语义搜索、图像检索等 AI 应用。覆盖 VECTOR 数据类型定义、
6
- 向量索引创建(cosine/l2/hamming 距离)、向量数据插入与转换、
7
- ANN 近似最近邻检索、向量+倒排索引融合检索等完整工作流。
8
- 当用户说"向量检索"、"向量索引"、"语义搜索"、"embedding 存储"、
9
- "RAG"、"ANN 搜索"、"HNSW"、"cosine_distance"、"l2_distance"、
10
- "VECTOR 类型"、"向量数据库"、"相似度搜索"、"向量 + 标量融合检索"、
11
- "文本向量化"时触发。
12
- Keywords: vector, HNSW, embedding, RAG, semantic search, similarity, VECTOR type
13
- ---
14
-
15
- # ClickZetta 向量检索
16
-
17
- Lakehouse 原生支持 VECTOR 数据类型和 HNSW 向量索引,无需独立向量数据库即可在同一张表中实现向量检索、全文检索和标量过滤的融合查询。
18
-
19
- 阅读 [references/vector-search.md](references/vector-search.md) 了解完整语法。
20
-
21
- ---
22
-
23
- ## 快速开始
24
-
25
- ### 1. 建表(含向量索引)
26
-
27
- ```sql
28
- CREATE TABLE doc_embeddings (
29
- id INT,
30
- content STRING,
31
- vec VECTOR(FLOAT, 1024),
32
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
33
- "distance.function" = "cosine_distance",
34
- "scalar.type" = "f32"
35
- )
36
- );
37
- ```
38
-
39
- ### 2. 插入向量数据
40
-
41
- ```sql
42
- -- 直接插入
43
- INSERT INTO doc_embeddings VALUES
44
- (1, '云器 Lakehouse 产品介绍', vector(0.12, 0.34, ...));
45
-
46
- -- 从字符串转换(适合 API 返回的 JSON 格式)
47
- INSERT INTO doc_embeddings (id, content, vec)
48
- SELECT id, content, CAST(embedding_str AS VECTOR(1024))
49
- FROM staging_table;
50
- ```
51
-
52
- ### 3. 向量检索
53
-
54
- ```sql
55
- -- 设置探索因子(精度 vs 速度)
56
- SET cz.vector.index.search.ef = 64;
57
-
58
- -- 余弦距离 Top-10 相似文档
59
- SELECT id, content, cosine_distance(vec, CAST('[0.12, 0.34, ...]' AS VECTOR(1024))) AS dist
60
- FROM doc_embeddings
61
- ORDER BY dist
62
- LIMIT 10;
63
- ```
64
-
65
- ---
66
-
67
- ## 向量 + 标量融合检索(RAG 场景)
68
-
69
- ```sql
70
- -- 先用标量过滤缩小范围,再用向量排序(示例假设表中另有 category、created_at 列)
71
- SELECT id, content, cosine_distance(vec, :query_embedding) AS dist
72
- FROM doc_embeddings
73
- WHERE category = 'product'
74
- AND created_at >= '2024-01-01'
75
- ORDER BY dist
76
- LIMIT 5;
77
- ```
78
-
79
- ---
80
-
81
- ## 向量 + 全文检索融合
82
-
83
- ```sql
84
- -- 建表:同时支持向量索引和倒排索引
85
- CREATE TABLE hybrid_docs (
86
- id INT,
87
- title STRING,
88
- body STRING,
89
- vec VECTOR(FLOAT, 1024),
90
- INDEX body_inv_idx (body) USING INVERTED,
91
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
92
- "distance.function" = "cosine_distance"
93
- )
94
- );
95
-
96
- -- 融合检索:关键词过滤 + 向量排序
97
- SELECT id, title, cosine_distance(vec, :query_vec) AS dist
98
- FROM hybrid_docs
99
- WHERE body LIKE '%向量检索%'
100
- ORDER BY dist
101
- LIMIT 10;
102
- ```
103
-
104
- ---
105
-
106
- ## 外部系统写入向量(ARRAY → VECTOR 转换)
107
-
108
- 外部系统(Python SDK、Kafka 等)不能直接写 VECTOR 类型,需先写 ARRAY 再转换:
109
-
110
- ```sql
111
- -- 暂存表(ARRAY 类型)
112
- CREATE TABLE staging (id INT, vec_array ARRAY<FLOAT>);
113
-
114
- -- 转换写入目标表
115
- INSERT INTO doc_embeddings (id, vec)
116
- SELECT id, CAST(vec_array AS VECTOR(FLOAT, 1024))
117
- FROM staging;
118
- ```
119
-
120
- ---
121
-
122
- ## 距离函数速查
123
-
124
- | 函数 | 适用场景 |
125
- |---|---|
126
- | `cosine_distance(v1, v2)` | 文本语义检索(最常用) |
127
- | `l2_distance(v1, v2)` | 图像/通用向量检索 |
128
- | `dot_product(v1, v2)` | 归一化向量的相似度 |
129
- | `hamming_distance(v1, v2)` | 二值向量(高效压缩) |
130
- | `binary_quantize(v)` | 将 float 向量压缩为二值向量 |
131
-
132
- ---
133
-
134
- ## 性能调优
135
-
136
- ```sql
137
- -- 调整探索因子(默认 64,越大精度越高但越慢)
138
- SET cz.vector.index.search.ef = 128;
139
-
140
- -- 验证向量索引是否生效
141
- EXPLAIN SELECT id, cosine_distance(vec, vector(0.1, 0.2, ...)) AS dist
142
- FROM doc_embeddings ORDER BY dist LIMIT 10;
143
- -- 查看执行计划中是否有 vector_index_search_type 字样
144
- ```
145
-
146
- **最佳实践:**
147
- - 向量检索建议**单独占用 VCluster**,避免与其他查询争抢缓存
148
- - 大批量写入后执行 `BUILD INDEX vec_idx ON table_name` 为存量数据构建索引
149
- - 外部系统写入时先写 ARRAY,再批量 CAST 转换,避免频繁小文件
150
-
151
- ---
152
-
153
- ## 常见问题
154
-
155
- | 问题 | 原因 | 解决方案 |
156
- |---|---|---|
157
- | 向量索引未生效 | 存量数据未构建索引 | 执行 `BUILD INDEX idx ON table` |
158
- | 检索精度低 | ef 值太小 | 增大 `cz.vector.index.search.ef` |
159
- | 外部写入报错 | 不支持直接写 VECTOR | 先写 ARRAY,再 CAST 转换 |
160
- | 向量检索慢 | 与其他查询共用 VCluster | 为向量检索单独分配 VCluster |
@@ -1,155 +0,0 @@
1
- # 向量检索参考
2
-
3
- > 来源:https://www.yunqi.tech/documents/vector-search 等
4
-
5
- ## VECTOR 数据类型
6
-
7
- ```sql
8
- -- 语法
9
- vector(scalar_type, dimension)
10
- vector(dimension) -- 默认 float 类型
11
-
12
- -- 示例
13
- CREATE TABLE embeddings (
14
- id INT,
15
- content STRING,
16
- vec VECTOR(FLOAT, 1024), -- 1024 维 float 向量
17
- vec_bin VECTOR(TINYINT, 128) -- 128 维 tinyint 向量(二值化)
18
- );
19
- ```
20
-
21
- 支持的元素类型:`FLOAT`(f32)、`TINYINT`(i8/b1)
22
-
23
- ---
24
-
25
- ## 创建向量索引
26
-
27
- ```sql
28
- -- 建表时内联创建
29
- CREATE TABLE doc_embeddings (
30
- id INT,
31
- content STRING,
32
- vec VECTOR(FLOAT, 1024),
33
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
34
- "distance.function" = "cosine_distance",
35
- "scalar.type" = "f32",
36
- "m" = "16",
37
- "ef.construction" = "128"
38
- )
39
- );
40
-
41
- -- 在已有表上添加向量索引
42
- ALTER TABLE doc_embeddings ADD INDEX vec_idx (vec) USING VECTOR PROPERTIES (
43
- "distance.function" = "cosine_distance",
44
- "scalar.type" = "f32"
45
- );
46
-
47
- -- 为存量数据构建索引
48
- BUILD INDEX vec_idx ON doc_embeddings;
49
- ```
50
-
51
- ### 关键参数
52
-
53
- | 参数 | 可选值 | 默认值 | 说明 |
54
- |---|---|---|---|
55
- | distance.function | l2_distance, cosine_distance, jaccard_distance, hamming_distance | cosine_distance | 距离函数 |
56
- | scalar.type | f32, f16, i8, b1 | f32 | 索引元素类型 |
57
- | m | 建议 ≤ 1000 | 16 | HNSW 最大邻居数 |
58
- | ef.construction | 建议 ≤ 5000 | 128 | 构建时候选集大小 |
59
- | compress.codec | uncompressed/zstd/lz4 | uncompressed | 压缩算法 |
60
-
61
- ---
62
-
63
- ## 插入向量数据
64
-
65
- ```sql
66
- -- 直接插入
67
- INSERT INTO doc_embeddings (id, content, vec) VALUES
68
- (1, 'hello world', vector(0.1, 0.2, 0.3, ...)),
69
- (2, 'foo bar', vector(0.4, 0.5, 0.6, ...));
70
-
71
- -- 从字符串转换
72
- INSERT INTO doc_embeddings (id, vec)
73
- SELECT id, CAST('[0.1, 0.2, 0.3]' AS VECTOR(3))
74
- FROM source_table;
75
-
76
- -- 从 ARRAY 列转换(外部系统写入场景)
77
- INSERT OVERWRITE doc_embeddings
78
- SELECT id, content, CAST(vec_array AS VECTOR(FLOAT, 1024))
79
- FROM staging_table;
80
- ```
81
-
82
- ---
83
-
84
- ## 向量检索
85
-
86
- ```sql
87
- -- 调整探索因子(精度 vs 速度权衡)
88
- SET cz.vector.index.search.ef = 64;
89
-
90
- -- L2 距离检索(欧几里得距离,越小越相似)
91
- SELECT id, content, l2_distance(vec, vector(0.1, 0.2, 0.3, ...)) AS dist
92
- FROM doc_embeddings
93
- ORDER BY dist
94
- LIMIT 10;
95
-
96
- -- 余弦距离检索(越小越相似)
97
- SELECT id, content, cosine_distance(vec, CAST('[0.1,0.2,0.3]' AS VECTOR(3))) AS dist
98
- FROM doc_embeddings
99
- ORDER BY dist
100
- LIMIT 10;
101
-
102
- -- 带过滤条件的向量检索(向量 + 标量融合)
103
- SELECT id, content, cosine_distance(vec, :query_vec) AS dist
104
- FROM doc_embeddings
105
- WHERE category = 'tech'
106
- AND cosine_distance(vec, :query_vec) < 0.3
107
- ORDER BY dist
108
- LIMIT 10;
109
- ```
110
-
111
- ---
112
-
113
- ## 距离函数速查
114
-
115
- | 函数 | 适用场景 | 说明 |
116
- |---|---|---|
117
- | `l2_distance(v1, v2)` | 通用语义检索 | 欧几里得距离,越小越相似 |
118
- | `cosine_distance(v1, v2)` | 文本语义检索 | 余弦距离,越小越相似 |
119
- | `dot_product(v1, v2)` | 归一化向量 | 点积,越大越相似 |
120
- | `hamming_distance(v1, v2)` | 二值向量 | 汉明距离,越小越相似 |
121
- | `jaccard_distance(v1, v2)` | 集合相似度 | 雅卡德距离 |
122
- | `binary_quantize(v)` | 向量压缩 | 将 float 向量二值化 |
123
-
124
- ---
125
-
126
- ## 向量 + 倒排索引融合检索
127
-
128
- ```sql
129
- -- 建表:同时支持向量索引和倒排索引
130
- CREATE TABLE hybrid_search (
131
- id INT,
132
- content STRING,
133
- vec VECTOR(FLOAT, 1024),
134
- INDEX content_inv_idx (content) USING INVERTED,
135
- INDEX vec_idx (vec) USING VECTOR PROPERTIES (
136
- "distance.function" = "cosine_distance"
137
- )
138
- );
139
-
140
- -- 融合检索:先用倒排过滤,再用向量排序
141
- SELECT id, content, cosine_distance(vec, :query_vec) AS dist
142
- FROM hybrid_search
143
- WHERE content LIKE '%关键词%'
144
- ORDER BY dist
145
- LIMIT 10;
146
- ```
147
-
148
- ---
149
-
150
- ## 注意事项
151
-
152
- - 向量类型不支持 `ORDER BY` 或 `GROUP BY`(只能对距离函数结果排序)
153
- - 向量索引性能与内存/磁盘缓存直接相关,建议**单独占用 VCluster**
154
- - 外部系统写入时不能直接写 VECTOR 类型,需先写 ARRAY 再 CAST 转换
155
- - `ef` 值越大,检索精度越高但延迟越大;建议从 64 开始调优
@@ -1,153 +0,0 @@
1
- ---
2
- name: clickzetta-app-python-sdk
3
- description: |
4
- 在 Python 应用程序中集成 ClickZetta Lakehouse 的官方 SDK 用法。
5
- 覆盖 clickzetta-connector-python(SQL 查询、参数绑定、批量插入、异步执行)、
6
- clickzetta-ingestion-python(BulkLoad 批量上传,单线程与分布式模式)、
7
- clickzetta-ingestion-python-v2(IGS 实时写入,秒级可查,支持主键表 CDC)、
8
- SQLAlchemy dialect 集成,以及连接参数说明。
9
- 当用户说"Python SDK"、"clickzetta-connector-python"、"clickzetta-ingestion-python"、
10
- "Python 查询 Lakehouse"、"Python 写入 Lakehouse"、"Python 批量上传"、
11
- "BulkLoad Python"、"SQLAlchemy Lakehouse"、"Python 连接 Lakehouse"、
12
- "executemany"、"execute_async"、"参数绑定 Python"、
13
- "IGS 实时写入"、"实时写入 Python"、"ingestion-python-v2"、
14
- "主键表写入 Python"、"CDC 写入"、"UPSERT Python"时触发。
15
- Keywords: Python SDK, clickzetta-connector-python, clickzetta-ingestion-python, bulk insert, async query, SQLAlchemy, IGS
16
- ---
17
-
18
- # ClickZetta Lakehouse — Python SDK
19
-
20
- 官方提供三个 Python 包:
21
- - **`clickzetta-connector-python`** — SQL 查询接口(PEP-249 规范),支持参数绑定、批量插入、异步执行、SQLAlchemy dialect
22
- - **`clickzetta-ingestion-python`** — 高吞吐批量上传(BulkLoad),数据直传对象存储,不消耗计算资源
23
- - **`clickzetta-ingestion-python-v2`** — IGS 实时写入,秒级可查,支持主键表 CDC(UPSERT/DELETE)
24
-
25
- 阅读 [references/connector.md](references/connector.md) 了解 SQL 查询接口,[references/bulkload.md](references/bulkload.md) 了解批量上传,[references/realtime.md](references/realtime.md) 了解 IGS 实时写入。
26
-
27
- ---
28
-
29
- ## 安装
30
-
31
- ```bash
32
- # SQL 查询接口
33
- pip install clickzetta-connector-python -U
34
-
35
- # 批量上传(按云环境选择)
36
- pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
37
- pip install "clickzetta-ingestion-python[s3]" -U # AWS
38
- pip install "clickzetta-ingestion-python[all]" -U # 全部(安装较慢)
39
-
40
- # IGS 实时写入
41
- pip install clickzetta-ingestion-python-v2
42
- ```
43
-
44
- > 注意:旧版 `clickzetta-connector` 已停止维护,请迁移到 `clickzetta-connector-python`。
45
-
46
- ---
47
-
48
- ## 连接参数
49
-
50
- ```python
51
- from clickzetta import connect
52
-
53
- conn = connect(
54
- username='your_username',
55
- password='your_password',
56
- service='api.clickzetta.com', # region.api.clickzetta.com
57
- instance='your_instance',
58
- workspace='your_workspace',
59
- schema='public',
60
- vcluster='default'
61
- )
62
- ```
63
-
64
- | 参数 | 必填 | 说明 |
65
- |---|---|---|
66
- | `username` | ✅ | 用户名 |
67
- | `password` | ✅ | 密码 |
68
- | `service` | ✅ | 连接地址,格式 `region.api.clickzetta.com` |
69
- | `instance` | ✅ | 实例名,在 Studio 工作空间 JDBC 连接串中查看 |
70
- | `workspace` | ✅ | 工作空间名 |
71
- | `vcluster` | ✅ | 虚拟集群名 |
72
- | `schema` | ✅ | 默认 schema |
73
-
74
- ---
75
-
76
- ## 快速示例
77
-
78
- ```python
79
- # 查询
80
- cursor = conn.cursor()
81
- cursor.execute('SELECT * FROM orders LIMIT 10')
82
- results = cursor.fetchall()
83
- cursor.close()
84
- conn.close()
85
-
86
- # 参数绑定(防 SQL 注入;需在上面的 cursor.close() / conn.close() 之前执行,或重新获取 cursor)
87
- cursor.execute('INSERT INTO test (id, name) VALUES (?, ?)', binding_params=[1, 'test'])
88
-
89
- # 批量插入
90
- data = [(1, 'a'), (2, 'b'), (3, 'c')]
91
- cursor.executemany('INSERT INTO test (id, name) VALUES (?, ?)', data)
92
- ```
93
-
94
- ## IGS 实时写入快速示例(ingestion-python-v2)
95
-
96
- 普通表(APPEND_ONLY):
97
-
98
- ```python
99
- from clickzetta.connector.v0.connection import connect
100
- from clickzetta.connector.v0.enums import RealtimeOperation
101
- from clickzetta_ingestion.realtime.realtime_options import RealtimeOptionsBuilder, FlushMode
102
- from clickzetta_ingestion.realtime.arrow_stream import RowOperator
103
-
104
- with connect(**CONN_ARGS) as conn:
105
-     stream = conn.get_realtime_stream(
106
-         schema='your_schema',
107
-         table='your_table',
108
-         operate=RealtimeOperation.APPEND_ONLY,
109
-         options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_BACKGROUND).build()
110
-     )
111
-     row = stream.create_row(RowOperator.INSERT)
112
-     row.set_value('id', 1)
113
-     row.set_value('name', 'alice')
114
-     stream.apply(row)
115
-     stream.close()
116
- ```
117
-
118
- 主键表 CDC(UPSERT / DELETE):
119
-
120
- ```python
121
- # 建表:CREATE TABLE users (id STRING NOT NULL PRIMARY KEY, name STRING, age INT);
122
-
123
- with connect(**CONN_ARGS) as conn:
124
-     stream = conn.get_realtime_stream(
125
-         schema='your_schema',
126
-         table='users',
127
-         operate=RealtimeOperation.CDC, # 主键表必须用 CDC
128
-         options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_SYNC).build()
129
-     )
130
-     # UPSERT
131
-     row = stream.create_row(RowOperator.UPSERT)
132
-     row.set_value('id', 'u1')
133
-     row.set_value('name', 'bob')
134
-     row.set_value('age', 25)
135
-     stream.apply(row)
136
-     # DELETE_IGNORE
137
-     row = stream.create_row(RowOperator.DELETE_IGNORE)
138
-     row.set_value('id', 'u1')
139
-     stream.apply(row)
140
-     stream.close()
141
- ```
142
-
143
- ---
144
-
145
- ## 选择指南
146
-
147
- | 场景 | 推荐方案 |
148
- |---|---|
149
- | 查询 / 小批量写入 | `clickzetta-connector-python` |
150
- | 大批量数据导入(GB 级,间隔 ≥ 5 分钟) | `clickzetta-ingestion-python` BulkLoad |
151
- | 高频小批写入(间隔 < 5 分钟,秒级可查) | `clickzetta-ingestion-python-v2` 实时写入 |
152
- | 主键表写入(UPSERT / DELETE) | `clickzetta-ingestion-python-v2` CDC 模式 |
153
- | SQLAlchemy / ORM 集成 | `clickzetta-connector-python`(内置 dialect) |
@@ -1,196 +0,0 @@
1
- # clickzetta-ingestion-python BulkLoad 详细参考
2
-
3
- ## 安装
4
-
5
- ```bash
6
- # 按云环境选择(推荐按需安装,all 安装较慢且可能冲突)
7
- pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
8
- pip install "clickzetta-ingestion-python[s3]" -U # AWS
9
- pip install "clickzetta-ingestion-python[cos]" -U # 腾讯云
10
- pip install "clickzetta-ingestion-python[gcp]" -U # Google Cloud
11
- pip install "clickzetta-ingestion-python[all]" -U # 全部
12
- ```
13
-
14
- ## 工作原理
15
-
16
- ```
17
- [SDK 写入数据] → [对象存储] → [调用 commit()] → [触发 SQL 导入] → [Lakehouse 表]
18
- ```
19
-
20
- - 数据上传阶段不消耗计算资源
21
- - `commit()` 触发从对象存储到 Lakehouse 表的导入,消耗少量计算资源
22
- - `commit()` 只能调用一次,commit 后数据可见
23
-
24
- ## 使用限制
25
-
26
- - **不支持主键(pk)表写入**
27
- - **不适合时间间隔小于 5 分钟的高频写入**
28
-
29
- ## 单线程写入
30
-
31
- ### 建表
32
-
33
- ```sql
34
- CREATE TABLE public.bulkload_test (
35
- i BIGINT,
36
- s STRING,
37
- d DOUBLE
38
- );
39
- ```
40
-
41
- ### 完整示例
42
-
43
- ```python
44
- from clickzetta import connect
45
-
46
- conn = connect(
47
- username='your_username',
48
- password='your_password',
49
- service='api.clickzetta.com',
50
- instance='your_instance',
51
- workspace='your_workspace',
52
- schema='public',
53
- vcluster='default'
54
- )
55
-
56
- bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
57
-
58
- writer = bulkload_stream.open_writer(0) # 单线程传 0
59
- for index in range(1000000):
60
-     row = writer.create_row()
61
-     row.set_value('i', index) # 按列名设值
62
-     row.set_value('s', 'Hello')
63
-     row.set_value('d', 123.456)
64
-     writer.write(row)
65
- writer.close()
66
-
67
- bulkload_stream.commit() # 提交,数据可见
68
- ```
69
-
70
- ## 读取 CSV 写入示例
71
-
72
- ```python
73
- from clickzetta import connect
74
- import csv
75
-
76
- conn = connect(
77
- username='',
78
- password='',
79
- service='api.clickzetta.com',
80
- instance='',
81
- workspace='',
82
- schema='public',
83
- vcluster='default_ap'
84
- )
85
-
86
- bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulk_order_payments')
87
- writer = bulkload_stream.open_writer(0)
88
-
89
- with open('olist_order_payments_dataset.csv', 'r') as csvfile:
90
-     reader = csv.reader(csvfile)
91
-     next(reader) # 跳过 header
92
-     for record in reader:
93
-         row = writer.create_row()
94
-         row.set_value('order_id', record[0])
95
-         row.set_value('payment_sequence', int(record[1]))
96
-         row.set_value('payment_type', record[2])
97
-         row.set_value('payment_installments', int(record[3]))
98
-         row.set_value('payment_value', float(record[4]))
99
-         writer.write(row) # ⚠️ 必须调用,否则数据不发送到服务端
100
-
101
- writer.close()
102
- bulkload_stream.commit()
103
- ```
104
-
105
- ## 写入模式
106
-
107
- ```python
108
- from clickzetta.bulkload.bulkload_enums import BulkLoadOperation
109
-
110
- # APPEND 模式(默认):新数据追加,不影响旧数据
111
- bulkload_stream = conn.create_bulkload_stream(schema='public', table='my_table')
112
-
113
- # OVERWRITE 模式:清空旧数据,写入新数据
114
- bulkload_stream = conn.create_bulkload_stream(
115
- schema='public',
116
- table='my_table',
117
- operation=BulkLoadOperation.OVERWRITE
118
- )
119
-
120
- # 分区表 OVERWRITE(只覆盖指定分区)
121
- bulkload_stream = conn.create_bulkload_stream(
122
- schema='public',
123
- table='my_partitioned_table',
124
- partition_spec='pt=2024-01-01',
125
- operation=BulkLoadOperation.OVERWRITE
126
- )
127
- ```
128
-
129
- ## 分布式并发写入
130
-
131
- 适合 GB 级以上数据,多进程并发写入同一 stream,最后统一 commit。
132
-
133
- ### 控制进程
134
-
135
- ```python
136
- import subprocess
137
- from clickzetta import connect
138
-
139
- conn = connect(username='username', password='password',
140
- service='api.clickzetta.com', instance='instance',
141
- workspace='quickstart_ws', schema='public', vcluster='default')
142
-
143
- bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
144
- stream_id = bulkload_stream.get_stream_id()
145
-
146
- # 启动多个写入进程,每个进程用不同的 writer_id
147
- p1 = subprocess.Popen(['python', 'writer.py', stream_id, '1'])
148
- p2 = subprocess.Popen(['python', 'writer.py', stream_id, '2'])
149
- p1.wait()
150
- p2.wait()
151
-
152
- bulkload_stream.commit() # 所有 writer 完成后统一 commit
153
- ```
154
-
155
- ### 写入进程
156
-
157
- ```python
158
- import sys
159
- from clickzetta import connect
160
-
161
- conn = connect(username='username', password='password',
162
- service='api.clickzetta.com', instance='instance',
163
- workspace='quickstart_ws', schema='public', vcluster='default')
164
-
165
- stream_id = sys.argv[1]
166
- writer_id = int(sys.argv[2])
167
-
168
- # 通过 stream_id 获取已有 stream(不创建新的)
169
- bulkload_stream = conn.get_bulkload_stream(
170
- schema='public', table='bulkload_test', stream_id=stream_id
171
- )
172
-
173
- writer = bulkload_stream.open_writer(writer_id) # writer_id 必须唯一
174
- for index in range(1, 1000000):
175
-     row = writer.create_row()
176
-     row.set_value('i', index)
177
-     row.set_value('s', 'Hello')
178
-     row.set_value('d', 123.456)
179
-     writer.write(row)
180
- writer.close()
181
- # 写入进程不调用 commit,只有控制进程调用
182
- ```
183
-
184
- ## 关键 API
185
-
186
- | API | 说明 |
187
- |---|---|
188
- | `conn.create_bulkload_stream(schema, table)` | 创建新的 bulkload stream |
189
- | `conn.get_bulkload_stream(schema, table, stream_id)` | 获取已有 stream(分布式写入用) |
190
- | `bulkload_stream.get_stream_id()` | 获取 stream id(传给写入进程) |
191
- | `bulkload_stream.open_writer(writer_id)` | 创建 writer,id 必须唯一 |
192
- | `writer.create_row()` | 创建行对象 |
193
- | `row.set_value(column_name, value)` | 按列名设值 |
194
- | `writer.write(row)` | 写入行(必须调用) |
195
- | `writer.close()` | 关闭 writer(写完必须调用) |
196
- | `bulkload_stream.commit()` | 提交,数据可见(只能调用一次) |