@clickzetta/cz-cli-darwin-x64 0.3.40 → 0.3.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cz-cli +0 -0
- package/bin/skills/clickzetta-app-python-sdk/SKILL.md +153 -0
- package/bin/skills/clickzetta-app-python-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-app-python-sdk/references/bulkload.md +196 -0
- package/bin/skills/clickzetta-app-python-sdk/references/connector.md +143 -0
- package/bin/skills/clickzetta-app-python-sdk/references/realtime.md +122 -0
- package/bin/skills/clickzetta-batch-sync-pipeline/SKILL.md +128 -287
- package/bin/skills/clickzetta-bi-connect/SKILL.md +176 -0
- package/bin/skills/clickzetta-bi-connect/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-bi-connect/references/bi-tools.md +170 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/SKILL.md +633 -0
- package/bin/skills/clickzetta-cdc-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-ingest-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-data-science/SKILL.md +125 -0
- package/bin/skills/clickzetta-data-science/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-data-science/references/bitmap-profile.md +146 -0
- package/bin/skills/clickzetta-data-science/references/data-patterns.md +110 -0
- package/bin/skills/clickzetta-data-science/references/setup.md +160 -0
- package/bin/skills/clickzetta-data-science/references/stats-functions.md +195 -0
- package/bin/skills/clickzetta-data-science/references/write-and-infer.md +122 -0
- package/bin/skills/clickzetta-data-science/references/zettapark-api.md +156 -0
- package/bin/skills/clickzetta-data-sharing/SKILL.md +160 -0
- package/bin/skills/clickzetta-data-sharing/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-data-sharing/references/share-ddl.md +134 -0
- package/bin/skills/clickzetta-dw-modeling/SKILL.md +103 -11
- package/bin/skills/clickzetta-dynamic-table/SKILL.md +58 -2
- package/bin/skills/clickzetta-dynamic-table/dynamic-table-alter/SKILL.md +4 -4
- package/bin/skills/clickzetta-external-catalog/SKILL.md +123 -0
- package/bin/skills/clickzetta-external-catalog/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-external-catalog/references/external-catalog-ddl.md +130 -0
- package/bin/skills/clickzetta-file-import-pipeline/SKILL.md +34 -0
- package/bin/skills/clickzetta-java-sdk/SKILL.md +186 -0
- package/bin/skills/clickzetta-java-sdk/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-java-sdk/references/bulkload.md +163 -0
- package/bin/skills/clickzetta-java-sdk/references/realtime.md +212 -0
- package/bin/skills/clickzetta-kafka-ingest-pipeline/SKILL.md +31 -0
- package/bin/skills/clickzetta-metadata/SKILL.md +28 -30
- package/bin/skills/clickzetta-oss-ingest-pipeline/SKILL.md +39 -0
- package/bin/skills/clickzetta-pipeline-review/SKILL.md +377 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/SKILL.md +323 -0
- package/bin/skills/clickzetta-realtime-sync-pipeline/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-semantic-view/SKILL.md +207 -0
- package/bin/skills/clickzetta-semantic-view/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-semantic-view/references/semantic-view-reference.md +167 -0
- package/bin/skills/clickzetta-spark-flink-connector/SKILL.md +92 -0
- package/bin/skills/clickzetta-spark-flink-connector/eval_cases.jsonl +5 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/flink.md +147 -0
- package/bin/skills/clickzetta-spark-flink-connector/references/spark.md +132 -0
- package/bin/skills/clickzetta-sql-pipeline-manager/SKILL.md +115 -9
- package/bin/skills/clickzetta-sql-syntax-guide/SKILL.md +249 -0
- package/bin/skills/clickzetta-sql-syntax-guide/eval_cases.jsonl +3 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/ddl-reference.md +350 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dml-reference.md +279 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/dql-reference.md +504 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/functions-reference.md +372 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-databricks.md +260 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/migration-snowflake.md +382 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-snowflake.md +346 -0
- package/bin/skills/clickzetta-sql-syntax-guide/references/vs-spark.md +229 -0
- package/bin/skills/clickzetta-studio-task-manager/SKILL.md +652 -0
- package/bin/skills/clickzetta-table-lineage/SKILL.md +90 -0
- package/bin/skills/clickzetta-table-lineage/eval_cases.jsonl +1 -0
- package/bin/skills/clickzetta-table-lineage/references/normalize_func.sql +14 -0
- package/bin/skills/clickzetta-table-lineage/references/table_cost.sql +38 -0
- package/bin/skills/clickzetta-table-lineage/references/table_lineage_standalone.html +562 -0
- package/bin/skills/clickzetta-table-lineage/references/table_relation.sql +25 -0
- package/bin/skills/clickzetta-zettapark/SKILL.md +248 -0
- package/bin/skills/clickzetta-zettapark/eval_cases.jsonl +12 -0
- package/bin/skills/clickzetta-zettapark/references/zettapark-api.md +283 -0
- package/package.json +1 -1
- package/bin/skills/clickzetta-ai-vector-search/SKILL.md +0 -160
- package/bin/skills/clickzetta-ai-vector-search/eval_cases.jsonl +0 -4
- package/bin/skills/clickzetta-ai-vector-search/references/vector-search.md +0 -155
package/bin/cz-cli
CHANGED
|
Binary file
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: clickzetta-app-python-sdk
|
|
3
|
+
description: |
|
|
4
|
+
在 Python 应用程序中集成 ClickZetta Lakehouse 的官方 SDK 用法。
|
|
5
|
+
覆盖 clickzetta-connector-python(SQL 查询、参数绑定、批量插入、异步执行)、
|
|
6
|
+
clickzetta-ingestion-python(BulkLoad 批量上传,单线程与分布式模式)、
|
|
7
|
+
clickzetta-ingestion-python-v2(IGS 实时写入,秒级可查,支持主键表 CDC)、
|
|
8
|
+
SQLAlchemy dialect 集成,以及连接参数说明。
|
|
9
|
+
当用户说"Python SDK"、"clickzetta-connector-python"、"clickzetta-ingestion-python"、
|
|
10
|
+
"Python 查询 Lakehouse"、"Python 写入 Lakehouse"、"Python 批量上传"、
|
|
11
|
+
"BulkLoad Python"、"SQLAlchemy Lakehouse"、"Python 连接 Lakehouse"、
|
|
12
|
+
"executemany"、"execute_async"、"参数绑定 Python"、
|
|
13
|
+
"IGS 实时写入"、"实时写入 Python"、"ingestion-python-v2"、
|
|
14
|
+
"主键表写入 Python"、"CDC 写入"、"UPSERT Python"时触发。
|
|
15
|
+
Keywords: Python SDK, clickzetta-connector-python, clickzetta-ingestion-python, bulk insert, async query, SQLAlchemy, IGS
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
# ClickZetta Lakehouse — Python SDK
|
|
19
|
+
|
|
20
|
+
官方提供三个 Python 包:
|
|
21
|
+
- **`clickzetta-connector-python`** — SQL 查询接口(PEP-249 规范),支持参数绑定、批量插入、异步执行、SQLAlchemy dialect
|
|
22
|
+
- **`clickzetta-ingestion-python`** — 高吞吐批量上传(BulkLoad),数据直传对象存储,不消耗计算资源
|
|
23
|
+
- **`clickzetta-ingestion-python-v2`** — IGS 实时写入,秒级可查,支持主键表 CDC(UPSERT/DELETE)
|
|
24
|
+
|
|
25
|
+
阅读 [references/connector.md](references/connector.md) 了解 SQL 查询接口,[references/bulkload.md](references/bulkload.md) 了解批量上传,[references/realtime.md](references/realtime.md) 了解 IGS 实时写入。
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## 安装
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# SQL 查询接口
|
|
33
|
+
pip install clickzetta-connector-python -U
|
|
34
|
+
|
|
35
|
+
# 批量上传(按云环境选择)
|
|
36
|
+
pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
|
|
37
|
+
pip install "clickzetta-ingestion-python[s3]" -U # AWS
|
|
38
|
+
pip install "clickzetta-ingestion-python[all]" -U # 全部(安装较慢)
|
|
39
|
+
|
|
40
|
+
# IGS 实时写入
|
|
41
|
+
pip install clickzetta-ingestion-python-v2
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
> 注意:旧版 `clickzetta-connector` 已停止维护,请迁移到 `clickzetta-connector-python`。
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## 连接参数
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from clickzetta import connect
|
|
52
|
+
|
|
53
|
+
conn = connect(
|
|
54
|
+
username='your_username',
|
|
55
|
+
password='your_password',
|
|
56
|
+
service='api.clickzetta.com', # region.api.clickzetta.com
|
|
57
|
+
instance='your_instance',
|
|
58
|
+
workspace='your_workspace',
|
|
59
|
+
schema='public',
|
|
60
|
+
vcluster='default'
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
| 参数 | 必填 | 说明 |
|
|
65
|
+
|---|---|---|
|
|
66
|
+
| `username` | ✅ | 用户名 |
|
|
67
|
+
| `password` | ✅ | 密码 |
|
|
68
|
+
| `service` | ✅ | 连接地址,格式 `region.api.clickzetta.com` |
|
|
69
|
+
| `instance` | ✅ | 实例名,在 Studio 工作空间 JDBC 连接串中查看 |
|
|
70
|
+
| `workspace` | ✅ | 工作空间名 |
|
|
71
|
+
| `vcluster` | ✅ | 虚拟集群名 |
|
|
72
|
+
| `schema` | ✅ | 默认 schema |
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 快速示例
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
# 查询
|
|
80
|
+
cursor = conn.cursor()
|
|
81
|
+
cursor.execute('SELECT * FROM orders LIMIT 10')
|
|
82
|
+
results = cursor.fetchall()
|
|
83
|
+
cursor.close()
|
|
84
|
+
conn.close()
|
|
85
|
+
|
|
86
|
+
# 参数绑定(防 SQL 注入)——以下写入示例需在上面的 cursor.close() / conn.close() 之前执行
|
|
87
|
+
cursor.execute('INSERT INTO test (id, name) VALUES (?, ?)', binding_params=[1, 'test'])
|
|
88
|
+
|
|
89
|
+
# 批量插入
|
|
90
|
+
data = [(1, 'a'), (2, 'b'), (3, 'c')]
|
|
91
|
+
cursor.executemany('INSERT INTO test (id, name) VALUES (?, ?)', data)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## IGS 实时写入快速示例(ingestion-python-v2)
|
|
95
|
+
|
|
96
|
+
普通表(APPEND_ONLY),示例中的 `CONN_ARGS` 是由上文“连接参数”各项组成的字典:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from clickzetta.connector.v0.connection import connect
|
|
100
|
+
from clickzetta.connector.v0.enums import RealtimeOperation
|
|
101
|
+
from clickzetta_ingestion.realtime.realtime_options import RealtimeOptionsBuilder, FlushMode
|
|
102
|
+
from clickzetta_ingestion.realtime.arrow_stream import RowOperator
|
|
103
|
+
|
|
104
|
+
with connect(**CONN_ARGS) as conn:
|
|
105
|
+
stream = conn.get_realtime_stream(
|
|
106
|
+
schema='your_schema',
|
|
107
|
+
table='your_table',
|
|
108
|
+
operate=RealtimeOperation.APPEND_ONLY,
|
|
109
|
+
options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_BACKGROUND).build()
|
|
110
|
+
)
|
|
111
|
+
row = stream.create_row(RowOperator.INSERT)
|
|
112
|
+
row.set_value('id', 1)
|
|
113
|
+
row.set_value('name', 'alice')
|
|
114
|
+
stream.apply(row)
|
|
115
|
+
stream.close()
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
主键表 CDC(UPSERT / DELETE):
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# 建表:CREATE TABLE users (id STRING NOT NULL PRIMARY KEY, name STRING, age INT);
|
|
122
|
+
|
|
123
|
+
with connect(**CONN_ARGS) as conn:
|
|
124
|
+
stream = conn.get_realtime_stream(
|
|
125
|
+
schema='your_schema',
|
|
126
|
+
table='users',
|
|
127
|
+
operate=RealtimeOperation.CDC, # 主键表必须用 CDC
|
|
128
|
+
options=RealtimeOptionsBuilder().with_flush_mode(FlushMode.AUTO_FLUSH_SYNC).build()
|
|
129
|
+
)
|
|
130
|
+
# UPSERT
|
|
131
|
+
row = stream.create_row(RowOperator.UPSERT)
|
|
132
|
+
row.set_value('id', 'u1')
|
|
133
|
+
row.set_value('name', 'bob')
|
|
134
|
+
row.set_value('age', 25)
|
|
135
|
+
stream.apply(row)
|
|
136
|
+
# DELETE_IGNORE
|
|
137
|
+
row = stream.create_row(RowOperator.DELETE_IGNORE)
|
|
138
|
+
row.set_value('id', 'u1')
|
|
139
|
+
stream.apply(row)
|
|
140
|
+
stream.close()
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 选择指南
|
|
146
|
+
|
|
147
|
+
| 场景 | 推荐方案 |
|
|
148
|
+
|---|---|
|
|
149
|
+
| 查询 / 小批量写入 | `clickzetta-connector-python` |
|
|
150
|
+
| 大批量数据导入(GB 级,间隔 ≥ 5 分钟) | `clickzetta-ingestion-python` BulkLoad |
|
|
151
|
+
| 高频小批写入(间隔 < 5 分钟,秒级可查) | `clickzetta-ingestion-python-v2` 实时写入 |
|
|
152
|
+
| 主键表写入(UPSERT / DELETE) | `clickzetta-ingestion-python-v2` CDC 模式 |
|
|
153
|
+
| SQLAlchemy / ORM 集成 | `clickzetta-connector-python`(内置 dialect) |
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{"case_id":"001","type":"should_call","user_input":"clickzetta-connector-python 怎么执行 SQL 查询","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["cursor","execute"]}
|
|
2
|
+
{"case_id":"002","type":"should_call","user_input":"Python 怎么批量上传数据到 Lakehouse","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["ingestion","BulkLoad"]}
|
|
3
|
+
{"case_id":"003","type":"should_call","user_input":"executemany 批量插入怎么用","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["executemany"]}
|
|
4
|
+
{"case_id":"004","type":"should_call","user_input":"Python SDK 怎么做参数绑定","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["参数绑定"]}
|
|
5
|
+
{"case_id":"005","type":"should_call","user_input":"execute_async 异步执行怎么用","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["async"]}
|
|
6
|
+
{"case_id":"006","type":"should_call","user_input":"clickzetta-ingestion-python-v2 IGS 实时写入怎么用","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["IGS","实时"]}
|
|
7
|
+
{"case_id":"007","type":"should_call","user_input":"Python SDK 支持主键表 UPSERT 吗","expected_skill":"clickzetta-app-python-sdk","expected_output_contains":["UPSERT","主键"]}
|
|
8
|
+
{"case_id":"008","type":"should_not_call","user_input":"Java SDK 怎么写入数据","forbidden_skill":"clickzetta-app-python-sdk"}
|
|
9
|
+
{"case_id":"009","type":"should_not_call","user_input":"帮我写一个 Django 应用","forbidden_skill":"clickzetta-app-python-sdk"}
|
|
10
|
+
{"case_id":"010","type":"should_not_call","user_input":"ZettaPark DataFrame 怎么用","forbidden_skill":"clickzetta-app-python-sdk"}
|
|
11
|
+
{"case_id":"011","type":"should_not_call","user_input":"怎么创建 VCluster","forbidden_skill":"clickzetta-app-python-sdk"}
|
|
12
|
+
{"case_id":"012","type":"should_not_call","user_input":"pip install numpy 报错","forbidden_skill":"clickzetta-app-python-sdk"}
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# clickzetta-ingestion-python BulkLoad 详细参考
|
|
2
|
+
|
|
3
|
+
## 安装
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
# 按云环境选择(推荐按需安装,all 安装较慢且可能冲突)
|
|
7
|
+
pip install "clickzetta-ingestion-python[oss]" -U # 阿里云
|
|
8
|
+
pip install "clickzetta-ingestion-python[s3]" -U # AWS
|
|
9
|
+
pip install "clickzetta-ingestion-python[cos]" -U # 腾讯云
|
|
10
|
+
pip install "clickzetta-ingestion-python[gcp]" -U # Google Cloud
|
|
11
|
+
pip install "clickzetta-ingestion-python[all]" -U # 全部
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## 工作原理
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
[SDK 写入数据] → [对象存储] → [调用 commit()] → [触发 SQL 导入] → [Lakehouse 表]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
- 数据上传阶段不消耗计算资源
|
|
21
|
+
- `commit()` 触发从对象存储到 Lakehouse 表的导入,消耗少量计算资源
|
|
22
|
+
- `commit()` 只能调用一次,commit 后数据可见
|
|
23
|
+
|
|
24
|
+
## 使用限制
|
|
25
|
+
|
|
26
|
+
- **不支持主键(pk)表写入**
|
|
27
|
+
- **不适合时间间隔小于 5 分钟的高频写入**
|
|
28
|
+
|
|
29
|
+
## 单线程写入
|
|
30
|
+
|
|
31
|
+
### 建表
|
|
32
|
+
|
|
33
|
+
```sql
|
|
34
|
+
CREATE TABLE public.bulkload_test (
|
|
35
|
+
i BIGINT,
|
|
36
|
+
s STRING,
|
|
37
|
+
d DOUBLE
|
|
38
|
+
);
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### 完整示例
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from clickzetta import connect
|
|
45
|
+
|
|
46
|
+
conn = connect(
|
|
47
|
+
username='your_username',
|
|
48
|
+
password='your_password',
|
|
49
|
+
service='api.clickzetta.com',
|
|
50
|
+
instance='your_instance',
|
|
51
|
+
workspace='your_workspace',
|
|
52
|
+
schema='public',
|
|
53
|
+
vcluster='default'
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
|
|
57
|
+
|
|
58
|
+
writer = bulkload_stream.open_writer(0) # 单线程传 0
|
|
59
|
+
for index in range(1000000):
|
|
60
|
+
row = writer.create_row()
|
|
61
|
+
row.set_value('i', index) # 按列名设值
|
|
62
|
+
row.set_value('s', 'Hello')
|
|
63
|
+
row.set_value('d', 123.456)
|
|
64
|
+
writer.write(row)
|
|
65
|
+
writer.close()
|
|
66
|
+
|
|
67
|
+
bulkload_stream.commit() # 提交,数据可见
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## 读取 CSV 写入示例
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from clickzetta import connect
|
|
74
|
+
import csv
|
|
75
|
+
|
|
76
|
+
conn = connect(
|
|
77
|
+
username='',
|
|
78
|
+
password='',
|
|
79
|
+
service='api.clickzetta.com',
|
|
80
|
+
instance='',
|
|
81
|
+
workspace='',
|
|
82
|
+
schema='public',
|
|
83
|
+
vcluster='default_ap'
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulk_order_payments')
|
|
87
|
+
writer = bulkload_stream.open_writer(0)
|
|
88
|
+
|
|
89
|
+
with open('olist_order_payments_dataset.csv', 'r') as csvfile:
|
|
90
|
+
reader = csv.reader(csvfile)
|
|
91
|
+
next(reader) # 跳过 header
|
|
92
|
+
for record in reader:
|
|
93
|
+
row = writer.create_row()
|
|
94
|
+
row.set_value('order_id', record[0])
|
|
95
|
+
row.set_value('payment_sequence', int(record[1]))
|
|
96
|
+
row.set_value('payment_type', record[2])
|
|
97
|
+
row.set_value('payment_installments', int(record[3]))
|
|
98
|
+
row.set_value('payment_value', float(record[4]))
|
|
99
|
+
writer.write(row) # ⚠️ 必须调用,否则数据不发送到服务端
|
|
100
|
+
|
|
101
|
+
writer.close()
|
|
102
|
+
bulkload_stream.commit()
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## 写入模式
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from clickzetta.bulkload.bulkload_enums import BulkLoadOperation
|
|
109
|
+
|
|
110
|
+
# APPEND 模式(默认):新数据追加,不影响旧数据
|
|
111
|
+
bulkload_stream = conn.create_bulkload_stream(schema='public', table='my_table')
|
|
112
|
+
|
|
113
|
+
# OVERWRITE 模式:清空旧数据,写入新数据
|
|
114
|
+
bulkload_stream = conn.create_bulkload_stream(
|
|
115
|
+
schema='public',
|
|
116
|
+
table='my_table',
|
|
117
|
+
operation=BulkLoadOperation.OVERWRITE
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
# 分区表 OVERWRITE(只覆盖指定分区)
|
|
121
|
+
bulkload_stream = conn.create_bulkload_stream(
|
|
122
|
+
schema='public',
|
|
123
|
+
table='my_partitioned_table',
|
|
124
|
+
partition_spec='pt=2024-01-01',
|
|
125
|
+
operation=BulkLoadOperation.OVERWRITE
|
|
126
|
+
)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 分布式并发写入
|
|
130
|
+
|
|
131
|
+
适合 GB 级以上数据,多进程并发写入同一 stream,最后统一 commit。
|
|
132
|
+
|
|
133
|
+
### 控制进程
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import subprocess
|
|
137
|
+
from clickzetta import connect
|
|
138
|
+
|
|
139
|
+
conn = connect(username='username', password='password',
|
|
140
|
+
service='api.clickzetta.com', instance='instance',
|
|
141
|
+
workspace='quickstart_ws', schema='public', vcluster='default')
|
|
142
|
+
|
|
143
|
+
bulkload_stream = conn.create_bulkload_stream(schema='public', table='bulkload_test')
|
|
144
|
+
stream_id = bulkload_stream.get_stream_id()
|
|
145
|
+
|
|
146
|
+
# 启动多个写入进程,每个进程用不同的 writer_id
|
|
147
|
+
p1 = subprocess.Popen(['python', 'writer.py', stream_id, '1'])
|
|
148
|
+
p2 = subprocess.Popen(['python', 'writer.py', stream_id, '2'])
|
|
149
|
+
p1.wait()
|
|
150
|
+
p2.wait()
|
|
151
|
+
|
|
152
|
+
bulkload_stream.commit() # 所有 writer 完成后统一 commit
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 写入进程
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
import sys
|
|
159
|
+
from clickzetta import connect
|
|
160
|
+
|
|
161
|
+
conn = connect(username='username', password='password',
|
|
162
|
+
service='api.clickzetta.com', instance='instance',
|
|
163
|
+
workspace='quickstart_ws', schema='public', vcluster='default')
|
|
164
|
+
|
|
165
|
+
stream_id = sys.argv[1]
|
|
166
|
+
writer_id = int(sys.argv[2])
|
|
167
|
+
|
|
168
|
+
# 通过 stream_id 获取已有 stream(不创建新的)
|
|
169
|
+
bulkload_stream = conn.get_bulkload_stream(
|
|
170
|
+
schema='public', table='bulkload_test', stream_id=stream_id
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
writer = bulkload_stream.open_writer(writer_id) # writer_id 必须唯一
|
|
174
|
+
for index in range(1, 1000000):
|
|
175
|
+
row = writer.create_row()
|
|
176
|
+
row.set_value('i', index)
|
|
177
|
+
row.set_value('s', 'Hello')
|
|
178
|
+
row.set_value('d', 123.456)
|
|
179
|
+
writer.write(row)
|
|
180
|
+
writer.close()
|
|
181
|
+
# 写入进程不调用 commit,只有控制进程调用
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## 关键 API
|
|
185
|
+
|
|
186
|
+
| API | 说明 |
|
|
187
|
+
|---|---|
|
|
188
|
+
| `conn.create_bulkload_stream(schema, table)` | 创建新的 bulkload stream |
|
|
189
|
+
| `conn.get_bulkload_stream(schema, table, stream_id)` | 获取已有 stream(分布式写入用) |
|
|
190
|
+
| `bulkload_stream.get_stream_id()` | 获取 stream id(传给写入进程) |
|
|
191
|
+
| `bulkload_stream.open_writer(writer_id)` | 创建 writer,id 必须唯一 |
|
|
192
|
+
| `writer.create_row()` | 创建行对象 |
|
|
193
|
+
| `row.set_value(column_name, value)` | 按列名设值 |
|
|
194
|
+
| `writer.write(row)` | 写入行(必须调用) |
|
|
195
|
+
| `writer.close()` | 关闭 writer(写完必须调用) |
|
|
196
|
+
| `bulkload_stream.commit()` | 提交,数据可见(只能调用一次) |
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# clickzetta-connector-python 详细参考
|
|
2
|
+
|
|
3
|
+
## 安装
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install clickzetta-connector-python -U
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## 建立连接
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from clickzetta import connect
|
|
13
|
+
|
|
14
|
+
conn = connect(
|
|
15
|
+
username='your_username',
|
|
16
|
+
password='your_password',
|
|
17
|
+
service='api.clickzetta.com',
|
|
18
|
+
instance='your_instance',
|
|
19
|
+
workspace='your_workspace',
|
|
20
|
+
schema='public',
|
|
21
|
+
vcluster='default'
|
|
22
|
+
)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## 基本查询
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
cursor = conn.cursor()
|
|
29
|
+
cursor.execute('SELECT * FROM orders LIMIT 10')
|
|
30
|
+
results = cursor.fetchall()
|
|
31
|
+
for row in results:
|
|
32
|
+
print(row)
|
|
33
|
+
cursor.close()
|
|
34
|
+
conn.close()
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## 参数绑定
|
|
38
|
+
|
|
39
|
+
支持两种风格(PEP-249 规范):
|
|
40
|
+
|
|
41
|
+
### qmark 风格(推荐)
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
# 单行插入
|
|
45
|
+
cursor.execute('INSERT INTO test (id, name) VALUES (?, ?)', binding_params=[1, 'test'])
|
|
46
|
+
|
|
47
|
+
# 批量插入(executemany)
|
|
48
|
+
data = [
|
|
49
|
+
(1, 'test1'),
|
|
50
|
+
(2, 'test2'),
|
|
51
|
+
(3, 'test3')
|
|
52
|
+
]
|
|
53
|
+
cursor.executemany('INSERT INTO test (id, name) VALUES (?, ?)', data)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### pyformat 风格
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
data = {'id': 1, 'name': 'test'}
|
|
60
|
+
cursor.execute('INSERT INTO test (id, name) VALUES (%(id)s, %(name)s)', data)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## SQL hints(超时控制等)
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
my_param = {
|
|
67
|
+
'hints': {
|
|
68
|
+
'sdk.job.timeout': 30 # 查询超时秒数
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
cursor.execute('SELECT * FROM large_table', my_param)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 异步执行(长时间查询)
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import time
|
|
78
|
+
|
|
79
|
+
cursor.execute_async('SELECT * FROM large_table')
|
|
80
|
+
|
|
81
|
+
while not cursor.is_job_finished():
|
|
82
|
+
print("查询执行中...")
|
|
83
|
+
time.sleep(1)
|
|
84
|
+
|
|
85
|
+
results = cursor.fetchall()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 结果保存到 CSV
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import csv
|
|
92
|
+
|
|
93
|
+
cursor.execute('SELECT * FROM orders LIMIT 1000')
|
|
94
|
+
results = cursor.fetchall()
|
|
95
|
+
|
|
96
|
+
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
|
|
97
|
+
writer = csv.writer(f)
|
|
98
|
+
writer.writerow([col[0] for col in cursor.description])
|
|
99
|
+
writer.writerows(results)
|
|
100
|
+
|
|
101
|
+
cursor.close()
|
|
102
|
+
conn.close()
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## SQLAlchemy 集成
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from sqlalchemy import create_engine, text
|
|
109
|
+
|
|
110
|
+
engine = create_engine(
|
|
111
|
+
"clickzetta://username:password@instance.api.clickzetta.com/workspace"
|
|
112
|
+
"?schema=public&vcluster=default"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
with engine.connect() as conn:
|
|
116
|
+
result = conn.execute(text("SELECT * FROM orders LIMIT 10"))
|
|
117
|
+
for row in result:
|
|
118
|
+
print(row)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### SQLAlchemy + pandas
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import pandas as pd
|
|
125
|
+
from sqlalchemy import create_engine, text
|
|
126
|
+
|
|
127
|
+
engine = create_engine(
|
|
128
|
+
"clickzetta://username:password@instance.api.clickzetta.com/workspace"
|
|
129
|
+
"?schema=public&vcluster=default"
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
with engine.connect() as conn:
|
|
133
|
+
result = conn.execute(text("SELECT * FROM orders"))
|
|
134
|
+
df = pd.DataFrame(result.fetchall(), columns=result.keys())
|
|
135
|
+
|
|
136
|
+
print(df.head())
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## 注意事项
|
|
140
|
+
|
|
141
|
+
- 不支持 `commit()` 和 `rollback()` 接口
|
|
142
|
+
- 需要 `clickzetta-connector-python >= 0.8.82` 才能使用参数绑定和异步执行
|
|
143
|
+
- 旧版 `clickzetta-connector` 已停止维护,请迁移到 `clickzetta-connector-python`
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# clickzetta-ingestion-python-v2 实时写入详细参考
|
|
2
|
+
|
|
3
|
+
## 安装
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install clickzetta-ingestion-python-v2
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
> 注意:这是独立的包,与 `clickzetta-ingestion-python`(BulkLoad)不同。
|
|
10
|
+
|
|
11
|
+
## 与 BulkLoad 的区别
|
|
12
|
+
|
|
13
|
+
| 特性 | BulkLoad (`ingestion-python`) | 实时写入 (`ingestion-python-v2`) |
|
|
14
|
+
|---|---|---|
|
|
15
|
+
| 写入延迟 | 分钟级(commit 后可见) | 秒级可查 |
|
|
16
|
+
| 适用频率 | 间隔 ≥ 5 分钟 | 高频小批(5 分钟以内) |
|
|
17
|
+
| 主键表支持 | ❌ 不支持 | ✅ 支持(CDC 模式) |
|
|
18
|
+
| UPDATE/DELETE | ❌ | ✅ UPSERT / DELETE_IGNORE |
|
|
19
|
+
| 写入原理 | 上传对象存储 → commit 导入 | 直写 Ingestion Service |
|
|
20
|
+
|
|
21
|
+
## 工作原理
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
[SDK 写入] → [Ingestion Service] → [混合表(秒级可查)] → [~1分钟后自动 commit] → [普通表(stream/DT 可见)]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
- 写入后秒级即可 SELECT 查到数据
|
|
28
|
+
- table stream、materialized view、dynamic table 需等约 1 分钟(commit 后)才能看到新写入的数据
|
|
29
|
+
- commit 后后台合并为普通表,之后可执行 UPDATE/MERGE/DELETE
|
|
30
|
+
|
|
31
|
+
## 普通表写入(APPEND_ONLY)
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from clickzetta.connector.v0.connection import connect
|
|
35
|
+
from clickzetta.connector.v0.enums import RealtimeOperation
|
|
36
|
+
from clickzetta_ingestion.realtime.realtime_options import RealtimeOptionsBuilder, FlushMode
|
|
37
|
+
from clickzetta_ingestion.realtime.arrow_stream import RowOperator
|
|
38
|
+
|
|
39
|
+
with connect(
|
|
40
|
+
username='your_username',
|
|
41
|
+
password='your_password',
|
|
42
|
+
service='your_service_endpoint',
|
|
43
|
+
instance='your_instance',
|
|
44
|
+
workspace='your_workspace',
|
|
45
|
+
schema='your_schema',
|
|
46
|
+
vcluster='default'
|
|
47
|
+
) as conn:
|
|
48
|
+
stream = conn.get_realtime_stream(
|
|
49
|
+
schema='your_schema',
|
|
50
|
+
table='your_table',
|
|
51
|
+
operate=RealtimeOperation.APPEND_ONLY,
|
|
52
|
+
options=RealtimeOptionsBuilder()
|
|
53
|
+
.with_flush_mode(FlushMode.AUTO_FLUSH_BACKGROUND)
|
|
54
|
+
.build()
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
for i in range(1000):
|
|
58
|
+
row = stream.create_row(RowOperator.INSERT)
|
|
59
|
+
row.set_value('id', str(i))
|
|
60
|
+
row.set_value('name', f'user_{i}')
|
|
61
|
+
stream.apply(row)
|
|
62
|
+
|
|
63
|
+
stream.close()
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## 主键表写入(CDC 模式)
|
|
67
|
+
|
|
68
|
+
主键表**必须**使用 `RealtimeOperation.CDC` + `FlushMode.AUTO_FLUSH_SYNC`。
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from clickzetta.connector.v0.connection import connect
|
|
72
|
+
from clickzetta.connector.v0.enums import RealtimeOperation
|
|
73
|
+
from clickzetta_ingestion.realtime.realtime_options import RealtimeOptionsBuilder, FlushMode
|
|
74
|
+
from clickzetta_ingestion.realtime.arrow_stream import RowOperator
|
|
75
|
+
|
|
76
|
+
with connect(...) as conn:
|
|
77
|
+
stream = conn.get_realtime_stream(
|
|
78
|
+
schema='your_schema',
|
|
79
|
+
table='your_pk_table',
|
|
80
|
+
operate=RealtimeOperation.CDC,
|
|
81
|
+
options=RealtimeOptionsBuilder()
|
|
82
|
+
.with_flush_mode(FlushMode.AUTO_FLUSH_SYNC) # PK 表强制同步刷写
|
|
83
|
+
.build()
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# UPSERT:存在则更新,不存在则插入
|
|
87
|
+
row = stream.create_row(RowOperator.UPSERT)
|
|
88
|
+
row.set_value('id', 'id_1')
|
|
89
|
+
row.set_value('name', 'alice')
|
|
90
|
+
row.set_value('age', 30)
|
|
91
|
+
stream.apply(row)
|
|
92
|
+
|
|
93
|
+
# DELETE_IGNORE:删除,目标行不存在时自动忽略
|
|
94
|
+
row = stream.create_row(RowOperator.DELETE_IGNORE)
|
|
95
|
+
row.set_value('id', 'id_1')
|
|
96
|
+
stream.apply(row)
|
|
97
|
+
|
|
98
|
+
stream.close()
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## 操作类型对照
|
|
102
|
+
|
|
103
|
+
| Stream 模式 | 可用 RowOperator | 适用表类型 |
|
|
104
|
+
|---|---|---|
|
|
105
|
+
| `APPEND_ONLY` | `INSERT` | 普通表 |
|
|
106
|
+
| `CDC` | `UPSERT`、`DELETE_IGNORE` | 主键表(必须) |
|
|
107
|
+
|
|
108
|
+
## FlushMode 说明
|
|
109
|
+
|
|
110
|
+
| 模式 | 说明 | 适用场景 |
|
|
111
|
+
|---|---|---|
|
|
112
|
+
| `AUTO_FLUSH_BACKGROUND` | 异步刷写,高吞吐 | 普通表,对顺序无要求 |
|
|
113
|
+
| `AUTO_FLUSH_SYNC` | 同步刷写,阻塞式 | 主键表(强制),需保证顺序 |
|
|
114
|
+
| `MANUAL_FLUSH` | 手动调用 `stream.flush()` | 精确控制刷写时机 |
|
|
115
|
+
|
|
116
|
+
> ⚠️ 主键表不支持 `AUTO_FLUSH_BACKGROUND`,会自动重置为 `AUTO_FLUSH_SYNC`。
|
|
117
|
+
|
|
118
|
+
## 关键注意事项
|
|
119
|
+
|
|
120
|
+
- 表结构变更前需先停止实时写入任务,变更后约 90 分钟再重启(Flink Connector 的 schema change sink 除外)
|
|
121
|
+
- 主键表如果同时是分区表,分区列必须是 primary key 的子集
|
|
122
|
+
- 避免 `flush()` 过于频繁,会产生大量小文件
|