recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/clickhouse_native.py

@@ -0,0 +1,351 @@
# description : use clickhouse-driver [https://github.com/mymarilyn/clickhouse-driver]

import datetime
import functools
import json
import re
import shutil
import subprocess

import clickhouse_driver

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.connector.dbapi import DBAPIConnector
from recurvedata.pigeon.schema import types
from recurvedata.pigeon.utils import fs

_clickhouse_type_to_canonical_type = {
    # pigeon has no unsigned integer types; map each UInt to the next wider INT to avoid overflow
    "UInt8": types.INT16,
    "UInt16": types.INT32,
    "UInt32": types.INT64,
    "UInt64": types.INT64,
    "Int8": types.INT8,
    "Int16": types.INT16,
    "Int32": types.INT32,
    "Int64": types.INT64,
    "Float32": types.FLOAT32,
    "Float64": types.FLOAT64,
    "String": types.STRING,
    "FixedString": types.STRING,
    "Date": types.DATE,
    "DateTime": types.DATETIME,
    "Enum": types.STRING,
    "Array": types.JSON,
}

_canonical_type_to_clickhouse_type = {
    types.BOOLEAN: "UInt8",
    types.INT8: "Int8",
    types.INT16: "Int16",
    types.INT32: "Int32",
    types.INT64: "Int64",
    types.FLOAT32: "Float32",
    types.FLOAT64: "Float64",
    types.DATE: "Date",
    types.DATETIME: "DateTime",
    types.STRING: "String",
    types.JSON: "String",
}

nullable_type_p = re.compile(r"Nullable\((?P<inner_type_code>.*)\)")
array_type_p = re.compile(r"Array\((?P<inner_type_code>.*)\)")
low_cardinality_type_p = re.compile(r"LowCardinality\((?P<inner_type_code>.*)\)")


@register_connector_class(["clickhouse_native", "clickhouse"])
class ClickHouseConnector(DBAPIConnector):
    _sqla_driver = "clickhouse+native"
    _default_port = 9000
    _default_database = "default"

    def is_clickhouse_native(self):
        return True

    def connect_impl(self, autocommit=False, *args, **kwargs):
        conn_kwargs = {
            "host": self.host,
            "port": self.port,
            "user": self.user,
            "password": self.password,
            "database": self.database,
            "compression": True,
        }
        conn_kwargs.update(self.kwargs)
        conn_kwargs.update(kwargs)
        return clickhouse_driver.connect(**conn_kwargs)

    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, stream=False, max_rows=0, **kwargs):
        """Return a ClickHouse DBAPI cursor.
        stream: enable or disable result streaming
        max_rows: maximum number of rows to buffer at a time
        """
        ch_cursor = super().cursor(autocommit=autocommit, dryrun=dryrun, commit_on_close=commit_on_close, **kwargs)
        if stream:
            ch_cursor._cursor.set_stream_results(stream_results=stream, max_row_buffer=max_rows)
        return ch_cursor

    def has_table(self, table, database=None, **kwargs) -> bool:
        # check whether the table exists: https://clickhouse.com/docs/en/sql-reference/statements/exists/
        database = database or self.database
        rows = self.fetchall(f"EXISTS `{database}`.`{table}`")
        return bool(rows[0][0])

    def get_columns(self, table, database=None, exclude=None):
        database = database or self.database
        if not self.has_table(table, database):
            raise ValueError("Table {!r} does not exist in {!r}".format(table, database))
        with self.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(database), self.quote_identifier(table))
            )
            cols = [x.name for x in cursor.description if x.name not in (exclude or ())]
        return cols

    def generate_ddl(self, table, database=None, if_exists=True):
        database = database or self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} does not exist in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(database)}.{self.quote_identifier(table)}")
            if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
            body = re.search(r"CREATE TABLE (.*)", cursor.fetchall()[0][0], flags=re.S).group(1)
            return f"CREATE TABLE{if_exists_stmt}{body}"

    @staticmethod
    def to_canonical_type(type_code, size):
        if "nullable" in type_code.lower():
            type_code = nullable_type_p.search(type_code).groupdict()["inner_type_code"]
        if "lowcardinality" in type_code.lower():
            type_code = low_cardinality_type_p.search(type_code).groupdict()["inner_type_code"]
        if "FixedString" in type_code:
            type_code = "FixedString"
        if "Array" in type_code:
            type_code = "Array"
        return _clickhouse_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        return _canonical_type_to_clickhouse_type.get(canonical_type, "String")

    def generate_create_table_ddl(self, name, schema, **kwargs):
        """Generate a CREATE TABLE statement from a schema. The table engine is taken from kwargs and defaults to Log."""
        # every column is created as Nullable
        cols = []
        for f in schema:
            col_name = self.quote_identifier(f.name)
            if f.comment:
                cols.append(f"{col_name} Nullable({self.from_canonical_type(f.type, f.size)}) COMMENT {f.comment!r}")
            else:
                cols.append(f"{col_name} Nullable({self.from_canonical_type(f.type, f.size)})")

        col_types = ",\n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n)"

        # ddl = super().generate_create_table_ddl(name, schema)

        # Table Engines: https://clickhouse.yandex/docs/en/operations/table_engines/
        engine = kwargs.get("ENGINE", "Log")
        ddl += f" ENGINE = {engine}"
        return ddl

    def _handle_row(self, row, columns):
        rv = []
        for col, value in zip(columns, row):
            rv.append(col.cast(value))
        return tuple(rv)

    def _get_columns_with_type(self, table):
        with self.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(self.database), self.quote_identifier(table))
            )
            cursor.fetchall()
            cols = [ClickHouseField(x.name, x.type_code) for x in cursor.description]
        return cols

    def _bulk_insert(self, cursor, table, cols, rows):
        if not rows:
            return
        if cols:
            field_names = "({})".format(", ".join([self.quote_identifier(x) for x in cols]))
        else:
            field_names = ""
        sql = f"INSERT INTO {table} {field_names} VALUES"
        cursor.executemany(sql, rows)
        cursor.connection.commit()

    def load_csv(
        self,
        table,
        filename,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        using_insert=False,
        **kwargs,
    ):
        """Load a CSV file into a ClickHouse table via the clickhouse-client binary or batched INSERTs from Python."""
        infile = filename
        if skiprows:
            infile = fs.skip_lines(filename, skiprows)

        clickhouse_client_binary = shutil.which("clickhouse-client")
        try_clickhouse_client = (not using_insert) and clickhouse_client_binary
        if try_clickhouse_client:
            self.logger.info("found clickhouse-client at %s, loading the file with it", clickhouse_client_binary)
            self._load_csv_by_clickhouse_client(clickhouse_client_binary, table, infile, delimiter)
        else:
            # fall back to batched INSERTs
            self._load_csv_by_inserting(table, infile, delimiter, quotechar, lineterminator, escapechar, **kwargs)

        if infile != filename:
            fs.remove_files_safely(infile)

    def _load_csv_by_clickhouse_client(self, binary, table, filename, delimiter=","):
        if "." not in table:
            table = f"{self.database}.{table}"
        command = " ".join(
            [
                binary,
                f"--host {self.host}",
                f"--port {self.port}",
                f"--user {self.user}",
                f"--password {self.password}",
                f'--format_csv_delimiter="{delimiter}"',
                f'--query="INSERT INTO {table} FORMAT CSV"' f" < {filename}",
            ]
        )
        self.logger.info(command)
        subprocess.check_call(command, shell=True)

    def _load_csv_by_inserting(self, table, filename, delimiter, quotechar, lineterminator, escapechar, **kwargs):
        # https://clickhouse.yandex/docs/en/query_language/insert_into/
        # Performance considerations:
        # INSERT sorts the input data by primary key and splits it into partitions by the partition key.
        # Inserting into several partitions at once can significantly reduce performance.
        # To avoid this:
        #
        # - Add data in fairly large batches, such as 100,000 rows at a time.
        # - Group data by month before uploading it to ClickHouse.
        batch_size = kwargs.get("batch_size") or 10000

        # https://clickhouse.yandex/docs/en/single/#strong-typing
        columns = self._get_columns_with_type(table)
        values_hook = functools.partial(self._handle_row, columns=columns)
        column_names = [x.name for x in columns]

        self.logger.info("columns: %s", columns)
        self.logger.info("batch size: %s", batch_size)
        self.load_csv_by_inserting(
            table=table,
            filename=filename,
            columns=column_names,
            delimiter=delimiter,
            quotechar=quotechar,
            lineterminator=lineterminator,
            escapechar=escapechar,
            skiprows=0,
            batch_size=batch_size,
            values_hook=values_hook,
            concurrency=kwargs.get("concurrency", 1),
        )


class ClickHouseField:
    """Converts ClickHouse data types to Python values."""

    def __init__(self, name, type_code):
        self.name = name
        self.type_code = type_code

        if self.is_array() or self.is_nullable() or self.is_low_cardinality():
            self.inner_type = self._infer_inner_type()
        else:
            self.inner_type = None

    @classmethod
    def get_converters(cls, columns_with_type: dict):
        return {name: cls(name, type_code) for name, type_code in columns_with_type.items()}

    def is_array(self):
        return self.type_code.startswith("Array")

    def is_nullable(self):
        return self.type_code.startswith("Nullable")

    def is_low_cardinality(self):
        return self.type_code.startswith("LowCardinality")

    @property
    def _real_type(self):
        if self.is_nullable():
            return self.inner_type
        return self.type_code

    def is_int(self):
        return self._real_type in ["UInt8", "UInt16", "UInt32", "UInt64", "Int8", "Int16", "Int32", "Int64"]

    def is_float(self):
        return self._real_type in ["Float32", "Float64"]

    def is_string(self):
        return self._real_type == "String"

    def _infer_inner_type(self):
        for f, p in [
            (self.is_array, array_type_p),
            (self.is_nullable, nullable_type_p),
            (self.is_low_cardinality, low_cardinality_type_p),
        ]:
            if f():
                return p.search(self.type_code).groupdict()["inner_type_code"]
        raise TypeError("No inner type, use type_code instead")

    def _convert_datetime(self, value, type_code):
        if type_code == "DateTime":
            return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
        return datetime.datetime.strptime(value, "%Y-%m-%d").date()

    def cast(self, value):
        if value is None:
            if self.is_string():
                return ""
            else:
                return None

        if self.type_code in ("DateTime", "Date"):
            return self._convert_datetime(value, self.type_code)

        if self.is_string():
            return value

        if self.is_int() or self.is_float():
            if value == "":
                return 0
            if self.is_int():
                return int(value)
            else:
                return float(value)

        # handle Array types
        if self.is_array():
            if isinstance(value, str):
                try:
                    value = json.loads(value)
                except Exception:
                    value = []

            if self.inner_type == "DateTime":
                value = [self._convert_datetime(x, self.inner_type) for x in value]
            return value

        # other types are passed through unchanged for now
        return value

    def __repr__(self):
        return f"<ClickHouseField({repr(self.name)}, {repr(self.type_code)})>"