recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib may be problematic; click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Azure Synapse Analytics(previous Azure SQL Data Warehouse)
|
|
3
|
+
|
|
4
|
+
doc:https://docs.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-overview-what-is
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from recurvedata.pigeon.connector._registry import register_connector_class
|
|
10
|
+
from recurvedata.pigeon.connector.mssql import AzureSQLServerConnector
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@register_connector_class(["azure_synapse", "azure_dw"])
class AzureSynapseConnector(AzureSQLServerConnector):
    """Connector for Azure Synapse Analytics (formerly Azure SQL Data Warehouse).

    Reuses the Azure SQL Server connector, adding Synapse-specific behavior
    for bulk CSV loading and DataFrame reads.

    doc: https://docs.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-overview-what-is
    """

    # Synapse statements fail inside implicit transactions, so force autocommit.
    _autocommit = True

    def is_azure_synapse(self):
        """Return True so shared MSSQL code paths can branch on Synapse."""
        return True

    def load_csv(
        self,
        table,
        filename,
        schema="dbo",
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        """Load a local CSV file into ``schema.table`` via the bulk-load path.

        Extra keyword arguments are forwarded to ``load_csv_bulk`` and may
        override the explicit CSV options above.
        """
        options = dict(
            columns=columns,
            delimiter=delimiter,
            quotechar=quotechar,
            lineterminator=lineterminator,
            escapechar=escapechar,
            skiprows=skiprows,
        )
        # idiomatic: update directly from the mapping (was options.update(**kwargs))
        options.update(kwargs)
        self.load_csv_bulk(table, filename, schema, **options)

    def get_pandas_df(self, query, parameters=None, **kwargs):
        """Run ``query`` and return the result as a pandas DataFrame.

        Without AUTOCOMMIT, Synapse raises: "An attempt to complete a
        transaction has failed. No corresponding transaction found."
        """
        con = self.create_engine({"isolation_level": "AUTOCOMMIT"})
        try:
            df = pd.read_sql_query(sql=query, con=con, params=parameters, **kwargs)
        finally:
            # Always release the engine's connection pool, even on failure.
            con.dispose()
        return df
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import retrying
|
|
2
|
+
from cassandra import ReadTimeout, cqltypes
|
|
3
|
+
from cassandra.auth import PlainTextAuthProvider
|
|
4
|
+
from cassandra.cluster import Cluster, default_lbp_factory
|
|
5
|
+
from cassandra.encoder import Encoder
|
|
6
|
+
from cassandra.policies import ConstantReconnectionPolicy, RetryPolicy
|
|
7
|
+
from cassandra.query import bind_params
|
|
8
|
+
|
|
9
|
+
from recurvedata.pigeon.connector._registry import register_connector_class
|
|
10
|
+
from recurvedata.pigeon.schema import Schema, types
|
|
11
|
+
from recurvedata.pigeon.utils import LoggingMixin
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NullSession(LoggingMixin):
    """A no-op stand-in for a Cassandra ``Session``.

    Implements the slice of the Session API callers rely on (``execute``,
    ``shutdown``, context-manager protocol) but only logs the rendered
    statements instead of running them — useful for dry runs.
    """

    def execute(self, query, parameters=None, *args, **kwargs):
        rendered = bind_params(query, parameters, Encoder())
        self.logger.info(rendered)
        return None

    def shutdown(self):
        self.logger.info("shutting down null session")
        return None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()
|
|
34
|
+
|
|
35
|
+
class ClosingSession(object):
    """Proxy around a Cassandra session that also tears down its cluster.

    Attribute access is delegated to the wrapped session; ``shutdown`` (and
    its aliases ``close`` and context-manager exit) shuts down both the
    session and the cluster that produced it.
    """

    def __init__(self, session):
        self._session = session
        # Capture the owning cluster up front so it can always be shut down.
        self._cluster = session.cluster

    def __getattr__(self, name):
        # Delegate everything we don't define to the real session.
        return getattr(self._session, name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()

    def close(self):
        # Alias so the proxy also quacks like a file-style resource.
        self.shutdown()

    def shutdown(self):
        # Session first, then the cluster that owns it.
        for resource in (self._session, self._cluster):
            resource.shutdown()
+
|
|
56
|
+
|
|
57
|
+
@register_connector_class("cassandra")
class CassandraConnector(object):
    """Connector for Apache Cassandra built directly on the DataStax driver.

    Provides cluster/session factories, a dry-run-aware session helper, and
    mappings between CQL types and pigeon's canonical types.
    """

    # cassandra-driver cqltypes class -> pigeon canonical type.
    # Types without an exact canonical match are widened (varint -> INT64)
    # or stringified (UUID -> STRING).
    _cqltype_to_canonical_type = {
        cqltypes.BooleanType: types.BOOLEAN,
        cqltypes.ByteType: types.INT8,
        cqltypes.ShortType: types.INT16,
        cqltypes.Int32Type: types.INT32,
        cqltypes.IntegerType: types.INT64,
        cqltypes.LongType: types.INT64,
        cqltypes.TimeType: types.INT64,
        cqltypes.FloatType: types.FLOAT32,
        cqltypes.DoubleType: types.FLOAT64,
        cqltypes.Decimal: types.FLOAT64,
        cqltypes.SimpleDateType: types.DATE,
        cqltypes.DateType: types.DATETIME,
        cqltypes.TimestampType: types.DATETIME,
        cqltypes.VarcharType: types.STRING,
        cqltypes.UUIDType: types.STRING,
        cqltypes.UTF8Type: types.STRING,
    }

    # pigeon canonical type -> CQL type name.
    _canonical_type_to_cqltype = {
        types.BOOLEAN: cqltypes.BooleanType.typename,
        types.INT8: cqltypes.ByteType.typename,
        types.INT16: cqltypes.ShortType.typename,
        types.INT32: cqltypes.Int32Type.typename,
        types.INT64: cqltypes.LongType.typename,
        types.FLOAT32: cqltypes.FloatType.typename,
        types.FLOAT64: cqltypes.DoubleType.typename,
        types.DATE: cqltypes.SimpleDateType.typename,
        types.DATETIME: cqltypes.TimestampType.typename,
        types.STRING: cqltypes.UTF8Type.typename,
        types.JSON: cqltypes.UTF8Type.typename,
    }

    def __init__(self, host, port, database=None, user=None, password=None, *args, **kwargs):
        """Store connection settings; no connection is made until ``connect``.

        ``database`` is the keyspace passed to ``cluster.connect``. Extra
        args/kwargs are retained on the instance but not forwarded here.
        """
        # NOTE(review): Cluster's contact_points expects a list of hosts;
        # confirm callers pass a list rather than a bare string.
        self.host = host
        # NOTE(review): port is stored but never passed to Cluster below, so
        # the driver's default port is used — confirm this is intended.
        self.port = int(port)
        self.database = database
        self.user = user
        self.password = password
        self.args = args
        self.kwargs = kwargs

    def connect(self, *args, **kwargs):
        """Build and return a ``Cluster`` configured with plain-text auth.

        Uses a constant 1s reconnection policy (max 10 attempts), the
        driver's default load-balancing policy, and protocol version 3.
        """
        auth = PlainTextAuthProvider(username=self.user, password=self.password)
        cluster = Cluster(
            contact_points=self.host,
            auth_provider=auth,
            protocol_version=3,
            load_balancing_policy=default_lbp_factory(),
            default_retry_policy=RetryPolicy(),
            reconnection_policy=ConstantReconnectionPolicy(delay=1, max_attempts=10),
            *args,
            **kwargs,
        )
        return cluster

    def session(self, *args, **kwargs):
        """Create a fresh cluster and open a session bound to the keyspace."""
        cluster = self.connect(*args, **kwargs)
        return cluster.connect(self.database)

    def closing_session(self, dryrun=False, *args, **kwargs):
        """Return a context-manager session.

        With ``dryrun=True`` a ``NullSession`` is returned that only logs
        statements; otherwise a real session wrapped in ``ClosingSession`` so
        both the session and its cluster are shut down on exit.
        """
        if dryrun:
            session = NullSession()
        else:
            real_session = self.session(*args, **kwargs)
            session = ClosingSession(real_session)
        return session

    def execute(self, query, parameters=None, timeout=20, retry=3):
        """Execute ``query`` with retries on driver read timeouts.

        A fresh session is opened and closed per call; only ``ReadTimeout``
        triggers a retry (see ``_retry_if_timeout``), up to ``retry`` attempts.
        """
        with self.closing_session() as session:
            retry_handler = retrying.Retrying(retry_on_exception=_retry_if_timeout, stop_max_attempt_number=retry)
            return retry_handler.call(_execute_query, session, query, parameters, timeout)

    def get_data_schema(self, result_set):
        """Derive a pigeon ``Schema`` from a driver result set's column metadata."""
        schema = Schema()
        for name, ctype in zip(result_set.column_names, result_set.column_types):
            ttype = self.to_canonical_type(ctype)
            schema.add_field_by_attrs(name, ttype)
        return schema

    def to_canonical_type(self, ctype):
        # Unknown CQL types degrade to STRING rather than failing.
        return self._cqltype_to_canonical_type.get(ctype, types.STRING)

    def from_canonical_type(self, canonical_type, size):
        # ``size`` is accepted for interface parity with other connectors but unused.
        return self._canonical_type_to_cqltype.get(canonical_type, cqltypes.UTF8Type.typename)
|
+
|
|
146
|
+
def _retry_if_timeout(exc):
    """Retry predicate for ``retrying``: retry only on Cassandra read timeouts."""
    exc_is_read_timeout = isinstance(exc, ReadTimeout)
    return exc_is_read_timeout
148
|
+
|
|
149
|
+
|
|
150
|
+
def _execute_query(session, query, parameters, timeout, *args, **kwargs):
|
|
151
|
+
return session.execute(query, parameters, timeout=timeout, *args, **kwargs)
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import functools
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
|
|
8
|
+
import cytoolz as toolz
|
|
9
|
+
import requests
|
|
10
|
+
from infi.clickhouse_orm import fields
|
|
11
|
+
from sqlalchemy_clickhouse import connector as clickhouse
|
|
12
|
+
|
|
13
|
+
from recurvedata.pigeon.connector._registry import register_connector_class
|
|
14
|
+
from recurvedata.pigeon.connector.dbapi import ClosingCursor, DBAPIConnector, NullCursor, _ShowTableLikeMixin
|
|
15
|
+
from recurvedata.pigeon.schema import types
|
|
16
|
+
from recurvedata.pigeon.utils import fs
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Patch sqlalchemy_clickhouse, use requests session (keep alive)
|
|
20
|
+
def _send(self, data, settings=None, stream=False):
|
|
21
|
+
if isinstance(data, str):
|
|
22
|
+
data = data.encode("utf-8")
|
|
23
|
+
if not hasattr(self, "_session"):
|
|
24
|
+
self._session = requests.session()
|
|
25
|
+
params = self._build_params(settings)
|
|
26
|
+
r = self._session.post(self.db_url, params=params, data=data, stream=stream)
|
|
27
|
+
if r.status_code != 200:
|
|
28
|
+
raise Exception(r.text)
|
|
29
|
+
return r
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Install the keep-alive _send replacement on sqlalchemy_clickhouse's Database.
clickhouse.Database._send = _send
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ParamEscaper(clickhouse.ParamEscaper):
    """Extends the sqlalchemy_clickhouse escaper with date/datetime support."""

    def escape_item(self, item):
        if item is None:
            return "NULL"
        if isinstance(item, (int, float)):
            return self.escape_number(item)
        if isinstance(item, str):
            return self.escape_string(item)
        if isinstance(item, datetime.date):
            # datetime.datetime subclasses datetime.date, so both land here.
            return self.escape_string(str(item))
        raise Exception("Unsupported object {}".format(item))


# Hack: sqlalchemy_clickhouse's stock ParamEscaper cannot handle date types,
# so swap in ours globally.
clickhouse._escaper = ParamEscaper()
|
|
51
|
+
|
|
52
|
+
# ClickHouse type name -> pigeon canonical type.
_clickhouse_type_to_canonical_type = {
    # pigeon defines no unsigned int types; map to a "wider" signed INT to prevent overflow
    "UInt8": types.INT16,
    "UInt16": types.INT32,
    "UInt32": types.INT64,
    "UInt64": types.INT64,  # NOTE(review): UInt64 values can still exceed INT64's range
    "Int8": types.INT8,
    "Int16": types.INT16,
    "Int32": types.INT32,
    "Int64": types.INT64,
    "Float32": types.FLOAT32,
    "Float64": types.FLOAT64,
    "String": types.STRING,
    "FixedString": types.STRING,
    "Date": types.DATE,
    "DateTime": types.DATETIME,
    "Enum": types.STRING,
    "Array": types.JSON,
}

# pigeon canonical type -> ClickHouse column type.
_canonical_type_to_clickhouse_type = {
    types.BOOLEAN: "UInt8",
    types.INT8: "Int8",
    types.INT16: "Int16",
    types.INT32: "Int32",
    types.INT64: "Int64",
    types.FLOAT32: "Float32",
    types.FLOAT64: "Float64",
    types.DATE: "Date",
    types.DATETIME: "DateTime",
    types.STRING: "String",
    types.JSON: "String",
}

# ClickHouse type name -> infi.clickhouse_orm field instance used for value
# conversion (see ClickHouseField.cast).
# NOTE(review): "filed" is a typo for "field"; the name is kept as-is because
# other modules may reference it.
# NOTE(review): there is no "DateTime" entry — DateTime values are
# special-cased in ClickHouseField.cast before this table is consulted.
_clickhouse_type_to_orm_filed = {
    "UInt8": fields.UInt8Field(),
    "UInt16": fields.UInt16Field(),
    "UInt32": fields.UInt32Field(),
    "UInt64": fields.UInt64Field(),
    "Int8": fields.Int8Field(),
    "Int16": fields.Int16Field(),
    "Int32": fields.Int32Field(),
    "Int64": fields.Int64Field(),
    "Float32": fields.Float32Field(),
    "Float64": fields.Float64Field(),
    "String": fields.StringField(),
    "Date": fields.DateField(),
}

# Patterns extracting the inner type of composite type codes; each exposes a
# named group "inner_type_code" (e.g. "Nullable(Int32)" -> "Int32").
nullable_type_p = re.compile(r"Nullable\((?P<inner_type_code>.*)\)")
array_type_p = re.compile(r"Array\((?P<inner_type_code>.*)\)")
low_cardinality_type_p = re.compile(r"LowCardinality\((?P<inner_type_code>.*)\)")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _format_sql(operation, parameters=None):
|
|
107
|
+
if parameters is None or not parameters:
|
|
108
|
+
sql = operation
|
|
109
|
+
else:
|
|
110
|
+
sql = operation % clickhouse._escaper.escape_args(parameters)
|
|
111
|
+
return sql
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class WrappedCursor(ClosingCursor):
    """Cursor wrapper that back-fills ``description`` for empty result sets."""

    @property
    def description(self):
        return self._description

    def execute(self, operation: str, parameters=None):
        expects_rows = self._determine_is_response(operation)
        self._cursor.execute(operation, parameters, expects_rows)

        # sqlalchemy-clickhouse's default query path yields no description
        # when the result set is empty; re-querying with FORMAT JSON
        # recovers the column metadata.
        self._description = self._cursor.description
        if expects_rows and not self._cursor.description:
            self._description = self._get_cursor_description(operation, parameters)

    def _determine_is_response(self, query: str):
        # Heuristic: treat the statement as row-returning (a SELECT) unless
        # a DDL/DML keyword appears anywhere in it.
        keywords = ["INSERT", "CREATE", "ALTER", "DROP", "RENAME", "SET", "KILL QUERY", "ATTACH", "DETACH"]
        return not any(
            re.search(f"\\b{kw}\\b", query, re.IGNORECASE) for kw in keywords
        )

    def _get_cursor_description(self, operation: str, parameters=None):
        json_query = _format_sql(operation, parameters) + " FORMAT JSON"
        raw = self._cursor._db.raw(json_query)
        meta = json.loads(raw)["meta"]
        # name, type_code, display_size, internal_size, precision, scale, null_ok
        return [(col["name"], col["type"], None, None, None, None, True) for col in meta]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class ClickHouseField(object):
|
|
150
|
+
    def __init__(self, name, type_code):
        """Wrap one column: its ``name`` plus the raw ClickHouse ``type_code``.

        ``type_code`` is the ClickHouse type string, e.g. ``Array(String)``.
        """
        self.name = name
        self.type_code = type_code  # the ClickHouse type, e.g. Array(String)

        # Composite types carry an inner element type; plain types do not.
        if self.is_array() or self.is_nullable() or self.is_low_cardinality():
            self.inner_type = self._infer_inner_type()
        else:
            self.inner_type = None
|
|
158
|
+
|
|
159
|
+
    def is_array(self):
        # Matches "Array(...)" type codes.
        return self.type_code.startswith("Array")

    def is_nullable(self):
        # Matches "Nullable(...)" type codes.
        return self.type_code.startswith("Nullable")

    def is_low_cardinality(self):
        # Matches "LowCardinality(...)" type codes.
        return self.type_code.startswith("LowCardinality")

    def is_int(self):
        # Integer check against the unwrapped (non-Nullable) type.
        return self._real_type in ["UInt8", "UInt16", "UInt32", "UInt64", "Int8", "Int16", "Int32", "Int64"]

    def is_float(self):
        # Float check against the unwrapped (non-Nullable) type.
        return self._real_type in ["Float32", "Float64"]

    def is_string(self):
        # String check against the unwrapped (non-Nullable) type.
        return self._real_type == "String"
|
|
176
|
+
|
|
177
|
+
@property
|
|
178
|
+
def _real_type(self):
|
|
179
|
+
if self.is_nullable():
|
|
180
|
+
return self.inner_type
|
|
181
|
+
return self.type_code
|
|
182
|
+
|
|
183
|
+
def _infer_inner_type(self):
|
|
184
|
+
if self.is_array():
|
|
185
|
+
return array_type_p.search(self.type_code).groupdict()["inner_type_code"]
|
|
186
|
+
if self.is_nullable():
|
|
187
|
+
return nullable_type_p.search(self.type_code).groupdict()["type_code"]
|
|
188
|
+
if self.is_low_cardinality():
|
|
189
|
+
return low_cardinality_type_p.search(self.type_code).groupdict()["inner_type_code"]
|
|
190
|
+
raise TypeError("No inner type, use type_code instead")
|
|
191
|
+
|
|
192
|
+
def cast(self, value):
|
|
193
|
+
if value is None:
|
|
194
|
+
if self.is_string():
|
|
195
|
+
return ""
|
|
196
|
+
else:
|
|
197
|
+
return None
|
|
198
|
+
|
|
199
|
+
if self.type_code == "DateTime":
|
|
200
|
+
value = self._convert_datetime(value)
|
|
201
|
+
return str(value)
|
|
202
|
+
|
|
203
|
+
if self.type_code == "Date":
|
|
204
|
+
return str(value)
|
|
205
|
+
if self.is_string():
|
|
206
|
+
return value
|
|
207
|
+
|
|
208
|
+
if self.is_int() or self.is_float():
|
|
209
|
+
if value == "":
|
|
210
|
+
return 0
|
|
211
|
+
return _clickhouse_type_to_orm_filed[self._real_type].to_python(value, timezone_in_use=None)
|
|
212
|
+
|
|
213
|
+
# 处理数组类型
|
|
214
|
+
if self.is_array():
|
|
215
|
+
if isinstance(value, str):
|
|
216
|
+
try:
|
|
217
|
+
value = json.loads(value)
|
|
218
|
+
except Exception:
|
|
219
|
+
value = []
|
|
220
|
+
|
|
221
|
+
if self.inner_type == "DateTime":
|
|
222
|
+
value = str(value)
|
|
223
|
+
value = [self._convert_datetime(x) for x in value]
|
|
224
|
+
inner = _clickhouse_type_to_orm_filed[self.inner_type]
|
|
225
|
+
return fields.ArrayField(inner).to_db_string(value)
|
|
226
|
+
|
|
227
|
+
# 其他类型,先不处理,需要的时候再说
|
|
228
|
+
return _clickhouse_type_to_orm_filed[self.type_code].to_db_string(value)
|
|
229
|
+
|
|
230
|
+
def _convert_datetime(self, value):
|
|
231
|
+
return str(value)
|
|
232
|
+
|
|
233
|
+
def __repr__(self):
|
|
234
|
+
return f"<ClickHouseField({repr(self.name)}, {repr(self.type_code)})>"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@register_connector_class(["clickhouse"])
class ClickHouseConnector(_ShowTableLikeMixin, DBAPIConnector):
    """Connector for ClickHouse over the HTTP interface (default port 8123).

    Bulk loading prefers the native ``clickhouse-client`` binary (native TCP
    port, default 9000) and falls back to batched INSERTs over HTTP.
    """

    _sqla_driver = "clickhouse"
    _default_port = 8123  # HTTP interface port
    _default_database = "default"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Native TCP port used by the clickhouse-client binary for bulk loads.
        self._tcp_port = self.kwargs.get("tcp_port", 9000)
        # Effective HTTP port, falling back to the default when none is given.
        self._http_port = self.port or self._default_port

    @toolz.memoize
    def connect_impl(self, autocommit=False, *args, **kwargs):
        # Bug fix: use the resolved HTTP port so a missing ``port`` falls back
        # to 8123 instead of producing "http://host:None".
        db_url = f"http://{self.host}:{self._http_port}"
        return clickhouse.connect(db_name=self.database, db_url=db_url, username=self.user, password=self.password)

    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, **kwargs):
        """Return a cursor; a no-op NullCursor when ``dryrun`` is set."""
        if dryrun:
            return NullCursor()
        return WrappedCursor(self.connect(autocommit))

    def is_clickhouse(self):
        return True

    @staticmethod
    def to_canonical_type(type_code, size):
        """Map a raw ClickHouse type string to the canonical type system."""
        if "nullable" in type_code.lower():
            type_code = nullable_type_p.search(type_code).groupdict()["inner_type_code"]
        if "lowcardinality" in type_code.lower():
            type_code = low_cardinality_type_p.search(type_code).groupdict()["inner_type_code"]
        if "FixedString" in type_code:
            # FixedString(N) -> FixedString, dropping the length parameter.
            type_code = "FixedString"
        if "Array" in type_code:
            type_code = "Array"
        return _clickhouse_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Map a canonical type to a ClickHouse type, defaulting to String."""
        return _canonical_type_to_clickhouse_type.get(canonical_type, "String")

    def generate_create_table_ddl(self, name, schema, **kwargs):
        """Generate a CREATE TABLE statement from *schema*.

        The table engine may be passed via ``kwargs["ENGINE"]``; defaults to
        ``Log``. Every column is wrapped in ``Nullable(...)``.
        """
        cols = []
        for f in schema:
            col_name = self.quote_identifier(f.name)
            if f.comment:
                cols.append(f"{col_name} Nullable({self.from_canonical_type(f.type, f.size)}) COMMENT {f.comment!r}")
            else:
                cols.append(f"{col_name} Nullable({self.from_canonical_type(f.type, f.size)})")

        col_types = ",\n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n)"

        # Table Engines: https://clickhouse.yandex/docs/en/operations/table_engines/
        engine = kwargs.get("ENGINE", "Log")
        ddl += f" ENGINE = {engine}"
        return ddl

    def load_csv(
        self,
        table,
        filename,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        using_insert=False,
        **kwargs,
    ):
        """Load CSV file to ClickHouse table, support both batch INSERT by Python and clickhouse-client binary"""
        infile = filename
        if skiprows:
            # Strip the leading rows into a temporary copy; the loaders below
            # must read that copy (bug fix: they previously read the original
            # file, silently ignoring ``skiprows``).
            infile = fs.skip_lines(filename, skiprows)

        clickhouse_client_binary = shutil.which("clickhouse-client")
        try_clickhouse_client = (not using_insert) and clickhouse_client_binary
        if try_clickhouse_client:
            self.logger.info("found clickhouse-client in %s, try to load file using it", clickhouse_client_binary)
            self._load_csv_by_clickhouse_client(clickhouse_client_binary, table, infile, delimiter)
        else:
            # fallback to perform INSERT
            self._load_csv_by_inserting(table, infile, delimiter, quotechar, lineterminator, escapechar, **kwargs)

        if infile != filename:
            fs.remove_files_safely(infile)

    def _load_csv_by_clickhouse_client(self, binary, table, filename, delimiter=","):
        """Stream *filename* into *table* using the clickhouse-client binary."""
        if "." not in table:
            table = f"{self.database}.{table}"
        # NOTE(review): host/user/password/filename are interpolated into a
        # shell string and the full command (including the password) is
        # logged below — confirm inputs are trusted, or switch to an argument
        # list without shell=True and redact the password from the log.
        command = " ".join(
            [
                binary,
                f"--host {self.host}",
                f"--port {self._tcp_port}",
                f"--user {self.user}",
                f"--password {self.password}",
                f'--format_csv_delimiter="{delimiter}"',
                # Bug fix: redirect stdin from the CSV file being loaded.
                f'--query="INSERT INTO {table} FORMAT CSV" < {filename}',
            ]
        )
        self.logger.info(command)
        subprocess.check_call(command, shell=True)

    def _load_csv_by_inserting(self, table, filename, delimiter, quotechar, lineterminator, escapechar, **kwargs):
        """Load the CSV via batched INSERT statements over the connection.

        https://clickhouse.yandex/docs/en/query_language/insert_into/
        Performance considerations:
        INSERT sorts the input data by primary key and splits them into
        partitions by a partition key. Inserting into several partitions at
        once can significantly reduce performance. To avoid this:
        - Add data in fairly large batches, such as 100,000 rows at a time.
        - Group data by month before uploading it to ClickHouse.
        """
        batch_size = kwargs.get("batch_size") or 10000

        # https://clickhouse.yandex/docs/en/single/#strong-typing
        columns = self._get_columns_with_type(table)
        values_hook = functools.partial(self._handle_row, columns=columns)
        column_names = [x.name for x in columns]

        self.logger.info("columns: %s", columns)
        self.logger.info("batch size: %s", batch_size)
        self.load_csv_by_inserting(
            table=table,
            filename=filename,
            columns=column_names,
            delimiter=delimiter,
            quotechar=quotechar,
            lineterminator=lineterminator,
            escapechar=escapechar,
            skiprows=0,
            batch_size=batch_size,
            values_hook=values_hook,
            concurrency=kwargs.get("concurrency", 1),
        )

    def _handle_row(self, row, columns):
        """Cast each raw CSV cell through its column's type-aware cast()."""
        rv = []
        for col, value in zip(columns, row):
            rv.append(col.cast(value))
        return tuple(rv)

    def _get_columns_with_type(self, table):
        """Fetch (name, type) for every column of *table* via a LIMIT 0 query."""
        with self.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(self.database), self.quote_identifier(table))
            )
            cursor.fetchall()
            cols = [ClickHouseField(x[0], x[1]) for x in cursor.description]
            return cols

    def generate_ddl(self, table, database=None, if_exists=True):
        """Return the CREATE TABLE DDL of an existing table.

        Raises:
            ValueError: if the table does not exist in *database*.
        """
        if database is None:
            database = self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} not exists in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"SHOW CREATE TABLE {database}.{table}")
            if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
            # Re-prefix the statement body so IF NOT EXISTS can be injected.
            body = re.search(r"CREATE TABLE (.*)", cursor.fetchall()[0][1], flags=re.S).group(1)
            return f"CREATE TABLE{if_exists_stmt}{body}"
|