recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic. Click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import subprocess
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
from recurvedata.pigeon.connector._registry import register_connector_class
|
|
6
|
+
from recurvedata.pigeon.connector.mysql import MySQLConnector
|
|
7
|
+
from recurvedata.pigeon.schema import types
|
|
8
|
+
|
|
9
|
+
# Canonical (pigeon) column type -> Doris column type used when generating DDL.
# Types missing from this table are handled by the caller's fallback.
_canonical_type_to_doris_type = {
    # boolean and integer types
    types.BOOLEAN: "TINYINT",
    types.INT8: "TINYINT",
    types.INT16: "SMALLINT",
    types.INT32: "INT",
    types.INT64: "BIGINT",
    # floating point types
    types.FLOAT32: "FLOAT",
    types.FLOAT64: "DOUBLE",
    # temporal types
    types.DATE: "DATE",
    types.DATETIME: "DATETIME",
    # textual types (JSON is stored as plain text)
    types.STRING: "STRING",
    types.JSON: "STRING",
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@register_connector_class(["doris"])
class DorisConnector(MySQLConnector):
    """Doris connector built on top of ``MySQLConnector``.

    SQL access is inherited from the MySQL connector; CSV loading is done by
    sending the file to the ``_stream_load`` HTTP endpoint through ``curl``.
    """

    _sqla_driver = "doris+pymysql"
    _default_port = 9030
    # HTTP port used to build the stream load URL when none is given.
    _default_fe_http_port = 8030

    def __init__(self, host, port=None, http_port=None, database=None, user=None, password=None, *args, **kwargs):
        """Create the connector.

        Args:
            host: Doris host name.
            port: SQL port, forwarded to the parent connector.
            http_port: HTTP port for stream load; falls back to
                ``_default_fe_http_port`` when falsy.
            database: Default database, forwarded to the parent connector.
            user: User name, forwarded to the parent connector.
            password: Password, forwarded to the parent connector.
            *args: Extra positional arguments forwarded to the parent.
            **kwargs: Extra keyword arguments forwarded to the parent.
        """
        self.http_port = http_port or self._default_fe_http_port
        # NOTE(review): the named arguments are passed by keyword, so any extra
        # positional ``args`` may collide with them in the parent signature -
        # confirm against MySQLConnector.__init__.
        super().__init__(host=host, port=port, database=database, user=user, password=password, *args, **kwargs)

    @property
    def load_strict_mode(self):
        """Whether stream load runs in strict mode (``False`` until set)."""
        return getattr(self, "_load_strict_mode", False)

    @load_strict_mode.setter
    def load_strict_mode(self, mode: bool):
        self._load_strict_mode = mode

    @property
    def max_filter_ratio(self):
        """Value sent as the ``max_filter_ratio`` stream load header (``0`` until set)."""
        return getattr(self, "_max_filter_ratio", 0)

    @max_filter_ratio.setter
    def max_filter_ratio(self, ratio: float):
        # Clamp the ratio into the closed interval [0, 1].
        if ratio < 0:
            self._max_filter_ratio = 0
        elif ratio > 1:
            self._max_filter_ratio = 1
        else:
            self._max_filter_ratio = ratio

    def has_table(self, table, database=None, cursor=None, **kwargs):
        """Check whether ``table`` exists, retrying before giving up.

        The parent check is attempted up to 6 times, sleeping 1, 4, 9, 16 and
        25 seconds between attempts, to wait for a table to be created and
        become visible.

        Returns:
            ``True`` as soon as the parent check succeeds, ``False`` if every
            attempt fails.
        """
        retry_num = 6
        for attempt in range(retry_num):
            if super().has_table(table, database, cursor, **kwargs):
                return True
            if attempt < retry_num - 1:
                # quadratic back-off; no sleep after the final attempt
                time.sleep((attempt + 1) ** 2)
        return False

    def _load_csv_mysql(
        self,
        table,
        filename,
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        """Stream load a CSV file into ``table``.

        Args:
            table: Target table, either ``table`` or ``database.table``; a bare
                table name uses ``self.database``.
            filename: Path of the CSV file to upload.
            columns, delimiter, quotechar, lineterminator, escapechar, skiprows,
            **kwargs: Accepted for signature compatibility; this implementation
                does not read them (the stream load headers are fixed in
                ``_format_load_shell``).

        Raises:
            Exception: If the stream load response status is not ``"Success"``.
            ValueError: If ``table`` contains more than one ``"."``.
        """
        name_parts = table.split(".")
        if len(name_parts) == 1:
            db_name, table_name = self.database, table
        else:
            db_name, table_name = name_parts
        shell_cmd = self._format_load_shell(filename, db_name, table_name)

        # Credentials reach curl through the environment rather than the
        # command line. Only these variables are set for the child process.
        _env = {}
        if self.user is not None:
            _env["DORIS_USER"] = self.user
        if self.password is not None:
            _env["DORIS_PASSWORD"] = self.password

        output = subprocess.check_output(shell_cmd, env=_env, shell=True)
        res_txt = output.decode()
        # Log the raw response before parsing so it is kept even if it is not JSON.
        self._log(res_txt)
        res = json.loads(res_txt)

        if res.get("Status") != "Success":
            message = res.get("Message", res_txt)
            if "ErrorURL" not in res:
                err_output = message
            else:
                try:
                    err_output = subprocess.check_output(["curl", res["ErrorURL"]])
                except (subprocess.CalledProcessError, OSError) as exc:
                    # Failing to fetch the details must not hide the load failure.
                    err_output = f"{message} (could not fetch error details: {exc})"
            self._log(f"error: {err_output}")
            raise Exception("load csv failed")

    def _format_load_shell(self, filename: str, db_name: str, table_name: str) -> str:
        """Format the curl command for Doris stream load.

        Every dynamic part (headers, file path, URL) is shell-quoted, and the
        credentials are expanded from ``$DORIS_USER`` / ``$DORIS_PASSWORD``
        inside double quotes so their values are never word-split or globbed.

        Args:
            filename: Path to the CSV file to load
            db_name: Target database name (surrounding backticks are stripped)
            table_name: Target table name (surrounding backticks are stripped)

        Returns:
            Formatted curl command string for stream loading data
        """
        import shlex

        # Clean table and db names
        db_name = db_name.strip("`")
        table_name = table_name.strip("`")

        url = f"http://{self.host}:{self.http_port}/api/{db_name}/{table_name}/_stream_load"
        strict_mode = "true" if self.load_strict_mode else "false"
        column_mapping = "columns: " + ",".join(self.get_columns(table_name, db_name))

        headers = [
            "Expect:100-continue",
            f"max_filter_ratio:{self.max_filter_ratio}",
            "column_separator:,",
            'enclose:"',
            "trim_double_quotes:true",
            f"strict_mode:{strict_mode}",
            "escape:'",
            column_mapping,
        ]
        header_args = " ".join(f"-H {shlex.quote(header)}" for header in headers)

        return (
            'curl --location-trusted -u "$DORIS_USER:$DORIS_PASSWORD" '
            f"{header_args} "
            f"-T {shlex.quote(str(filename))} -XPUT "
            f"{shlex.quote(url)}"
        )

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Map a canonical type to a Doris column type.

        Args:
            canonical_type: One of the canonical ``types`` constants.
            size: Unused; kept for interface compatibility.

        Returns:
            The Doris type name, defaulting to ``"STRING"`` for unmapped types.
        """
        return _canonical_type_to_doris_type.get(canonical_type, "STRING")
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import hashlib
|
|
3
|
+
import pickle
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
|
|
6
|
+
from elasticsearch import Elasticsearch, helpers
|
|
7
|
+
from elasticsearch.exceptions import NotFoundError
|
|
8
|
+
|
|
9
|
+
from recurvedata.pigeon.connector._registry import register_connector_class
|
|
10
|
+
from recurvedata.pigeon.csv import CSV
|
|
11
|
+
from recurvedata.pigeon.schema import Schema, types
|
|
12
|
+
from recurvedata.pigeon.utils import LoggingMixin, ensure_str_list, replace_null_values
|
|
13
|
+
|
|
14
|
+
# https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html#_field_datatypes
|
|
15
|
+
# Elasticsearch field datatype -> canonical (pigeon) type.
_es_type_to_canonical_type = {
    "boolean": types.BOOLEAN,
    # integer types
    "byte": types.INT8,
    "short": types.INT16,
    "integer": types.INT32,
    "long": types.INT64,
    # floating point types
    "half_float": types.FLOAT32,
    "float": types.FLOAT32,
    "double": types.FLOAT64,
    "scaled_float": types.FLOAT64,
    # temporal types
    "date": types.DATETIME,
    # textual types
    "text": types.STRING,
    "keyword": types.STRING,
    "ip": types.STRING,
    # structured values are represented as strings
    "object": types.STRING,
    "nested": types.STRING,
}
|
|
32
|
+
|
|
33
|
+
# Canonical (pigeon) type -> Elasticsearch field datatype.
_canonical_type_to_es_type = {
    types.BOOLEAN: "boolean",
    types.INT8: "byte",
    types.INT16: "short",
    types.INT32: "integer",
    types.INT64: "long",
    types.FLOAT32: "float",
    types.FLOAT64: "double",
    # DATE previously had no entry, so looking it up raised KeyError.
    types.DATE: "date",
    types.DATETIME: "date",
    types.STRING: "text",
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@register_connector_class(["es", "elasticsearch"])
class ElasticSearchConnector(LoggingMixin):
    """Thin wrapper around an ``Elasticsearch`` client for scanning, schema lookup and CSV loading."""

    def __init__(self, host, **kwargs):
        """Create the client.

        Args:
            host: Passed to ``Elasticsearch`` as its first argument.
            **kwargs: Forwarded unchanged to ``Elasticsearch``.
        """
        self.host = host
        self._es = Elasticsearch(self.host, **kwargs)

    def scan(self, query=None, index=None, doc_type=None, fields=None, **search_kwargs):
        """Iterate over documents matching ``query`` using ``helpers.scan``.

        Args:
            query: A query-string expression (``str``), which is wrapped in a
                ``query_string`` query, or any other value, which is passed
                through as the request body.
            index: Index to search.
            doc_type: Document type to search.
            fields: If truthy, sent as ``_source_include``.
            **search_kwargs: Extra keyword arguments for ``helpers.scan``;
                ``index`` and ``doc_type`` given here are overridden.

        Returns:
            The iterator returned by ``helpers.scan``.
        """
        if isinstance(query, str):
            real_query = {"query": {"query_string": {"query": query}}}
        else:
            real_query = query

        params = {**search_kwargs, "index": index, "doc_type": doc_type}
        if fields:
            params["_source_include"] = fields
        return helpers.scan(self._es, query=real_query, **params)

    def get_mapping(self, index, doc_type):
        """Return the ``properties`` mapping of ``index``.

        Args:
            index: Index name.
            doc_type: Document type; when ``None`` the first type found in the
                mapping is used.

        Returns:
            The ``properties`` dict, or ``None`` if the lookup raised
            ``NotFoundError`` (the error is logged).
        """
        try:
            result = self._es.indices.get_mapping(index=index, doc_type=doc_type)
        except NotFoundError as e:
            self.logger.error(str(e))
            return None

        # NOTE(review): this expects a ``{doc_type: {"properties": ...}}``
        # layout under "mappings" - confirm against the server version in use.
        mappings = list(result.values())[0]["mappings"]
        if doc_type is not None:
            return mappings[doc_type]["properties"]
        return list(mappings.values())[0]["properties"]

    def get_schema(self, index, doc_type):
        """Build a ``Schema`` from the index mapping.

        Fields without an explicit ``type`` are treated as ``text``.

        Returns:
            The schema; it has no fields when the mapping could not be fetched
            (``get_mapping`` returned ``None``).
        """
        schema = Schema()
        mapping = self.get_mapping(index, doc_type)
        if mapping is None:
            # get_mapping already logged the error; avoid AttributeError on None.
            return schema

        for name, attrs in mapping.items():
            es_type = attrs.get("type", "text").lower()
            schema.add_field_by_attrs(name, self.to_canonical_type(es_type))
        return schema

    @staticmethod
    def get_meta_field_type(name):
        """Return the canonical type of a meta field.

        Raises:
            KeyError: If ``name`` is not ``_index``, ``_type``, ``_id`` or ``_score``.
        """
        return {
            "_index": types.STRING,
            "_type": types.STRING,
            "_id": types.STRING,
            "_score": types.FLOAT64,
        }[name]

    @staticmethod
    def to_canonical_type(es_type):
        """Map an Elasticsearch type name to a canonical type (``STRING`` if unknown)."""
        return _es_type_to_canonical_type.get(es_type, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type):
        """Map a canonical type to an Elasticsearch type name.

        Unmapped canonical types fall back to ``"text"``, mirroring the
        ``STRING`` fallback of ``to_canonical_type``.
        """
        return _canonical_type_to_es_type.get(canonical_type, "text")

    def load_csv(
        self,
        filename,
        index,
        doc_type="_doc",
        schema=None,
        id_field=None,
        generate_id=False,
        null_values=("NULL", r"\N"),
        null_replacer=None,
        **csv_options,
    ):
        """Bulk-index the rows of a CSV file with ``helpers.parallel_bulk``.

        Args:
            filename: CSV file to read; it must have a header row.
            index: Target index.
            doc_type: Value used as ``_type`` for every action.
            schema: Optional schema; fields found in it are cast with their
                ``cast`` method (string fields are left untouched).
            id_field: Column name(s) used to build ``_id``. One column is used
                as is; several columns are hashed with ``encode_id``.
            generate_id: When true, ``_id`` is the hash of all row values; this
                takes precedence over ``id_field``.
            null_values: Passed to ``replace_null_values``.
            null_replacer: Passed to ``replace_null_values``.
            **csv_options: Forwarded to ``CSV``.

        Raises:
            ValueError: If the file has no header, or ``id_field`` names a
                column missing from the header.
        """
        csv_proxy = CSV(filename, **csv_options)
        if not csv_proxy.has_header:
            raise ValueError(f"missing header in CSV file {filename}")

        # ensure id fields are present in header
        if id_field:
            fields = ensure_str_list(id_field)
            if not all(x in csv_proxy.header for x in fields):
                raise ValueError(f"{id_field} is invalid, only {csv_proxy.header} are support")
        else:
            fields = None

        typed_fields = {x.name: x for x in schema.fields} if schema is not None else {}

        def actions_generator():
            counters = defaultdict(int)
            with csv_proxy.reader(as_dict=True) as reader:
                for doc in reader:
                    doc = replace_null_values(doc, null_values, null_replacer)
                    doc = self.values_hook(doc, typed_fields)

                    action = {"_index": index, "_type": doc_type, "_source": doc}

                    if fields:
                        if len(fields) == 1:
                            action["_id"] = doc[fields[0]]
                        else:
                            action["_id"] = self.encode_id([doc[x] for x in fields])
                    if generate_id:
                        # overrides any id derived from id_field
                        action["_id"] = self.encode_id(doc.values())

                    counters["rows_read"] += 1
                    counters["rows_yield"] += 1
                    if counters["rows_yield"] % 10000 == 0:
                        self.logger.info("progress: %s", counters)

                    yield action

        # parallel_bulk is lazy: drain it so the actions are actually sent
        for _ in helpers.parallel_bulk(
            self._es, actions=actions_generator(), thread_count=8, chunk_size=1000, queue_size=8
        ):
            pass

    @staticmethod
    def encode_id(values):
        """Return a URL-safe base64 SHA-1 digest of the pickled ``values`` tuple.

        NOTE(review): the digest depends on the pickle byte stream, so ids may
        not be stable across Python versions or pickle protocols - verify
        before relying on them for upserts across environments.
        """
        content = pickle.dumps(tuple(values))
        return base64.urlsafe_b64encode(hashlib.sha1(content).digest()).decode()

    @staticmethod
    def values_hook(doc: dict, typed_fields: dict):
        """Cast the values of ``doc`` in place using ``typed_fields``.

        Keys without a typed field, and string-typed fields, are left as is.

        Returns:
            The same ``doc`` object.
        """
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for k, v in doc.items():
            field = typed_fields.get(k)
            if field is None or field.type == types.STRING:
                continue
            doc[k] = field.cast(v)
        return doc
|