recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/loader/csv_to_starrocks.py
@@ -0,0 +1,233 @@

import time
from typing import TYPE_CHECKING, Any, List, Optional

from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_mysql import CSVToMySQLLoader
from recurvedata.pigeon.utils import md5hash, randomized_suffix
from recurvedata.pigeon.utils.sql import bak_table_of, reconcile_table_of, staging_table_of

if TYPE_CHECKING:
    from recurvedata.pigeon.connector.starrocks import StarRocksConnector

allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)


class CSVToStarRocksLoader(CSVToMySQLLoader):
    def __init__(
        self,
        database: str,
        table: str,
        filename: str,
        connector: Optional["StarRocksConnector"] = None,
        create_table_ddl: Optional[str] = None,
        mode: str = const.LOAD_OVERWRITE,
        primary_keys: Optional[List[str]] = None,
        skiprows: int = 0,
        columns: Optional[List[str]] = None,
        using_insert: bool = False,
        insert_batch_size: int = 1000,
        insert_concurrency: int = 1,
        delete_file: bool = False,
        pre_queries: Optional[List[str]] = None,
        post_queries: Optional[List[str]] = None,
        load_strict_mode: bool = False,
        *args: Any,
        **kwargs: Any,
    ):
        if not connector:
            raise ValueError(f"connector is required for {self.__class__.__name__}")
        self.load_strict_mode: bool = load_strict_mode
        connector.load_strict_mode = load_strict_mode
        self.logger.info(f"load_strict_mode: {load_strict_mode}")
        # The same incoming filename yields the same intermediate table names,
        # so intermediate tables left over from a failed run are cleaned up on retry.
        table_suffix: str = md5hash(filename)[:6] if filename is not None else randomized_suffix()
        self.__staging_table: str = staging_table_of(table) + "_" + table_suffix
        self.__reconcile_table: str = reconcile_table_of(table) + "_" + table_suffix
        self.__bak_table: str = bak_table_of(table) + "_" + table_suffix
        if any(
            [
                len(self.__staging_table) > 64,
                len(self.__reconcile_table) > 64,
                len(self.__bak_table) > 64,
            ]
        ):
            self.logger.error(
                f"table name {self.__staging_table}'s length: {len(self.__staging_table)}\n"
                f"table name {self.__reconcile_table}'s length: {len(self.__reconcile_table)}\n"
                f"table name {self.__bak_table}'s length: {len(self.__bak_table)}\n"
            )
            raise ValueError("length of intermediate table name is greater than 64!")
        super().__init__(
            database=database,
            table=table,
            filename=filename,
            connector=connector,
            create_table_ddl=create_table_ddl,
            mode=mode,
            primary_keys=primary_keys,
            skiprows=skiprows,
            columns=columns,
            using_insert=using_insert,
            insert_batch_size=insert_batch_size,
            insert_concurrency=insert_concurrency,
            delete_file=delete_file,
            pre_queries=pre_queries,
            post_queries=post_queries,
            *args,
            **kwargs,
        )

    @property
    def staging_table(self) -> str:
        """
        Overridden property; returns the staging table name with the per-run suffix.
        """
        return self.__staging_table

    def _merge_into_target_table(self) -> None:
        queries = []
        if self.mode == const.LOAD_MERGE:
            queries.extend(self._ingest_by_merging())
        elif self.mode == const.LOAD_OVERWRITE:
            bak_table = self.__bak_table
            queries.extend(
                [
                    f"DROP TABLE IF EXISTS {bak_table}",
                    f"ALTER TABLE {self.table} RENAME {bak_table}",
                    f"ALTER TABLE {self.staging_table} RENAME {self.table}",
                    f"DROP TABLE IF EXISTS {bak_table}",
                ]
            )
        else:
            # Special handling for `APPEND` mode, because an occasional error happens:
            # ================================== ERROR MSG START ======================================
            # pymysql.err.ProgrammingError: (1064, 'Unexpected exception: Failed to drop table {table_name}.
            # msg: There are still some transactions in the COMMITTED state waiting to be completed.
            # The table {table_name} cannot be dropped. If you want to forcibly drop(cannot be recovered),
            # please use "DROP TABLE <table> FORCE".')
            # ================================== ERROR MSG END ========================================
            # The optimization: commit the INSERT statement first, blocking until it finishes.
            queries.append(f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}")
            self.connector.execute(self.pre_queries + queries, autocommit=True, commit_on_close=False)

            queries.clear()
            queries.append(f"DROP TABLE {self.staging_table}")
            self.connector.execute(queries + self.post_queries, autocommit=True, commit_on_close=False)
            return

        queries = self.pre_queries + queries + self.post_queries
        self.logger.info("running StarRocks queries within a transaction")
        self.connector.execute(queries, autocommit=False, commit_on_close=True)

    def _ingest_by_merging(self) -> List[str]:
        """Merge with deduplication based on primary keys, using StarRocks-compatible syntax."""
        # First, deduplicate the staging table on the primary keys using a window function.
        pk_columns = ", ".join(self.primary_keys)

        # Get all columns from the staging table (excluding the rn column added below).
        cols = self.connector.get_columns(self.staging_table)
        cols_str = ", ".join(self.connector.quote_identifier(x) for x in cols)

        # Create a temporary table with deduplicated data.
        tmp_table = f"{self.staging_table}_dedup"
        dedup_sql = f"""
        DROP TABLE IF EXISTS {tmp_table};
        CREATE TABLE {tmp_table} LIKE {self.staging_table};
        INSERT INTO {tmp_table}
        SELECT {cols_str} FROM (
            SELECT *, ROW_NUMBER() OVER(PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
            FROM {self.staging_table}
        ) t
        WHERE rn = 1;
        """

        # Replace the staging table with the deduplicated data.
        replace_sql = f"""
        DROP TABLE {self.staging_table};
        ALTER TABLE {tmp_table} RENAME {self.staging_table};
        """

        # Simple merge: back up the target table, then merge and deduplicate in one step.
        bak = self.__bak_table
        table = self.connector.quote_identifier(self.table)
        staging = self.connector.quote_identifier(self.staging_table)
        bak = self.connector.quote_identifier(bak)

        # Simple and efficient merge: backup + merge + deduplicate in one operation.
        merge_sql = f"""
        -- Back up the target table
        DROP TABLE IF EXISTS {bak};
        ALTER TABLE {table} RENAME {bak};

        -- Create the new target table and insert the deduplicated, merged data in one step
        CREATE TABLE {table} AS
        SELECT {cols_str} FROM (
            SELECT *, ROW_NUMBER() OVER(PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
            FROM (
                SELECT * FROM {bak}
                UNION ALL
                SELECT * FROM {staging}
            ) combined
        ) t WHERE rn = 1;

        -- Clean up
        DROP TABLE {bak};
        DROP TABLE {staging};
        """

        return [dedup_sql, replace_sql, merge_sql]

    def execute(self) -> None:
        """
        Overridden method; wraps execution in try/except so intermediate tables
        are cleaned up on failure.
        """
        self.before_execute()
        try:
            self.execute_impl()
        except Exception:
            self.handle_exception()
            raise
        self.after_execute()

    def _prepare_staging_table(self):
        queries = """
        DROP TABLE IF EXISTS {staging};
        CREATE TABLE {staging} LIKE {table};
        """.format(
            staging=self.staging_table, table=self.table
        )
        self.connector.execute(queries, autocommit=True)
        time.sleep(5)  # wait for the table to be created and become visible

    def handle_exception(self) -> None:
        """
        Ensure all intermediate tables are cleaned up safely after an exception is caught.
        """
        qry_exists_sql = """
        SELECT 1 FROM information_schema.tables
        WHERE table_schema = '{database}' AND table_name = '{table}';
        """
        is_table_exists = self.connector.fetchall(qry_exists_sql.format(database=self.database, table=self.table))
        is_bak_exists = self.connector.fetchall(qry_exists_sql.format(database=self.database, table=self.__bak_table))
        if is_table_exists:
            # The target table exists: drop the intermediate tables directly.
            queries = [
                f"DROP TABLE IF EXISTS {self.__bak_table}",
                f"DROP TABLE IF EXISTS {self.__staging_table}",
                f"DROP TABLE IF EXISTS {self.__reconcile_table}",
            ]
        elif is_bak_exists:
            # Roll back from bak_table.
            queries = [
                f"ALTER TABLE {self.__bak_table} RENAME {self.table}",
                f"DROP TABLE IF EXISTS {self.__staging_table}",
                f"DROP TABLE IF EXISTS {self.__reconcile_table}",
            ]
        else:
            queries = [
                f"DROP TABLE IF EXISTS {self.__staging_table}",
                f"DROP TABLE IF EXISTS {self.__reconcile_table}",
            ]
        self.connector.execute(queries, autocommit=False, commit_on_close=True)
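For orientation, a minimal sketch of driving the loader above. The StarRocksConnector constructor arguments are illustrative assumptions (its signature is not shown in this diff); the loader's own parameters are taken from the code above.

from recurvedata.pigeon import const
from recurvedata.pigeon.connector.starrocks import StarRocksConnector
from recurvedata.pigeon.loader.csv_to_starrocks import CSVToStarRocksLoader

# Assumed connector construction; the real StarRocksConnector signature may differ.
connector = StarRocksConnector(host="127.0.0.1", port=9030, user="root", password="")

loader = CSVToStarRocksLoader(
    database="analytics",
    table="orders",
    filename="/tmp/orders.csv",  # also seeds the deterministic suffix of the intermediate tables
    connector=connector,
    mode=const.LOAD_MERGE,       # one of LOAD_OVERWRITE / LOAD_MERGE / LOAD_APPEND
    primary_keys=["order_id"],   # required by the MERGE deduplication queries
)
loader.execute()                 # cleans up intermediate tables via handle_exception() on failure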
recurvedata/pigeon/meta.py
@@ -0,0 +1,116 @@

import datetime
import json

import cytoolz as toolz

from recurvedata.pigeon.schema import Schema


class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime.date):
            return o.isoformat()
        if isinstance(o, datetime.timedelta):
            return str(o)
        if isinstance(o, Schema):
            return o.to_list()
        return super().default(o)


class Meta(object):
    def to_dict(self):
        raise NotImplementedError()

    def to_json(self, **kwargs):
        params = toolz.merge({"sort_keys": True, "ensure_ascii": False, "cls": JSONEncoder}, kwargs)
        return json.dumps(self.to_dict(), **params)


class HandlerMeta(Meta):
    def __init__(self):
        self.reset()

    def reset(self):
        self.num_input_rows = 0
        self.num_output_rows = 0
        self.num_error_rows = 0
        self.error_log_size = 0  # number of characters of error-row strings logged so far

    def to_dict(self):
        return {
            "num_input_rows": self.num_input_rows,
            "num_output_rows": self.num_output_rows,
            "num_error_rows": self.num_error_rows,
            "error_log_size": self.error_log_size,
        }


class HandlerFactoryMeta(HandlerMeta):
    def __init__(self, name):
        self.name = name
        super().__init__()

    def update(self, handler_meta):
        self.num_input_rows += handler_meta.num_input_rows
        self.num_output_rows += handler_meta.num_output_rows
        self.num_error_rows += handler_meta.num_error_rows
        self.error_log_size += handler_meta.error_log_size

    def to_dict(self):
        d = super().to_dict()
        d["name"] = self.name
        return d


class DumperWorkerMeta(Meta):
    def __init__(self):
        self.num_dumped_rows = 0
        self.schema = None
        self.handlers_meta = None

    def to_dict(self):
        return {
            "num_dumped_rows": self.num_dumped_rows,
            "schema": self.schema,
            "handlers_meta": [x.to_dict() for x in self.handlers_meta],
        }


class DumperMeta(Meta):
    def __init__(self, context=None):
        self.time_start = None
        self.time_finish = None
        self.num_dumped_rows = 0
        self.context = context
        self.schema = None
        self.handlers_meta = []

    def mark_start(self):
        self.time_start = datetime.datetime.now()

    def mark_finish(self):
        self.time_finish = datetime.datetime.now()

    @property
    def rows_per_second(self):
        return self.num_dumped_rows / (self.time_finish - self.time_start).total_seconds()

    @property
    def duration(self):
        if not self.time_start:
            return None
        if not self.time_finish:
            return datetime.datetime.now() - self.time_start
        return self.time_finish - self.time_start

    def to_dict(self):
        return {
            "time_start": self.time_start,
            "time_finish": self.time_finish,
            "time_duration": self.duration,
            "num_dumped_rows": self.num_dumped_rows,
            "rows_per_second": self.rows_per_second,
            "context": self.context,
            "schema": self.schema,
            "handlers_meta": [x.to_dict() for x in self.handlers_meta],
        }
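Taken together, a dumper run fills in a DumperMeta and aggregates per-handler counters. A small illustrative sketch (the row counts are made up):

from recurvedata.pigeon.meta import DumperMeta, HandlerFactoryMeta

meta = DumperMeta(context={"source": "mysql"})  # context is free-form
meta.mark_start()

handler = HandlerFactoryMeta("csv")
handler.num_input_rows = handler.num_output_rows = 10000  # illustrative counts
meta.handlers_meta.append(handler)
meta.num_dumped_rows = 10000

meta.mark_finish()
print(meta.rows_per_second)    # num_dumped_rows / elapsed seconds
print(meta.to_json(indent=2))  # datetimes, timedeltas and Schema handled by the custom JSONEncoder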
recurvedata/pigeon/row_factory.py
@@ -0,0 +1,42 @@

import logging
from collections import OrderedDict

from recurvedata.pigeon.utils.keyed_tuple import KeyedTuple

logger = logging.getLogger(__name__)


def tuple_factory(colnames, row):
    """Returns each row as a tuple"""
    return row


def keyed_tuple_factory(colnames, row):
    return KeyedTuple(row, colnames)


def dict_factory(colnames, row):
    return dict(zip(colnames, row))


def ordered_dict_factory(colnames, row):
    return OrderedDict(zip(colnames, row))


def get_row_keys(row):
    if isinstance(row, dict):
        # created by dict_factory or ordered_dict_factory
        return list(row.keys())
    if hasattr(row, "_fields"):
        # created by keyed_tuple_factory
        return list(row._fields)
    else:
        # created by tuple_factory, which is not able to know the keys
        return None


def get_row_values(row):
    if isinstance(row, dict):
        # created by dict_factory or ordered_dict_factory
        return list(row.values())
    return list(row)
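All four factories share the signature (colnames, row), so a cursor can swap row representations without touching downstream code. For example (assuming KeyedTuple iterates like a tuple and exposes _fields, which the helpers above rely on):

from recurvedata.pigeon.row_factory import (
    dict_factory,
    get_row_keys,
    get_row_values,
    keyed_tuple_factory,
    tuple_factory,
)

colnames = ("id", "name")
raw = (1, "alice")

print(tuple_factory(colnames, raw))   # (1, 'alice')
print(dict_factory(colnames, raw))    # {'id': 1, 'name': 'alice'}

row = keyed_tuple_factory(colnames, raw)
print(get_row_keys(row))              # ['id', 'name'], via the _fields attribute
print(get_row_values(row))            # [1, 'alice']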
recurvedata/pigeon/schema/__init__.py
@@ -0,0 +1,124 @@

import json

import dateutil.parser

from recurvedata.pigeon.schema import types


class Field(object):
    def __init__(self, name, type, size=None, comment=None, extra=None):
        self.name = name
        self.type = type
        self.size = size
        self.comment = comment
        self.extra = extra

        self._cast_func = {
            types.INT8: self._cast_to_int,
            types.INT16: self._cast_to_int,
            types.INT32: self._cast_to_int,
            types.INT64: self._cast_to_int,
            types.FLOAT32: self._cast_to_float,
            types.FLOAT64: self._cast_to_float,
            types.BOOLEAN: self._cast_to_boolean,
            types.DATETIME: self._cast_to_datetime,
            types.DATE: self._cast_to_date,
            types.JSON: self._cast_to_json,
        }.get(self.type, self._cast_pass)

    def cast(self, value):
        if value is None:
            return None
        if value == "NULL":
            return None
        return self._cast_func(value)

    def _cast_pass(self, value):
        return value

    def _cast_to_int(self, value: str):
        if value == "":
            return 0
        return int(value)

    def _cast_to_float(self, value: str):
        if value == "":
            return 0.0
        return float(value)

    def _cast_to_boolean(self, value: str):
        if value.lower() in ("", "0", "false"):
            return False
        return True

    def _cast_to_datetime(self, value: str):
        if value == "":
            return None
        return dateutil.parser.parse(value)

    def _cast_to_date(self, value: str):
        if value == "":
            return None
        return dateutil.parser.parse(value).date()

    def _cast_to_json(self, value: str):
        if value in ("",):
            # Normally '' should not occur; it is most likely an empty string read
            # from a CSV file, so treat it as None.
            return None
        return json.loads(value)

    def to_dict(self):
        return {
            "name": self.name,
            "type": self.type,
            "size": self.size,
            "comment": self.comment,
            "extra": self.extra,
        }

    def __str__(self):
        return f'<Field ("{self.name}", "{self.type}")>'

    def __repr__(self):
        return f'<Field ("{self.name}", "{self.type}")>'


class Schema(object):
    def __init__(self, fields=None):
        self.fields = fields or []

    def add_field(self, field):
        # TODO(liyangliang): clean field names, handle special characters and duplications
        self.fields.append(field)

    def add_field_by_attrs(self, name, type, size=None, comment=None, extra=None):
        self.add_field(Field(name, type, size, comment, extra))

    def remove_field(self, name):
        self.fields = [x for x in self.fields if x.name != name]

    def keep_fields(self, names):
        self.fields = [x for x in self.fields if x.name in names]

    @property
    def field_names(self):
        return [x.name for x in self.fields]

    def __iter__(self):
        return iter(self.fields)

    def to_list(self):
        return [x.to_dict() for x in self.fields]

    def to_json(self):
        return json.dumps(self.to_list())

    def dump(self, filename):
        with open(filename, "w") as f:
            json.dump(self.to_list(), f, indent=2)

    @classmethod
    def load(cls, filename):
        with open(filename) as f:
            data = json.load(f)
        return cls([Field(**item) for item in data])
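The Field/Schema pair is mostly used to cast CSV string values back into typed Python values. A short sketch using only the API shown above:

from recurvedata.pigeon.schema import Schema, types

schema = Schema()
schema.add_field_by_attrs("id", types.INT64)
schema.add_field_by_attrs("score", types.FLOAT64)
schema.add_field_by_attrs("created_at", types.DATETIME)

row = ["42", "3.14", "2024-01-01T12:00:00"]
typed = [field.cast(value) for field, value in zip(schema, row)]
print(typed)               # [42, 3.14, datetime.datetime(2024, 1, 1, 12, 0)]
print(schema.field_names)  # ['id', 'score', 'created_at']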
recurvedata/pigeon/schema/types.py
@@ -0,0 +1,13 @@

INT8 = "INT8"  # 1-byte (8-bit) signed integers
INT16 = "INT16"  # 2-byte (16-bit) signed integers
INT32 = "INT32"  # 4-byte (32-bit) signed integers
INT64 = "INT64"  # 8-byte (64-bit) signed integers
FLOAT32 = "FLOAT32"  # 4-byte (32-bit) single-precision floating point
FLOAT64 = "FLOAT64"  # 8-byte (64-bit) double-precision floating point
BOOLEAN = "BOOLEAN"

DATETIME = "DATETIME"
DATE = "DATE"

STRING = "STRING"
JSON = "JSON"