recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/loader/csv_to_microsoft_fabric.py
@@ -0,0 +1,242 @@
from typing import TYPE_CHECKING, Any, List, Optional, Union

from recurvedata.pigeon import const
from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs

if TYPE_CHECKING:
    from recurvedata.pigeon.connector.microsoft_fabric import MsFabricConnector

allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)


class CSVToMsFabricLoader(BaseLoader, CSVToDBAPIMixin):
    """Loader for Microsoft Fabric that supports bulk loading data using COPY command.

    This loader provides Microsoft Fabric specific data loading capabilities.
    It uses the COPY command for efficient data loading and supports various
    loading modes (OVERWRITE, MERGE, APPEND).
    """

    def __init__(
        self,
        table: str,
        filename: str,
        connector: "MsFabricConnector",
        schema: Optional[str] = None,
        create_table_ddl: Optional[str] = None,
        mode: str = const.LOAD_MERGE,
        primary_keys: Optional[Union[str, List[str]]] = None,
        columns: Optional[Union[str, List[str]]] = None,
        compress: bool = True,
        delete_file: bool = True,
        dedup: bool = False,
        dedup_uniq_keys: Optional[Union[str, List[str]]] = None,
        dedup_orderby: Optional[Union[str, List[str]]] = None,
        pre_queries: Optional[Union[str, List[str]]] = None,
        post_queries: Optional[Union[str, List[str]]] = None,
        lineterminator: Optional[str] = "0x0D0A",
        *args: Any,
        **kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            table: Target table name
            filename: Source file path
            connector: MsFabricConnector instance
            schema: Schema name
            create_table_ddl: SQL to create target table
            mode: Loading mode (OVERWRITE/MERGE/APPEND)
            primary_keys: Primary key columns for MERGE mode
            columns: Column list to load
            compress: Whether to compress data before loading
            delete_file: Whether to delete source file after loading
            dedup: Whether to deduplicate data
            dedup_uniq_keys: Columns for deduplication
            dedup_orderby: Order by clause for deduplication
            pre_queries: Queries to run before loading
            post_queries: Queries to run after loading
        """
        if "." in table:
            self.schema, self.table = table.split(".")
        else:
            self.schema = schema or "dbo"
            self.table = table

        self.connector = connector
        self.filename = filename
        self.create_table_ddl = create_table_ddl
        self.compress = compress
        self.delete_file = delete_file

        if mode not in allowed_modes:
            raise ValueError(f"mode should be one of ({allowed_modes})")

        self.mode = mode
        self.primary_keys = ensure_str_list(primary_keys)
        self.columns = ensure_str_list(columns)

        # dedup stuff
        self.dedup = dedup
        self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
        self.dedup_orderby = dedup_orderby
        if self.dedup and not self.dedup_uniq_keys:
            raise ValueError("dedup_uniq_keys should not be empty if dedup is true")

        self.pre_queries = ensure_query_list(pre_queries) or []
        self.post_queries = ensure_query_list(post_queries) or []
        self.lineterminator = lineterminator

        super().__init__()

    @property
    def staging_table(self) -> str:
        return f"{self.table}_staging"

    @property
    def full_staging_table_name(self) -> str:
        return f"{self.schema}.{self.staging_table}"

    @property
    def full_table_name(self) -> str:
        return f"{self.schema}.{self.table}"

    @property
    def quoted_full_staging_table(self) -> str:
        return self.connector.quote_identifier(self.full_staging_table_name)

    @property
    def quoted_full_table_name(self) -> str:
        return self.connector.quote_identifier(self.full_table_name)

    def execute_impl(self) -> None:
        """Execute the data loading process."""
        if fs.is_file_empty(self.filename):
            self.logger.error("file not exists or has no content. %s", self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))
            return

        self._prepare_target_table()
        self._prepare_staging_table()
        self._load_to_staging()
        self._merge_into_target_table()

        # do cleaning things
        if self.delete_file:
            self.logger.info("delete local file %s", self.filename)
            fs.remove_files_safely(self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))

    def _prepare_staging_table(self) -> None:
        """Prepare the staging table for data loading."""
        schema, table = self.full_staging_table_name.split(".")
        drop = self._make_drop_table_query(schema, table)
        ddl = f"SELECT TOP 0 * INTO {self.quoted_full_staging_table} FROM {self.quoted_full_table_name}"
        self.connector.execute([drop, ddl])

    def _make_drop_table_query(self, schema: str, table: str) -> str:
        """Generate SQL to drop a table if it exists."""
        if "." in table:
            schema, table = table.split(".")
        if not schema:
            schema = self.schema
        full_table = f"{schema}.{table}"
        query = f"""
        IF EXISTS (
            SELECT * FROM sys.tables
            WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
        )
        DROP TABLE {self.connector.quote_identifier(full_table)}
        """
        return query

    def _load_to_staging(self) -> None:
        """Load data into staging table using COPY command."""
        self.logger.info(f"load {self.filename} into staging table {self.full_staging_table_name}")
        self.connector.load_csv_bulk(
            table=self.full_staging_table_name,
            filename=self.filename,
            columns=self.columns,
            compress=self.compress,
            lineterminator=self.lineterminator,
        )

        if self.dedup:
            dedup_query = self._construct_dedup_query()
            self.connector.execute(dedup_query, autocommit=False, commit_on_close=True)

    def _construct_dedup_query(self) -> str:
        """Construct query for deduplication."""
        partitions_cols = []
        for col in self.dedup_uniq_keys:
            partitions_cols.append(self.connector.quote_identifier(col))
        partition_by = ", ".join(partitions_cols)
        columns = " ,".join(self.connector.get_columns(schema=self.schema, table=self.staging_table))
        tmp_table = f"{self.full_staging_table_name}_tmp"
        quoted_tmp_table = self.connector.quote_identifier(tmp_table)
        quoted_bak_table = self.connector.quote_identifier(f"{self.staging_table}_bak")

        queries = f"""
        {self._make_drop_table_query(self.schema, tmp_table)};

        CREATE TABLE {quoted_tmp_table} AS
        SELECT {', '.join(self.connector.quote_identifier(x) for x in columns)}
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY {partition_by} ORDER BY {self.dedup_orderby}) rn
            FROM {self.quoted_full_staging_table}
        ) AS t
        WHERE rn = 1;

        RENAME OBJECT {self.quoted_full_staging_table} TO {quoted_bak_table};
        RENAME OBJECT {quoted_tmp_table} TO {self.staging_table};
        DROP TABLE {quoted_bak_table};
        """
        return queries

    def _merge_into_target_table(self) -> None:
        """Merge data from staging table into target table."""
        target = self.quoted_full_table_name
        staging = self.quoted_full_staging_table

        append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
        if self.mode == const.LOAD_OVERWRITE:
            queries = [f"TRUNCATE TABLE {target}", append_sql]
        elif self.mode == const.LOAD_MERGE:
            queries = self._ingest_by_merging()
        else:
            # APPEND
            queries = [append_sql]

        queries.append(f"DROP TABLE {staging}")

        queries = self.pre_queries + queries + self.post_queries
        self.logger.info("running Microsoft Fabric queries...")
        self.connector.execute(queries, autocommit=True, commit_on_close=True)
        self.logger.info("done.")

    def _ingest_by_merging(self) -> List[str]:
        """Construct merge query for MERGE mode."""
        merge_table = f"{self.full_table_name}_merge"
        quote = self.connector.quote_identifier
        join = " AND ".join([f"a.{quote(x)} = b.{quote(x)}" for x in self.primary_keys])

        drop_merge_table = self._make_drop_table_query(self.schema, merge_table)
        queries = f"""
        {drop_merge_table};

        CREATE TABLE {quote(merge_table)} WITH (DISTRIBUTION = ROUND_ROBIN)
        AS
        SELECT a.*
        FROM {self.quoted_full_table_name} AS a
        LEFT JOIN {self.quoted_full_staging_table} AS b ON {join}
        WHERE b.{quote(self.primary_keys[0])} IS NULL
        UNION ALL
        SELECT * FROM {self.quoted_full_staging_table};

        TRUNCATE TABLE {self.quoted_full_table_name};
        INSERT INTO {self.quoted_full_table_name} SELECT * FROM {quote(merge_table)};

        {drop_merge_table};
        """
        return queries.split(";")
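For orientation, this loader follows a staging-table pattern: it bulk-loads the CSV into a "<table>_staging" table with load_csv_bulk, optionally deduplicates it with a ROW_NUMBER() window query, then truncates, merges, or appends into the target. A minimal usage sketch follows; the MsFabricConnector constructor arguments and the execute() entry point (presumably provided by BaseLoader, which calls execute_impl()) are assumptions for illustration and are not shown in this diff.

# Sketch only: MsFabricConnector construction details and BaseLoader.execute()
# are assumed for illustration; they are not part of this diff.
from recurvedata.pigeon import const
from recurvedata.pigeon.connector.microsoft_fabric import MsFabricConnector
from recurvedata.pigeon.loader.csv_to_microsoft_fabric import CSVToMsFabricLoader

connector = MsFabricConnector(...)  # hypothetical: connection settings go here

loader = CSVToMsFabricLoader(
    table="dbo.orders",              # "schema.table" also splits the schema out
    filename="/tmp/orders.csv",
    connector=connector,
    mode=const.LOAD_MERGE,           # MERGE rebuilds the target via a merge table keyed on primary_keys
    primary_keys=["order_id"],
    dedup=True,
    dedup_uniq_keys=["order_id"],
    dedup_orderby="updated_at DESC",
)
loader.execute()  # assumed public entry point wrapping execute_impl()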
recurvedata/pigeon/loader/csv_to_mssql.py
@@ -0,0 +1,174 @@
from recurvedata.pigeon import const
from recurvedata.pigeon.connector.mssql import SQLServerConnector
from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs

allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
STATING_TABLE_NAME_PLACEHOLDER = "<TABLE>"


class CSVToMsSQLLoader(BaseLoader, CSVToDBAPIMixin):
    def __init__(
        self,
        database,
        table,
        filename,
        connector: SQLServerConnector,
        schema=None,
        create_table_ddl=None,
        staging_create_table_ddl=None,
        mode=const.LOAD_OVERWRITE,
        primary_keys=None,
        skiprows=0,
        columns=None,
        using_insert=True,
        insert_batch_size=500,
        insert_concurrency=1,
        delete_file=False,
        pre_queries=None,
        post_queries=None,
        *args,
        **kwargs,
    ):
        self.database = database
        self.table = table

        if "." in table:
            self.schema, self.table = table.split(".")
        else:
            self.schema = schema or "dbo"
            self.table = table

        connector.database = self.database
        self.connector = connector
        self.filename = filename
        self.create_table_ddl = create_table_ddl
        # Fully copying the table structure (including constraints and indexes) is complex, so an explicit staging-table DDL may be supplied;
        # the table name is represented by the special placeholder <TABLE>.
        self.staging_create_table_ddl = staging_create_table_ddl
        if self.staging_create_table_ddl and STATING_TABLE_NAME_PLACEHOLDER not in self.staging_create_table_ddl:
            raise ValueError(f"use {STATING_TABLE_NAME_PLACEHOLDER} as table name placeholder")

        if mode not in allowed_modes:
            raise ValueError("mode should be one of ({})".format(allowed_modes))

        self.mode = mode
        self.primary_keys = ensure_str_list(primary_keys)
        if self.mode == const.LOAD_MERGE and not self.primary_keys:
            raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))

        # self.columns = columns or self.csvfile.header
        # self.skiprows = int(skiprows or self.csvfile.has_header)
        self.columns = columns
        self.skiprows = int(skiprows)
        self.using_insert = using_insert
        self.insert_batch_size = insert_batch_size
        self.insert_concurrency = insert_concurrency
        self.delete_file = delete_file

        self.pre_queries = ensure_query_list(pre_queries) or []
        self.post_queries = ensure_query_list(post_queries) or []

        super().__init__()

    @property
    def staging_table(self):
        return f"{self.schema}.{self.table}_staging"

    @property
    def full_table_name(self):
        return f"{self.schema}.{self.table}"

    @property
    def quoted_staging_table(self):
        return self.connector.quote_identifier(self.staging_table)

    @property
    def quoted_full_table_name(self):
        return self.connector.quote_identifier(self.full_table_name)

    def execute_impl(self):
        if fs.is_file_empty(self.filename):
            self.logger.error("file not exists or has no content. %s", self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))
            return

        self._prepare_target_table()
        self._prepare_staging_table()
        self._load_to_staging()
        self._merge_into_target_table()

        # do cleaning things
        if self.delete_file:
            self.logger.info("delete local file %s", self.filename)
            fs.remove_files_safely(self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))

    def _prepare_staging_table(self):
        schema, table = self.staging_table.split(".")
        if self.staging_create_table_ddl:
            ddl: str = self.staging_create_table_ddl.replace(STATING_TABLE_NAME_PLACEHOLDER, self.quoted_staging_table)
            ddl = ddl.rstrip(";")
        else:
            ddl = f"SELECT TOP 0 * INTO {self.quoted_staging_table} FROM {self.quoted_full_table_name}"

        query = f"""
        IF EXISTS (
            SELECT * FROM sys.tables
            WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
        )
        DROP TABLE {self.quoted_staging_table};

        {ddl}
        """
        self.connector.execute(query)

    def _load_to_staging(self):
        self.logger.info("load %s into staging table %s", self.filename, self.staging_table)
        self.connector.load_csv(
            table=self.staging_table,
            filename=self.filename,
            schema=self.schema,
            columns=self.columns,
            skiprows=self.skiprows,
            using_insert=self.using_insert,
            null_values=("NULL", r"\N", ""),
            batch_size=self.insert_batch_size,
            concurrency=self.insert_concurrency,
        )

    def _merge_into_target_table(self):
        target = self.quoted_full_table_name
        staging = self.quoted_staging_table

        queries = []
        if self.mode == const.LOAD_OVERWRITE:
            queries.append(f"TRUNCATE TABLE {target}")
            append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
            queries.append(append_sql)
        elif self.mode == const.LOAD_MERGE:
            joins = []
            for field in self.primary_keys:
                field = self.connector.quote_identifier(field)
                join = f"{target}.{field} = {staging}.{field}"
                joins.append(join)

            join_conditions = " AND ".join(joins)
            # Delete existing records that match primary keys
            delete_sql = f"DELETE {target} FROM {target} INNER JOIN {staging} ON {join_conditions}"
            queries.append(delete_sql)

            # Insert all data from staging table to target table
            insert_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
            queries.append(insert_sql)
        else:
            # APPEND mode
            append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
            queries.append(append_sql)

        queries.append(f"DROP TABLE {staging}")

        queries = self.pre_queries + queries + self.post_queries
        self.logger.info("running SQL Server queries...")
        self.connector.execute(queries, autocommit=False, commit_on_close=True)
        self.logger.info("done.")
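The SQL Server loader takes the same staging-table approach but loads row batches through load_csv (INSERT-based by default here) and, in MERGE mode, deletes matching keys from the target before re-inserting from staging. The sketch below shows the <TABLE> placeholder contract for staging_create_table_ddl; the SQLServerConnector setup and the execute() entry point are assumptions for illustration, not confirmed by this diff.

# Sketch only: SQLServerConnector construction and BaseLoader.execute() are assumed.
from recurvedata.pigeon import const
from recurvedata.pigeon.connector.mssql import SQLServerConnector
from recurvedata.pigeon.loader.csv_to_mssql import CSVToMsSQLLoader

connector = SQLServerConnector(...)  # hypothetical connection settings

loader = CSVToMsSQLLoader(
    database="analytics",
    table="dbo.orders",
    filename="/tmp/orders.csv",
    connector=connector,
    mode=const.LOAD_MERGE,
    primary_keys=["order_id"],
    # Optional explicit staging DDL; "<TABLE>" is replaced with the quoted staging table name.
    staging_create_table_ddl="CREATE TABLE <TABLE> (order_id INT NOT NULL, amount DECIMAL(18, 2));",
)
loader.execute()  # assumed public entry point wrapping execute_impl()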
recurvedata/pigeon/loader/csv_to_mysql.py
@@ -0,0 +1,180 @@
from recurvedata.pigeon import const
from recurvedata.pigeon.connector import new_mysql_connector
from recurvedata.pigeon.csv import CSV
from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
from recurvedata.pigeon.utils.sql import bak_table_of, staging_table_of

allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)


class CSVToMySQLLoader(BaseLoader, CSVToDBAPIMixin):
    def __init__(
        self,
        database,
        table,
        filename,
        connector=None,
        create_table_ddl=None,
        mode=const.LOAD_OVERWRITE,
        primary_keys=None,
        skiprows=0,
        columns=None,
        using_insert=False,
        insert_batch_size=1000,
        insert_concurrency=1,
        delete_file=False,
        tidb_dml_batch_size=500,
        pre_queries=None,
        post_queries=None,
        *args,
        **kwargs,
    ):
        self.database = database
        self.table = table

        if isinstance(filename, CSV):
            filename = filename.path
        self.filename = filename
        self.csvfile = CSV(self.filename)

        if connector is None:
            connector = new_mysql_connector(database=self.database)
        else:
            connector.database = self.database
        self.connector = connector

        self.create_table_ddl = create_table_ddl

        if mode not in allowed_modes:
            raise ValueError("mode should be one of ({})".format(allowed_modes))

        self.mode = mode
        self.primary_keys = ensure_str_list(primary_keys)
        if self.mode == const.LOAD_MERGE and not self.primary_keys:
            raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))

        # self.columns = columns or self.csvfile.header
        # self.skiprows = int(skiprows or self.csvfile.has_header)
        self.columns = columns
        self.skiprows = int(skiprows)
        self.using_insert = using_insert
        self.insert_batch_size = insert_batch_size
        self.insert_concurrency = insert_concurrency
        self.delete_file = delete_file

        # https://pingcap.com/docs-cn/sql/tidb-specific/#tidb-dml-batch-size
        self.tidb_dml_batch_size = tidb_dml_batch_size

        self.pre_queries = ensure_query_list(pre_queries) or []
        self.post_queries = ensure_query_list(post_queries) or []

        super().__init__()

    @property
    def staging_table(self):
        return staging_table_of(self.table)

    def execute_impl(self):
        if fs.is_file_empty(self.filename):
            self.logger.error("file not exists or has no content. %s", self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))
            return
        self._prepare_target_table()
        self._prepare_staging_table()
        self._load_to_staging()
        self._merge_into_target_table()

        # do cleaning things
        if self.delete_file:
            self.logger.info("delete local file %s", self.filename)
            fs.remove_files_safely(self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))

    def _prepare_staging_table(self):
        queries = """
        DROP TABLE IF EXISTS {staging};
        CREATE TABLE {staging} LIKE {table};
        """.format(
            staging=self.staging_table, table=self.table
        )
        self.connector.execute(queries, autocommit=True)

    def _load_to_staging(self):
        self.connector.load_csv(
            table=self.staging_table,
            filename=self.csvfile.path,
            columns=self.columns,
            lineterminator=self.csvfile.dialect.lineterminator,
            skiprows=self.skiprows,
            using_insert=self.using_insert,
            null_values=("NULL", r"\N", ""),
            batch_size=self.insert_batch_size,
            concurrency=self.insert_concurrency,
        )

    def _merge_into_target_table(self):
        queries = []
        if self.connector.is_tidb():
            queries.append("SET autocommit=1")
            queries.append("SET @@session.tidb_batch_delete=ON")
            queries.append("SET @@session.tidb_batch_insert=ON")
            if self.tidb_dml_batch_size:
                queries.append(f"SET @@session.tidb_dml_batch_size={self.tidb_dml_batch_size}")

        if self.mode == const.LOAD_MERGE:
            queries.extend(self._ingest_by_merging())
        elif self.mode == const.LOAD_OVERWRITE:
            bak_table = bak_table_of(self.table)
            queries.append(f"DROP TABLE IF EXISTS {bak_table}")
            queries.append(f"RENAME TABLE {self.table} TO {bak_table}")
            queries.append(f"RENAME TABLE {self.staging_table} TO {self.table}")
            queries.append(f"DROP TABLE IF EXISTS {bak_table}")
        else:
            queries.append(f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}")
            queries.append(f"DROP TABLE {self.staging_table}")

        queries = self.pre_queries + queries + self.post_queries
        self.logger.info("running MySQL queries within a transaction")
        self.connector.execute(queries, autocommit=False, commit_on_close=True)

    def _ingest_by_merging(self):
        """Merge with deduplication based on specified primary_keys"""
        # First, deduplicate staging table based on primary_keys using window function
        pk_columns = ", ".join(self.primary_keys)

        # Get all columns from staging table (excluding the rn column we'll add)
        cols = self.connector.get_columns(self.staging_table)
        cols_str = ", ".join(self.connector.quote_identifier(x) for x in cols)

        # Create a temporary table with deduplicated data
        tmp_table = f"{self.staging_table}_dedup"
        dedup_sql = f"""
        DROP TABLE IF EXISTS {tmp_table};
        CREATE TABLE {tmp_table} LIKE {self.staging_table};
        INSERT INTO {tmp_table}
        SELECT {cols_str} FROM (
            SELECT *, ROW_NUMBER() OVER(PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
            FROM {self.staging_table}
        ) t
        WHERE rn = 1;
        """

        # Replace staging table with deduplicated data
        replace_sql = f"""
        DROP TABLE {self.staging_table};
        RENAME TABLE {tmp_table} TO {self.staging_table};
        """

        # Delete records from target table that have the same primary keys as staging table
        join_condition = " AND ".join([f"a.{pk} = b.{pk}" for pk in self.primary_keys])
        delete_sql = f"""
        DELETE a FROM {self.table} a
        INNER JOIN {self.staging_table} b ON {join_condition}
        """

        # Insert deduplicated data into target table
        insert_sql = f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}"
        drop_sql = f"DROP TABLE {self.staging_table}"

        return [dedup_sql, replace_sql, delete_sql, insert_sql, drop_sql]
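The MySQL loader can build its own connector when none is passed (new_mysql_connector(database=...)) and adds TiDB session settings (tidb_batch_delete, tidb_batch_insert, tidb_dml_batch_size) before merging when the connector reports a TiDB backend. A minimal sketch, assuming the default-connector path and a BaseLoader.execute() entry point, neither of which is shown in this diff:

# Sketch only: relies on the connector=None path (new_mysql_connector resolves its own
# connection settings) and an assumed BaseLoader.execute() entry point.
from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_mysql import CSVToMySQLLoader

loader = CSVToMySQLLoader(
    database="analytics",
    table="orders",
    filename="/tmp/orders.csv",   # a pigeon CSV object is also accepted
    mode=const.LOAD_OVERWRITE,    # OVERWRITE swaps tables via RENAME rather than TRUNCATE
    delete_file=True,             # remove the local CSV after a successful load
)
loader.execute()  # assumed public entry point wrapping execute_impl()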