recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib has been flagged as potentially problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/loader/csv_to_es.py

@@ -0,0 +1,51 @@

import csv

from recurvedata.pigeon.loader.base import BaseLoader
from recurvedata.pigeon.schema import Schema
from recurvedata.pigeon.utils import fs


class CSVToElasticSearchLoader(BaseLoader):
    def __init__(
        self,
        index,
        doc_type,
        filename,
        connector,
        id_field=None,
        generate_id=False,
        delete_file=False,
        csv_options=None,
    ):
        self.index = index
        self.doc_type = doc_type
        self.filename = filename
        self.id_field = id_field
        self.generate_id = generate_id
        self.delete_file = delete_file
        self.es = connector

        self.csv_options = csv_options or {"quoting": csv.QUOTE_ALL, "doublequote": True}

        super().__init__()

    def execute_impl(self):
        schema_file = fs.schema_filename(self.filename)
        if fs.exists(schema_file):
            schema = Schema.load(schema_file)
        else:
            schema = None

        self.es.load_csv(
            self.filename,
            self.index,
            self.doc_type,
            schema,
            id_field=self.id_field,
            generate_id=self.generate_id,
            **self.csv_options,
        )

        if self.delete_file:
            fs.remove_files_safely(self.filename)
            fs.remove_files_safely(schema_file)
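A minimal usage sketch of the loader above, not taken from the package: the connector object, index name, and file path are placeholders, and execute() is assumed to be the BaseLoader entry point that wraps execute_impl().

# Hypothetical wiring; only the constructor arguments mirror the class above.
from recurvedata.pigeon.loader.csv_to_es import CSVToElasticSearchLoader

loader = CSVToElasticSearchLoader(
    index="events",              # illustrative target Elasticsearch index
    doc_type="_doc",
    filename="/tmp/events.csv",  # CSV produced by a pigeon dumper; a .schema sidecar is picked up if present
    connector=es_connector,      # placeholder: any connector exposing load_csv(filename, index, doc_type, schema, ...)
    id_field="event_id",         # column to use as the document id
    delete_file=True,            # remove the CSV and its schema file after loading
)
loader.execute()                 # assumed BaseLoader wrapper around execute_impl()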
recurvedata/pigeon/loader/csv_to_google_bigquery.py

@@ -0,0 +1,169 @@

from typing import TYPE_CHECKING, List, Union

from recurvedata.pigeon import const
from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
from recurvedata.pigeon.utils.sql import bak_table_of, reconcile_table_of, staging_table_of

if TYPE_CHECKING:
    from recurvedata.pigeon.connector.google_bigquery import GoogleBigqueryConnector

allowed_modes = (
    const.LOAD_OVERWRITE,
    const.LOAD_MERGE,
    const.LOAD_APPEND
)


class CSVToGoogleBigqueryLoader(BaseLoader, CSVToDBAPIMixin):
    def __init__(
        self,
        table: str,
        filename: str,
        google_bigquery_connector: 'GoogleBigqueryConnector' = None,
        dataset: str = None,
        create_table_ddl: str = None,
        mode: str = const.LOAD_OVERWRITE,
        primary_keys: Union[str, List[str]] = None,
        columns: Union[str, List[str]] = None,
        skiprows: int = 0,
        delete_file: bool = True,
        pre_queries: str = None,
        post_queries: str = None,
        *args, **kwargs
    ):
        self.table = table
        self.dataset = dataset

        self.google_bigquery = google_bigquery_connector

        # determine table name of target table and staging table
        self.filename = filename  # full file path
        self.skiprows = skiprows

        # determine table ddl stuff
        self.create_table_ddl = create_table_ddl

        # merge stuff
        if mode not in allowed_modes:
            raise ValueError(f'mode should be one of ({allowed_modes})')

        self.mode = mode
        self.primary_keys = ensure_str_list(primary_keys)
        if self.mode == const.LOAD_MERGE and not self.primary_keys:
            raise ValueError('primary_keys should not be empty in mode {}'.format(const.LOAD_MERGE))

        self.columns = ensure_str_list(columns)

        self.pre_queries = ensure_query_list(pre_queries) or []
        self.post_queries = ensure_query_list(post_queries) or []

        self.delete_file = delete_file

        super().__init__()

    def execute_impl(self):
        if fs.is_file_empty(self.filename):
            self.logger.error('file not exists or has no content. %s', self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))
            return

        self._prepare_target_table()
        self._prepare_staging_table()
        self._merge_into_target_table()

        # do cleaning things
        if self.delete_file:
            self.logger.info('delete local file %s', self.filename)
            fs.remove_files_safely(self.filename)
            fs.remove_files_safely(fs.schema_filename(self.filename))

    @property
    def connector(self):
        return self.google_bigquery

    @property
    def staging_table(self):
        return staging_table_of(self.table)

    @property
    def full_staging_table_name(self):
        return f'{self.dataset}.{self.staging_table}'

    @property
    def full_table_name(self):
        return f'{self.dataset}.{self.table}'

    def _prepare_target_table(self):
        # add schema for azure data warehouse, dataset for google bigquery
        if self.connector.has_table(table=self.table, schema=getattr(self, 'schema', None),
                                    dataset=getattr(self, 'dataset', None)):
            return

        self.logger.info('table not found, try to create it')
        ddl = self._infer_create_table_ddl()
        if not ddl:
            raise ValueError('table not found, create_table_ddl is required')
        ddl = ddl.strip().rstrip(';')
        self.logger.info('create table ddl: %s\n', ddl)
        with self.connector.cursor() as cursor:
            cursor.execute(ddl)

    def _prepare_staging_table(self):
        dataset, table = self.full_staging_table_name.split('.')
        drop = f'DROP TABLE IF EXISTS {self.full_staging_table_name}'
        staging_ddl = f'CREATE TABLE IF NOT EXISTS {self.full_staging_table_name} LIKE {self.full_table_name}'
        self.connector.execute([drop, staging_ddl], auto_commit=False, commit_on_close=True)

        self.logger.info(f'load {self.filename} into staging table {self.full_staging_table_name}')
        self.connector.load_csv(table=self.full_staging_table_name,
                                filename=self.filename,
                                schema=self.connector.get_schema(table, dataset),
                                skiprows=self.skiprows)

    def _merge_into_target_table(self):
        target = self.full_table_name
        staging = self.full_staging_table_name

        append_sql = f'INSERT INTO {target} SELECT * FROM {staging}'
        if self.mode == const.LOAD_OVERWRITE:
            queries = [f'TRUNCATE TABLE {target}', append_sql]
        elif self.mode == const.LOAD_MERGE:
            queries = self._ingest_by_merging()
        else:
            # APPEND
            queries = [append_sql]

        queries.append(f'DROP TABLE {staging}')

        queries = self.pre_queries + queries + self.post_queries
        self.logger.info('running Google Bigquery queries...')
        self.connector.execute(queries)
        self.logger.info('done.')

    def _ingest_by_merging(self):
        reconcile = reconcile_table_of(self.table)
        bak = bak_table_of(self.table)

        quote = self.connector.quote_identifier
        join = ' AND '.join([f'a.{quote(x)} = b.{quote(x)}' for x in self.primary_keys])

        queries = f"""
        DROP TABLE IF EXISTS {self.dataset}.{reconcile};
        CREATE TABLE IF NOT EXISTS {self.dataset}.{reconcile} LIKE {self.full_table_name};

        INSERT INTO {self.dataset}.{reconcile}
        SELECT a.*
        FROM {self.full_table_name} AS a
        LEFT JOIN {self.full_staging_table_name} AS b ON {join}
        WHERE b.{quote(self.primary_keys[0])} IS NULL
        UNION ALL
        SELECT * FROM {self.full_staging_table_name};

        ALTER TABLE {self.full_table_name} RENAME TO {bak};
        ALTER TABLE {self.dataset}.{reconcile} RENAME TO {self.table};

        DROP TABLE IF EXISTS {self.dataset}.{bak};
        DROP TABLE IF EXISTS {self.dataset}.{reconcile};
        """
        return queries.split(';')
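A rough sketch of how this loader might be driven; the connector, dataset, and table names are placeholders, and execute() is again assumed to be the BaseLoader entry point. Note that LOAD_MERGE requires primary_keys per the constructor check above.

# Hypothetical wiring; only the constructor arguments mirror the class above.
from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_google_bigquery import CSVToGoogleBigqueryLoader

loader = CSVToGoogleBigqueryLoader(
    table="orders",                          # illustrative target table
    dataset="analytics",                     # BigQuery dataset that holds the table
    filename="/tmp/orders.csv",
    google_bigquery_connector=bq_connector,  # placeholder: a GoogleBigqueryConnector built elsewhere
    mode=const.LOAD_MERGE,                   # merge staged rows into the target by key
    primary_keys=["order_id"],               # required when mode is LOAD_MERGE
)
loader.execute()                             # assumed BaseLoader wrapper around execute_impl()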
recurvedata/pigeon/loader/csv_to_hive.py

@@ -0,0 +1,468 @@

import functools
import glob
import json
import os
import tempfile
from json.decoder import JSONDecodeError
from typing import TYPE_CHECKING, Dict, List, Union

import cytoolz as toolz
from slugify import slugify

from recurvedata.pigeon import const
from recurvedata.pigeon.connector import new_hive_connector, new_impala_connector
from recurvedata.pigeon.csv import CSV
from recurvedata.pigeon.handler.csv_handler import convert_csv_to_hive_textfile
from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
from recurvedata.pigeon.utils import ensure_list, ensure_query_list, ensure_str_list, fs, silent
from recurvedata.pigeon.utils.sql import reconcile_table_of, staging_table_of

if TYPE_CHECKING:
    from recurvedata.pigeon.connector.hive_impala import HiveConnector, ImpalaConnector

allowed_modes = (
    const.LOAD_OVERWRITE,
    const.LOAD_MERGE,
    const.LOAD_APPEND
)

AUTO = object()


def _enable_connection_pooling(method):
    @functools.wraps(method)
    def inner(self: 'CSVToHiveLoader', *args, **kwargs):
        self.hive.enable_connection_pooling(reset_on_return=False)
        self.impala.enable_connection_pooling(reset_on_return=False)
        try:
            return method(self, *args, **kwargs)
        except BaseException as e:
            raise e
        finally:
            self.hive.dispose()
            self.impala.dispose()

    return inner


class CSVToHiveLoader(BaseLoader, CSVToDBAPIMixin):
    def __init__(
        self,
        database: str,
        table: str,
        filename: str,
        hive_connector: 'HiveConnector' = None,
        impala_connector: 'ImpalaConnector' = None,
        create_table_ddl: str = None,
        dynamic_partition: str = None,
        partition: Dict = None,
        mode: str = const.LOAD_OVERWRITE,
        primary_keys: List[str] = None,
        using_impala: bool = AUTO,
        delete_file: bool = False,
        dedup: bool = False,
        dedup_uniq_keys: List[str] = None,
        dedup_orderby: str = None,
        pre_queries: Union[str, List[str]] = None,
        post_queries: Union[str, List[str]] = None,
        is_std_csv: bool = False,
        has_header: bool = False,
        csv_options: Dict = None,
        compression_codec: str = "snappy",
        dumper_meta: Dict = None,
        refresh_impala_metadata: bool = True
    ):
        """Loads a csv file into a Hive table.

        :param database: the target database name
        :param table: target table name, should not contain the database portion
        :param filename: the absolute path to the csv file, can be a single string or a list of strings
        :param hive_connector: a HiveConnector object used to query Hive
        :param impala_connector: an ImpalaConnector object used to query Impala
        :param create_table_ddl: the CREATE TABLE DDL
        :param dynamic_partition: dynamic partition spec, should be a string like 'site,month'
        :param partition: partition spec, should be a dict like {'dt': '2017-01-01'}
        :param mode: one of (LOAD_OVERWRITE, LOAD_MERGE, LOAD_APPEND)
        :param primary_keys: columns that identify a unique row, e.g. ['dt', 'product_id']. Required if mode is LOAD_MERGE
        :param using_impala: whether to use Impala to merge data or not. Possible values:
            - `AUTO` (by default): determined by whether the table has complex type fields.
            - `True`: use Impala, will fail if the table has complex type fields
            - `False` and other values: fall back to using Hive
        :param delete_file: delete the CSV file after loading, default is False
        :param dedup: remove duplicated records from the staging table before being merged into the target
        :param dedup_uniq_keys: columns that identify a unique row.
        :param dedup_orderby: determines which row should be kept.
            e.g. to keep the row with the minimal timestamp, set `dedup_orderby='timestamp ASC'`
        :param pre_queries: queries executed before loading
        :param post_queries: queries executed after loading
        :param is_std_csv: whether the input file is a standard CSV file (as opposed to a Hive TextFile)
        :param compression_codec: compression codec, one of {none, snappy, gzip}
        :param dumper_meta: dumper output and options like check_dumper_row
        """
        self.database = database
        self.table = table

        if hive_connector is None:
            hive_connector = new_hive_connector(database=self.database)
        else:
            hive_connector.database = self.database
        self.hive = hive_connector

        if impala_connector is None:
            impala_connector = new_impala_connector(database=self.database)
        else:
            impala_connector.database = self.database
        self.impala = impala_connector
        self.refresh_impala_metadata = refresh_impala_metadata

        self.filename = filename
        self._local_data_files = self._determine_local_data_files()
        self._schema_filename = self._infer_schema_filename()

        self.is_std_csv = is_std_csv
        self.has_header = has_header
        self.csv_options = csv_options or {}

        self.create_table_ddl = create_table_ddl
        # `partition` was renamed to static_partition internally; the constructor argument
        # keeps its old name to avoid touching a lot of calling code
        self.static_partition = partition
        if dynamic_partition:
            self.dynamic_partition = dynamic_partition.split(',')
        else:
            self.dynamic_partition = None
        if self.static_partition and self.dynamic_partition:
            raise ValueError('Partition mode only be static or dynamic')
        if self.dynamic_partition and not self.connector.is_table_partitioned(self.database, self.table):
            if not create_table_ddl or 'partitioned by' not in self.create_table_ddl.lower():
                raise ValueError("Table not found or is not partitioned, create_table_ddl is required and "
                                 "assign partition columns when use dynamic partition mode")

        if mode not in allowed_modes:
            raise ValueError('mode should be one of ({})'.format(allowed_modes))

        self.mode = mode
        self.primary_keys = ensure_str_list(primary_keys)
        if self.mode == const.LOAD_MERGE and not self.primary_keys:
            raise ValueError('primary_keys should not be empty in mode {}'.format(const.LOAD_MERGE))
        if self.mode == const.LOAD_MERGE and (self.static_partition or self.dynamic_partition):
            raise ValueError('merge into partitioned table is not supported')

        self.delete_file = delete_file
        self.using_impala = using_impala

        self.dedup = dedup
        self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
        self.dedup_orderby = dedup_orderby
        if self.dedup and not self.dedup_uniq_keys:
            raise ValueError('dedup_uniq_keys should not be empty')
        if not self.dedup_orderby:
            self.dedup_orderby = ', '.join(self.dedup_uniq_keys)

        self.pre_queries = ensure_query_list(pre_queries) or []
        self.post_queries = ensure_query_list(post_queries) or []
        self.compression_codec = compression_codec
        self.dumper_meta = dumper_meta

        super().__init__()

    @property
    def schema_filename(self) -> str:
        return self._schema_filename

    @_enable_connection_pooling
    def execute_impl(self):
        if all([fs.is_file_empty(x) for x in self._local_data_files]):
            self.logger.error('file not exists or has no content. %s', self.filename)
            self._cleanup()
            return

        self._prepare_target_table()
        self._check_target_table_cols_num()
        self._prepare_staging_table()
        self._merge_into_target_table()
        if self.refresh_impala_metadata:
            self._compute_stats()

        if self.delete_file:
            self._cleanup()

    @property
    def slugify_partition(self) -> str:
        if self.static_partition is None:
            return ''
        names = [slugify(str(value), separator='') for _, value in self.static_partition.items()]
        return '_'.join(names)

    @property
    def staging_table(self) -> str:
        if not self.static_partition:
            table_name = staging_table_of(self.table)
        else:
            table_name = staging_table_of(f'{self.table}_{self.slugify_partition}')
        return table_name[:120]

    @property
    def reconciled_table(self) -> str:
        if not self.static_partition:
            table_name = reconcile_table_of(self.table)
        else:
            table_name = reconcile_table_of(f'{self.table}_{self.slugify_partition}')
        return table_name[:120]

    @property
    def connector(self) -> 'HiveConnector':
        return self.hive

    def _determine_local_data_files(self) -> List[str]:
        if isinstance(self.filename, str) and os.path.isdir(self.filename):
            raise TypeError('filename should neither be a single path or list of paths, directory is not supported')

        # ignore the empty or non-exist files
        files = [x for x in ensure_list(self.filename) if not x.endswith('.schema') and not fs.is_file_empty(x)]

        # make sure the first file is not empty
        files.sort(key=lambda x: os.path.getsize(x), reverse=True)
        return files

    def _infer_schema_filename(self) -> str:
        if self._local_data_files:
            f = self._local_data_files[0]
        elif self.filename:
            f = ensure_list(self.filename)[0]
        else:
            return None
        return fs.schema_filename(os.path.splitext(f)[0])

    def _cleanup(self):
        fs.remove_files_safely(self.filename)
        fs.remove_files_safely(self._schema_filename)

    def _check_target_table_cols_num(self):
        # get the column information of the target table
        if not self.static_partition:
            exclude = None
        else:
            exclude = self.static_partition.keys()
        target_table_cols = self.connector.get_columns(table=self.table, database=self.database, exclude=exclude)

        # parse the schema file to get the number of fields
        if not fs.is_file_empty(self._schema_filename):
            with open(self._schema_filename) as f:
                try:
                    schema_fields = json.load(f)
                    if len(schema_fields) == len(target_table_cols):
                        return
                except JSONDecodeError:
                    pass

        # parse the csv data file to get the number of columns
        if self.is_std_csv:
            cf = CSV(self._local_data_files[0], **self.csv_options)
            with cf.reader(as_dict=False) as reader:
                row = next(reader)
                schema_fields_num = len(row)
        else:
            # Hive-formatted csv
            with open(self._local_data_files[0]) as f:
                line = next(f)
                schema_fields_num = len(line.split(const.HIVE_FIELD_DELIMITER))
        if schema_fields_num != len(target_table_cols):
            raise Exception(f'number of columns mismatch, target table has {target_table_cols} columns,'
                            f' while data file has {schema_fields_num}')

    def _prepare_staging_table(self):
        staging_table = self.hive.quote_identifier(self.staging_table)
        queries = [
            f"DROP TABLE IF EXISTS {staging_table} PURGE;"
        ]
        exclude_columns = self.static_partition.keys() if self.static_partition else None
        staging_ddl = self.hive.generate_load_staging_table_ddl(staging_table, self.table, self.database,
                                                                exclude_columns=exclude_columns)
        queries.append(staging_ddl)
        self.hive.execute(queries)

        path_to_load = self._local_data_files
        if self.is_std_csv:
            self.logger.info('got standard CSV file, convert to Hive text file before loading')
            prefix = os.path.splitext(os.path.basename(self._local_data_files[0]))[0]
            tmp_folder = tempfile.mkdtemp(prefix=f'{prefix}_', dir=os.path.dirname(self._local_data_files[0]))
            if os.path.exists(tmp_folder):
                self.logger.warning(f'tmp folder {tmp_folder} already exists, will overwrite any files if exist')
                fs.remove_folder_safely(tmp_folder)
            os.makedirs(tmp_folder, exist_ok=True)

            for cf in self._local_data_files:
                convert_csv_to_hive_textfile(cf, folder=tmp_folder, replace=False,
                                             has_header=self.has_header, **self.csv_options)
            path_to_load = glob.glob(os.path.join(tmp_folder, '*'))
            self.logger.info(f'the real files to be loaded into {self.staging_table} are {path_to_load}')

        self.hive.load_local_file(self.staging_table, path_to_load)

        if self._determine_using_impala():
            self.impala.execute(f'INVALIDATE METADATA {self.impala.quote_identifier(self.staging_table)}')

        self._check_staging_table_rows()

        # remove the temp files
        if path_to_load != self._local_data_files:
            self.logger.info(f'delete {path_to_load} after being loaded to {self.staging_table}')
            fs.remove_folder_safely(os.path.dirname(path_to_load[0]))

    def _construct_dedup_query(self) -> str:
        partition_cols = []
        for col in self.dedup_uniq_keys:
            partition_cols.append(self.hive.quote_identifier(col))
        partition_by = ', '.join(partition_cols)

        cols = self.hive.get_columns(self.staging_table)
        staging_table = self.hive.quote_identifier(self.staging_table)

        query = f'''
        WITH t AS (
            SELECT *, ROW_NUMBER() OVER(PARTITION BY {partition_by} ORDER BY {self.dedup_orderby}) AS rnk
            FROM {staging_table}
        )
        INSERT OVERWRITE TABLE {staging_table}
        SELECT {', '.join(self.hive.quote_identifier(x) for x in cols)}
        FROM t WHERE rnk = 1
        '''
        return query

    def _get_compression_sqls(self) -> List[str]:
        using_impala = self._determine_using_impala()
        compression_sqls = []
        if using_impala:
            allow_text = "SET ALLOW_UNSUPPORTED_FORMATS=True"
            set_codec = "SET COMPRESSION_CODEC = {}".format(self.compression_codec)
            compression_sqls = [allow_text, set_codec]
        else:
            if self.compression_codec != "none" and self._is_low_hive_version():
                set_codec = "SET parquet.compression = {}".format(self.compression_codec)
                compression_sqls = [set_codec]
        return compression_sqls

    def _merge_into_target_table(self):
        if self.dedup:
            self.pre_queries.append(self._construct_dedup_query())

        if self.mode in (const.LOAD_OVERWRITE, const.LOAD_APPEND):
            queries = self._ingest_by_overwriting_appending()
        else:
            queries = self._ingest_by_merging()

        queries.append('DROP TABLE IF EXISTS {} PURGE'.format(self.hive.quote_identifier(self.staging_table)))
        all_queries = self.pre_queries + queries + self.post_queries
        self._execute_merge_queries(all_queries)

    def _ingest_by_overwriting_appending(self) -> List[str]:
        compression_sqls = self._get_compression_sqls()
        insert_mode = {
            const.LOAD_OVERWRITE: 'OVERWRITE',
            const.LOAD_APPEND: 'INTO'
        }
        partition = ''
        if self.static_partition:
            spec = ', '.join([f'{self.hive.quote_identifier(k)}={repr(v)}' for k, v in self.static_partition.items()])
            partition = f'PARTITION ({spec})'
        elif self.dynamic_partition:
            spec = ', '.join(self.hive.quote_identifier(p) for p in self.dynamic_partition)
            partition = f'PARTITION ({spec})'

        queries = []
        if not self._determine_using_impala():
            queries.append('SET hive.exec.dynamic.partition.mode=nonstrict')
        sql = 'INSERT {mode} TABLE {table} {partition} SELECT * FROM {staging}'.format(
            mode=insert_mode[self.mode], partition=partition,
            table=self.hive.quote_identifier(self.table),
            staging=self.hive.quote_identifier(self.staging_table))
        queries.append(sql)
        return compression_sqls + queries

    def _ingest_by_merging(self) -> List[str]:
        reconcile = self.reconciled_table
        join = ' AND '.join(
            [f'a.{self.hive.quote_identifier(x)} = b.{self.hive.quote_identifier(x)}' for x in self.primary_keys])
        sql = '''
        DROP TABLE IF EXISTS {reconcile} PURGE;
        CREATE TABLE {reconcile} STORED AS PARQUET AS
        SELECT a.* FROM {table} a LEFT OUTER JOIN {staging} b ON {join} WHERE b.{pk} IS NULL
        UNION ALL
        SELECT * FROM {staging};
        {compression_sqls};
        INSERT OVERWRITE TABLE {table} SELECT * FROM {reconcile};
        DROP TABLE IF EXISTS {reconcile} PURGE;
        '''.format(reconcile=self.hive.quote_identifier(reconcile),
                   table=self.hive.quote_identifier(self.table),
                   staging=self.hive.quote_identifier(self.staging_table),
                   compression_sqls=";".join(self._get_compression_sqls()),
                   # bak=self.hive.quote_identifier('{}_bak'.format(self.table)),
                   join=join,
                   pk=f'{self.hive.quote_identifier(self.primary_keys[0])}')
        queries = sql.split(';')
        return queries

    def _execute_merge_queries(self, queries: List[str]):
        using_impala = self._determine_using_impala()
        if using_impala:
            # staging_update_meta = f'INVALIDATE METADATA {self.impala.quote_identifier(self.staging_table)}'
            # self.impala.execute(staging_update_meta)
            self.impala.refresh(self.table, compute_stats=False)
            self.impala.execute(queries)
        else:
            if self.dynamic_partition:
                allow_dynamic_partition_queries_list = ['SET hive.exec.dynamic.partition=true',
                                                        'SET hive.exec.dynamic.partition.mode=nonstrict']
                queries = allow_dynamic_partition_queries_list + queries

            self.hive.execute(queries)

    @toolz.memoize
    def _is_low_hive_version(self):
        """
        For Hive versions below 2.3.0, parquet compression can only be changed at runtime
        via SET parquet.compression = "xx"; from 2.3.0 on it can only be specified when the
        table is created.
        """
        result = self.hive.fetchall('SELECT version()')
        self.logger.info(f"current hive's version: {result[0][0]}")
        return result[0][0] < "2.3.0"

    @toolz.memoize
    def _determine_using_impala(self) -> bool:
        if self.impala is None:
            self.logger.info('impala connector is not set')
            return False

        if self.using_impala is True:
            self.logger.info('`using_impala` is set to True by caller')
            return True

        if self.using_impala is AUTO:
            self.logger.info('`using_impala` is set to AUTO, checking complex type fields')
            if not self.hive.has_complex_type_fields(self.table):
                self.logger.info('found no complex type fields, happy to use Impala')
                return True
            self.logger.info('detected complex type fields, fallback to using Hive')
        return False

    @silent()
    def _compute_stats(self):
        self.impala.refresh(self.table, True)

    def _check_staging_table_rows(self):
        if not self.dumper_meta:
            return
        check_dumper_row: bool = self.dumper_meta.get('check_dumper_row', True)
        dumper_rows: int = self.dumper_meta.get('dumper_output_rows')
        if not (check_dumper_row and dumper_rows):
            return
        staging_table = self.impala.quote_identifier(self.staging_table)
        if self._determine_using_impala():
            # self.impala.execute(f'INVALIDATE METADATA {staging_table}')
            staging_table_cnt, = self.impala.fetchone(f'SELECT COUNT(1) AS cnt FROM {staging_table}')
        else:
            staging_table_cnt, = self.hive.fetchone(f'SELECT COUNT(1) AS cnt FROM {staging_table}')
        if staging_table_cnt != dumper_rows:
            raise ValueError(f'staging table {staging_table} cnt {staging_table_cnt} != dumper_rows {dumper_rows} '
                             'maybe something wrong when load csv to staging table, please retry')
        self.logger.info(f'staging_table {staging_table} cnt {staging_table_cnt} equals with dumper_output')