recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,130 @@
1
+ import csv
2
+ import logging
3
+ from typing import Any
4
+
5
+ from recurvedata.core.translation import _l
6
+ from recurvedata.operators.transfer_operator import const
7
+ from recurvedata.operators.transfer_operator.task import LoadTask
8
+ from recurvedata.pigeon.utils import fs
9
+
10
+ logger = logging.getLogger(__name__)
11
+ GOOGLE_SHEET_MAX_ROWS = 1000000
12
+ GOOGLE_SHEET_MAX_COLUMNS = 18278
13
+
14
+
15
class GoogleSheetLoadTask(LoadTask):
    """Load a local CSV dump into a Google Sheet via a service-account connection.

    Enforces the Google Sheets hard limits on row and column counts before
    uploading, both for the CSV itself and (in append mode) for the combined
    size of the existing sheet plus the new data.
    """

    ds_name_fields = ("google_service_account",)
    should_write_header = True
    worker_install_require = ["gspread"]

    @staticmethod
    def check_csv_content(filename: str) -> tuple[int, int]:
        """Count rows/columns of *filename* and enforce Google Sheets limits.

        Returns:
            (row_count, col_count), where col_count is the widest row seen.

        Raises:
            ValueError: if the row count, or any row's column count, exceeds
                the Google Sheets maximums.
        """
        row_count = 0
        col_count = 0
        # newline="" is required when handing a file object to csv.reader so
        # that embedded newlines inside quoted fields are parsed correctly.
        with open(filename, "r", newline="") as file:
            reader = csv.reader(file)
            for row in reader:
                row_count += 1
                # Track the widest row, not just the first one: previously a
                # ragged CSV with wide rows after the header passed unchecked.
                if len(row) > col_count:
                    col_count = len(row)
                    if col_count > GOOGLE_SHEET_MAX_COLUMNS:
                        raise ValueError(
                            f"CSV file contains {col_count} columns, which exceeds the maximum allowed "
                            f"{GOOGLE_SHEET_MAX_COLUMNS} columns in Google Sheets."
                        )
                if row_count > GOOGLE_SHEET_MAX_ROWS:
                    raise ValueError(
                        f"CSV file contains {row_count} rows, which exceeds the maximum allowed "
                        f"{GOOGLE_SHEET_MAX_ROWS} rows in Google Sheets."
                    )
        return row_count, col_count

    def execute_impl(self, *args: Any, **kwargs: Any) -> None:
        """Validate the dump file, then load it into the configured sheet."""
        import pandas as pd

        if fs.is_file_empty(self.filename):
            logger.warning("File %s does not exist or has no content, skipping.", self.filename)
            return

        ds = self.must_get_connection_by_name(self.config["google_service_account"])
        service_account = ds.recurve_connector
        _, sheet_id = service_account.parse_sheet_url(self.config["file_url"])
        sheet = service_account.get_sheet(self.config["file_url"], sheet_id)

        logger.info(f'Loading to {self.config["file_url"]}, gid {sheet_id}')

        # Perform all necessary checks
        csv_row_count, csv_col_count = self.check_csv_content(self.filename)
        current_sheet_rows, current_sheet_cols = sheet.row_count, sheet.col_count

        if self.config["mode"] == const.LOAD_APPEND:
            # In append mode the limits apply to existing + new data combined.
            csv_row_count += current_sheet_rows
            csv_col_count = max(current_sheet_cols, csv_col_count)

        if csv_row_count > GOOGLE_SHEET_MAX_ROWS:
            raise ValueError(
                f"Appending the CSV file will exceed the maximum allowed {GOOGLE_SHEET_MAX_ROWS} rows in Google Sheets."
            )
        if csv_col_count > GOOGLE_SHEET_MAX_COLUMNS:
            raise ValueError(
                f"Appending the CSV file will exceed the maximum allowed {GOOGLE_SHEET_MAX_COLUMNS} columns in Google Sheets."
            )

        # Load the CSV file into a DataFrame after checking the row count.
        # keep_default_na=False keeps empty cells as "" instead of NaN.
        df = pd.read_csv(self.filename, keep_default_na=False)
        df.fillna("", inplace=True)

        try:
            service_account.load_df_to_sheet(df, sheet, self.config["mode"], value_input_option="USER_ENTERED")
            logger.info(
                f'Data loaded successfully into {self.config["file_url"]}, mode: {self.config["mode"]}, '
                f"rows: {csv_row_count}, cols: {csv_col_count}"
            )
        except Exception as e:
            logger.error(f'Failed to load data into {self.config["file_url"]}: {e}')
            raise

    @classmethod
    def config_schema(cls) -> dict[str, Any]:
        """Return the JSON schema describing this task's configuration form."""
        schema = {
            "type": "object",
            "properties": {
                "google_service_account": {
                    "type": "string",
                    "title": _l("Google Service Account Connection"),
                    "description": _l(
                        "Select the Google Service Account connection with write permissions to the target spreadsheet"
                    ),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": ["google_service_account"],
                    },
                },
                "file_url": {
                    "type": "string",
                    "title": _l("Google Sheet URL"),
                    "description": _l(
                        "URL of the target Google Sheet in format: "
                        "https://docs.google.com/spreadsheets/d/{Spreadsheet ID}/edit#gid={Sheet GID}. "
                        "If no sheet GID is specified, the first sheet will be used."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "mode": {
                    "type": "string",
                    "title": _l("Import Mode"),
                    "enum": [const.LOAD_OVERWRITE, const.LOAD_APPEND],
                    "enumNames": [const.LOAD_OVERWRITE, const.LOAD_APPEND],
                    "default": const.LOAD_OVERWRITE,
                    "description": _l(
                        "OVERWRITE: Replace existing data with new data. " "APPEND: Add new data after existing data."
                    ),
                },
            },
            "required": ["google_service_account", "file_url", "mode"],
        }
        return schema
@@ -0,0 +1,158 @@
1
+ import copy
2
+ import glob
3
+ import json
4
+ import os
5
+
6
+ try:
7
+ from recurvedata.pigeon.loader.csv_to_hive import CSVToHiveLoader
8
+ from recurvedata.pigeon.utils import fs
9
+ except ImportError:
10
+ pass
11
+
12
+ from recurvedata.core.translation import _l
13
+ from recurvedata.operators.transfer_operator import utils
14
+ from recurvedata.operators.transfer_operator.task import LoadTask
15
+
16
+
17
class HiveLoadTask(LoadTask):
    """Load a dumped CSV file (or its unmerged chunk files) into a Hive table."""

    ds_name_fields = ("hive_data_source_name",)
    ds_types = ("hive",)
    default_dumper_handler_options = {
        "hive": True,
        "merge_files": False,  # do not merge intermediate files, pass in file pattern
    }
    worker_install_require = ["pigeon[hive_impala]"]

    def execute_impl(self, *args, **kwargs):
        """Assemble CSVToHiveLoader options from the rendered config and run it."""
        hive_ds = self.must_get_connection_by_name(self.config["hive_data_source_name"])

        options = self.rendered_config.copy()
        for key in ("hive_data_source_name", "impala_data_source_name"):
            options.pop(key, None)

        raw_partition = options.pop("partition", None)
        if raw_partition:
            # The partition spec arrives as a JSON string from the form.
            options["partition"] = json.loads(raw_partition)

        chunk_files = glob.glob(f"{self.filename}.[0-9]*")
        if os.path.exists(self.filename) and not chunk_files:
            # The dumper merged everything into a single file.
            target = self.filename
        else:
            # Unmerged dump: hand the loader the list of chunk files. An empty
            # upstream dump produces no usable chunks, so fall back to
            # [self.filename] to guarantee the list is never empty.
            if all(fs.is_file_empty(path) for path in chunk_files):
                chunk_files = [self.filename]
            target = chunk_files

        options["filename"] = target
        options["hive_connector"] = hive_ds.connector
        options["delete_file"] = True

        impala_ds = self.get_connection_by_name(self.config["impala_data_source_name"])
        if impala_ds:
            # Optional Impala connector speeds up post-load metadata refresh.
            options["impala_connector"] = impala_ds.connector

        return CSVToHiveLoader(**options).execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON schema for the Hive load form, plus shared load fields."""
        properties = {
            "hive_data_source_name": {
                "type": "string",
                "title": _l("Hive Connection"),
                "ui:field": "ProjectConnectionSelectorField",
                "ui:options": {"supportTypes": cls.ds_types},
            },
            "impala_data_source_name": {
                "type": "string",
                "title": _l("Impala Connection"),
                "description": _l("Optional Impala connection for faster data loading"),
                "ui:field": "ProjectConnectionSelectorField",
                "ui:options": {"supportTypes": ["impala"]},
            },
            "database": {
                "type": "string",
                "title": _l("Database Name"),
                "description": _l("Name of the Hive database to load data into. Supports template variables."),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "table": {
                "type": "string",
                "title": _l("Table Name"),
                "description": _l("Name of the Hive table to load data into. Supports template variables."),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "create_table_ddl": {
                "type": "string",
                "title": _l("Table Creation SQL"),
                "description": _l(
                    "SQL statement to create the table if it doesn't exist. "
                    "PARQUET storage format is recommended for better performance. "
                ),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "code", "lang": "sql", "sqlLang": "hive"},
            },
            "partition": {
                "type": "string",
                "title": _l("Partition Specification"),
                "description": _l(
                    "JSON object specifying the partition to load data into. "
                    "For T+1 tasks, use {'dt': '{{ yesterday_dt }}'} to load yesterday's partition. "
                    "Supports template variables."
                ),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "compression_codec": {
                "type": "string",
                "title": _l("Compression Method"),
                "enum": ["snappy", "none", "gzip"],
                "enumNames": ["snappy", "none", "gzip"],
                "description": _l(
                    "Data compression format. 'none' for no compression, 'gzip' for maximum compression, "
                    "'snappy' for balanced compression/performance."
                ),
                "default": "snappy",
            },
        }
        # Shared load options are merged in via a deep copy so per-form
        # mutation cannot leak back into the shared template.
        properties.update(copy.deepcopy(utils.LOAD_COMMON))
        return {
            "type": "object",
            "properties": properties,
            # NOTE: frontend uses vue-json-schema, which has a bug where enum fields must be required...
            "required": [
                "hive_data_source_name",
                "impala_data_source_name",
                "database",
                "table",
                "mode",
                "compression_codec",
            ],
        }
@@ -0,0 +1,105 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ from recurvedata.core.translation import _l
5
+ from recurvedata.operators.transfer_operator import const
6
+ from recurvedata.operators.transfer_operator.task import LoadTask
7
+ from recurvedata.operators.transfer_operator.utils import allowed_modes
8
+
9
+ try:
10
+ from recurvedata.pigeon.loader.csv_to_microsoft_fabric import CSVToMsFabricLoader
11
+ except ImportError:
12
+ pass
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class MicrosoftFabricLoadTask(LoadTask):
18
+ ds_name_fields = ("data_source_name",)
19
+ ds_types = ("microsoft_fabric",)
20
+ worker_install_require = ["pigeon[azure]"]
21
+
22
+ def execute_impl(self, *args: Any, **kwargs: Any) -> None:
23
+ ds = self.must_get_connection_by_name(self.config["data_source_name"])
24
+ load_options: dict[str, Any] = self.rendered_config.copy()
25
+ for k in ["data_source_name"]:
26
+ load_options.pop(k, None)
27
+ columns = load_options.get("columns", "")
28
+ columns = [x.strip() for x in columns.split(",")] if columns.strip(" ,") else []
29
+ load_options["lineterminator"] = "\r\n" if self.dump_task_type == "PythonDumpTask" else "0x0D0A"
30
+ load_options.update(
31
+ {
32
+ "filename": self.filename,
33
+ "connector": ds.connector,
34
+ "delete_file": True,
35
+ "using_insert": False,
36
+ "columns": columns,
37
+ "database": ds.database,
38
+ "schema": ds.data.get("schema"),
39
+ "compress": True, # Enable compression for better performance
40
+ "blob_options": ds.data.get("blob_options", {}),
41
+ }
42
+ )
43
+ logger.info(load_options)
44
+ loader = CSVToMsFabricLoader(**load_options)
45
+ return loader.execute()
46
+
47
+ @classmethod
48
+ def config_schema(cls):
49
+ schema = {
50
+ "type": "object",
51
+ "properties": {
52
+ "data_source_name": {
53
+ "type": "string",
54
+ "title": _l("Microsoft Fabric Connection"),
55
+ "description": _l("The Microsoft Fabric data source to load data into"),
56
+ "ui:field": "ProjectConnectionSelectorField",
57
+ "ui:options": {
58
+ "supportTypes": cls.ds_types,
59
+ },
60
+ },
61
+ "table": {
62
+ "type": "string",
63
+ "title": _l("Target Table"),
64
+ "description": _l("Name of the table to load data into"),
65
+ "ui:field": "CodeEditorWithReferencesField",
66
+ "ui:options": {
67
+ "type": "plain",
68
+ },
69
+ },
70
+ "create_table_ddl": {
71
+ "type": "string",
72
+ "title": _l("Table Creation SQL"),
73
+ "description": _l("SQL statement to create the target table if it doesn't exist"),
74
+ "ui:field": "CodeEditorWithReferencesField",
75
+ "ui:options": {
76
+ "type": "code",
77
+ "lang": "sql",
78
+ "sqlLang": "sql",
79
+ },
80
+ },
81
+ "mode": {
82
+ "type": "string",
83
+ "title": _l("Load Mode"),
84
+ "description": _l("How to handle existing data in the target table"),
85
+ "enum": list(allowed_modes),
86
+ "enumNames": list(allowed_modes),
87
+ "default": const.LOAD_OVERWRITE,
88
+ },
89
+ "primary_keys": {
90
+ "ui:hidden": '{{parentFormData.mode !== "MERGE"}}',
91
+ "type": "string",
92
+ "title": _l("Primary Keys"),
93
+ "description": _l(
94
+ "Comma-separated list of columns used for deduplication in MERGE mode. "
95
+ "Should be primary or unique key columns."
96
+ ),
97
+ "ui:field": "CodeEditorWithReferencesField",
98
+ "ui:options": {
99
+ "type": "plain",
100
+ },
101
+ },
102
+ },
103
+ "required": ["data_source_name", "table"],
104
+ }
105
+ return schema
@@ -0,0 +1,153 @@
1
+ import logging
2
+
3
+ try:
4
+ from recurvedata.pigeon.loader.csv_to_mssql import CSVToMsSQLLoader
5
+ except ImportError:
6
+ pass
7
+
8
+ from recurvedata.core.translation import _l
9
+ from recurvedata.operators.transfer_operator import const
10
+ from recurvedata.operators.transfer_operator.task import LoadTask
11
+ from recurvedata.operators.transfer_operator.utils import allowed_modes
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class MsSQLLoadTask(LoadTask):
    """Load a CSV file into a Microsoft SQL Server (or Azure SQL) table.

    Delegates the actual data transfer to ``CSVToMsSQLLoader``; this task only
    resolves the connection and translates its rendered config into loader
    options.
    """

    # Config field(s) that name the connection this task uses.
    ds_name_fields = ("data_source_name",)
    # Connection types accepted by the connection selector.
    ds_types = ("mssql", "azure_mssql")
    worker_install_require = ["pigeon[azure]"]

    def execute_impl(self, *args, **kwargs):
        """Build loader options from the rendered config and run the load.

        Returns:
            The result of ``CSVToMsSQLLoader.execute()``.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        load_options = self.rendered_config.copy()
        # The connection name identifies the data source; it is not a loader option.
        load_options.pop("data_source_name", None)
        # `or ""` guards against an explicit None value — .get()'s default only
        # applies when the key is missing entirely.
        load_options["columns"] = self._parse_columns(load_options.get("columns") or "")
        load_options.update(
            {
                "filename": self.filename,
                "connector": ds.connector,
                "delete_file": True,
                # Auto-detect on the loader side: prefer bulk file loading over row INSERTs.
                "using_insert": False,
                "database": ds.database,
                "schema": ds.data.get("schema"),
            }
        )
        logger.info("CSVToMsSQLLoader options: %s", load_options)
        loader = CSVToMsSQLLoader(**load_options)
        return loader.execute()

    @staticmethod
    def _parse_columns(columns):
        """Split a comma-separated column string, dropping blank entries.

        Handles stray whitespace and empty segments (e.g. ``"a, ,b,"``),
        returning ``[]`` when no real column names are present.
        """
        return [name.strip() for name in columns.split(",") if name.strip()]

    @classmethod
    def config_schema(cls):
        """Return the JSON schema that drives the task-configuration UI."""
        schema = {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("MSSQL Connection"),
                    "description": _l("The MSSQL data source to load data into"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": cls.ds_types,
                    },
                },
                "table": {
                    "type": "string",
                    "title": _l("Target Table"),
                    "description": _l("Name of the table to load data into"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "create_table_ddl": {
                    "type": "string",
                    "title": _l("Table Creation SQL"),
                    "description": _l("SQL statement to create the target table if it doesn't exist"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "sql",
                        "sqlLang": "sql",
                    },
                },
                "mode": {
                    "type": "string",
                    "title": _l("Load Mode"),
                    "description": _l("How to handle existing data in the target table"),
                    "enum": list(allowed_modes),
                    "enumNames": list(allowed_modes),
                    "default": const.LOAD_OVERWRITE,
                },
                "primary_keys": {
                    # Only meaningful when deduplicating, so hide unless mode is MERGE.
                    "ui:hidden": '{{parentFormData.mode !== "MERGE"}}',
                    "type": "string",
                    "title": _l("Primary Keys"),
                    "description": _l(
                        "Comma-separated list of columns used for deduplication in MERGE mode. "
                        "Should be primary or unique key columns."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
            },
            "required": ["data_source_name", "table"],
        }
        return schema