PyPI - recurvedata-lib - Versions diffs - 0.1.487__py2.py3-none-any.whl - Mend

recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show

recurvedata/__init__.py +0 -0
recurvedata/__version__.py +1 -0
recurvedata/client/__init__.py +3 -0
recurvedata/client/client.py +150 -0
recurvedata/client/server_client.py +91 -0
recurvedata/config.py +99 -0
recurvedata/connectors/__init__.py +20 -0
recurvedata/connectors/_register.py +46 -0
recurvedata/connectors/base.py +111 -0
recurvedata/connectors/config_schema.py +1575 -0
recurvedata/connectors/connectors/__init__.py +0 -0
recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
recurvedata/connectors/connectors/auth.py +44 -0
recurvedata/connectors/connectors/azure_blob.py +89 -0
recurvedata/connectors/connectors/azure_synapse.py +79 -0
recurvedata/connectors/connectors/bigquery.py +359 -0
recurvedata/connectors/connectors/clickhouse.py +219 -0
recurvedata/connectors/connectors/dingtalk.py +61 -0
recurvedata/connectors/connectors/doris.py +215 -0
recurvedata/connectors/connectors/es.py +62 -0
recurvedata/connectors/connectors/feishu.py +65 -0
recurvedata/connectors/connectors/ftp.py +50 -0
recurvedata/connectors/connectors/generic.py +49 -0
recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
recurvedata/connectors/connectors/google_service_account.py +225 -0
recurvedata/connectors/connectors/hive.py +207 -0
recurvedata/connectors/connectors/impala.py +210 -0
recurvedata/connectors/connectors/jenkins.py +51 -0
recurvedata/connectors/connectors/mail.py +89 -0
recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
recurvedata/connectors/connectors/mongo.py +79 -0
recurvedata/connectors/connectors/mssql.py +131 -0
recurvedata/connectors/connectors/mysql.py +191 -0
recurvedata/connectors/connectors/n8n.py +141 -0
recurvedata/connectors/connectors/oss.py +74 -0
recurvedata/connectors/connectors/owncloud.py +36 -0
recurvedata/connectors/connectors/phoenix.py +36 -0
recurvedata/connectors/connectors/postgres.py +230 -0
recurvedata/connectors/connectors/python.py +50 -0
recurvedata/connectors/connectors/redshift.py +187 -0
recurvedata/connectors/connectors/s3.py +93 -0
recurvedata/connectors/connectors/sftp.py +87 -0
recurvedata/connectors/connectors/slack.py +35 -0
recurvedata/connectors/connectors/spark.py +99 -0
recurvedata/connectors/connectors/starrocks.py +175 -0
recurvedata/connectors/connectors/tencent_cos.py +40 -0
recurvedata/connectors/connectors/tidb.py +49 -0
recurvedata/connectors/const.py +315 -0
recurvedata/connectors/datasource.py +189 -0
recurvedata/connectors/dbapi.py +469 -0
recurvedata/connectors/fs.py +66 -0
recurvedata/connectors/ftp.py +40 -0
recurvedata/connectors/object_store.py +60 -0
recurvedata/connectors/pigeon.py +172 -0
recurvedata/connectors/proxy.py +104 -0
recurvedata/connectors/service.py +223 -0
recurvedata/connectors/utils.py +47 -0
recurvedata/consts.py +49 -0
recurvedata/core/__init__.py +0 -0
recurvedata/core/config.py +46 -0
recurvedata/core/configurable.py +27 -0
recurvedata/core/consts.py +2 -0
recurvedata/core/templating.py +206 -0
recurvedata/core/tracing.py +223 -0
recurvedata/core/transformer.py +186 -0
recurvedata/core/translation.py +91 -0
recurvedata/dbt/client.py +97 -0
recurvedata/dbt/consts.py +99 -0
recurvedata/dbt/cosmos_utils.py +275 -0
recurvedata/dbt/error_codes.py +18 -0
recurvedata/dbt/schemas.py +98 -0
recurvedata/dbt/service.py +451 -0
recurvedata/dbt/utils.py +246 -0
recurvedata/error_codes.py +71 -0
recurvedata/exceptions.py +72 -0
recurvedata/executors/__init__.py +4 -0
recurvedata/executors/cli/__init__.py +7 -0
recurvedata/executors/cli/connector.py +117 -0
recurvedata/executors/cli/dbt.py +118 -0
recurvedata/executors/cli/main.py +82 -0
recurvedata/executors/cli/parameters.py +18 -0
recurvedata/executors/client.py +190 -0
recurvedata/executors/consts.py +50 -0
recurvedata/executors/debug_executor.py +100 -0
recurvedata/executors/executor.py +300 -0
recurvedata/executors/link_executor.py +189 -0
recurvedata/executors/models.py +34 -0
recurvedata/executors/schemas.py +222 -0
recurvedata/executors/service/__init__.py +0 -0
recurvedata/executors/service/connector.py +380 -0
recurvedata/executors/utils.py +172 -0
recurvedata/filestorage/__init__.py +11 -0
recurvedata/filestorage/_factory.py +33 -0
recurvedata/filestorage/backends/__init__.py +0 -0
recurvedata/filestorage/backends/fsspec.py +45 -0
recurvedata/filestorage/backends/local.py +67 -0
recurvedata/filestorage/backends/oss.py +56 -0
recurvedata/filestorage/interface.py +84 -0
recurvedata/operators/__init__.py +10 -0
recurvedata/operators/base.py +28 -0
recurvedata/operators/config.py +21 -0
recurvedata/operators/context.py +255 -0
recurvedata/operators/dbt_operator/__init__.py +2 -0
recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
recurvedata/operators/dbt_operator/operator.py +353 -0
recurvedata/operators/link_operator/__init__.py +1 -0
recurvedata/operators/link_operator/operator.py +120 -0
recurvedata/operators/models.py +55 -0
recurvedata/operators/notify_operator/__init__.py +1 -0
recurvedata/operators/notify_operator/operator.py +180 -0
recurvedata/operators/operator.py +119 -0
recurvedata/operators/python_operator/__init__.py +1 -0
recurvedata/operators/python_operator/operator.py +132 -0
recurvedata/operators/sensor_operator/__init__.py +1 -0
recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
recurvedata/operators/sensor_operator/operator.py +172 -0
recurvedata/operators/spark_operator/__init__.py +1 -0
recurvedata/operators/spark_operator/operator.py +200 -0
recurvedata/operators/spark_operator/spark_sample.py +47 -0
recurvedata/operators/sql_operator/__init__.py +1 -0
recurvedata/operators/sql_operator/operator.py +90 -0
recurvedata/operators/task.py +211 -0
recurvedata/operators/transfer_operator/__init__.py +40 -0
recurvedata/operators/transfer_operator/const.py +10 -0
recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
recurvedata/operators/transfer_operator/load_task_email.py +188 -0
recurvedata/operators/transfer_operator/load_task_es.py +86 -0
recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
recurvedata/operators/transfer_operator/mixin.py +31 -0
recurvedata/operators/transfer_operator/operator.py +231 -0
recurvedata/operators/transfer_operator/task.py +223 -0
recurvedata/operators/transfer_operator/utils.py +134 -0
recurvedata/operators/ui.py +80 -0
recurvedata/operators/utils/__init__.py +51 -0
recurvedata/operators/utils/file_factory.py +150 -0
recurvedata/operators/utils/fs.py +10 -0
recurvedata/operators/utils/lineage.py +265 -0
recurvedata/operators/web_init.py +15 -0
recurvedata/pigeon/connector/__init__.py +294 -0
recurvedata/pigeon/connector/_registry.py +17 -0
recurvedata/pigeon/connector/aliyun_oss.py +80 -0
recurvedata/pigeon/connector/awss3.py +123 -0
recurvedata/pigeon/connector/azure_blob.py +176 -0
recurvedata/pigeon/connector/azure_synapse.py +51 -0
recurvedata/pigeon/connector/cass.py +151 -0
recurvedata/pigeon/connector/clickhouse.py +403 -0
recurvedata/pigeon/connector/clickhouse_native.py +351 -0
recurvedata/pigeon/connector/dbapi.py +571 -0
recurvedata/pigeon/connector/doris.py +166 -0
recurvedata/pigeon/connector/es.py +176 -0
recurvedata/pigeon/connector/feishu.py +1135 -0
recurvedata/pigeon/connector/ftp.py +163 -0
recurvedata/pigeon/connector/google_bigquery.py +283 -0
recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
recurvedata/pigeon/connector/hdfs.py +204 -0
recurvedata/pigeon/connector/hive_impala.py +383 -0
recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
recurvedata/pigeon/connector/mongodb.py +56 -0
recurvedata/pigeon/connector/mssql.py +467 -0
recurvedata/pigeon/connector/mysql.py +175 -0
recurvedata/pigeon/connector/owncloud.py +92 -0
recurvedata/pigeon/connector/postgresql.py +267 -0
recurvedata/pigeon/connector/power_bi.py +179 -0
recurvedata/pigeon/connector/qcloud_cos.py +79 -0
recurvedata/pigeon/connector/redshift.py +123 -0
recurvedata/pigeon/connector/sftp.py +73 -0
recurvedata/pigeon/connector/sqlite.py +42 -0
recurvedata/pigeon/connector/starrocks.py +144 -0
recurvedata/pigeon/connector/tableau.py +162 -0
recurvedata/pigeon/const.py +21 -0
recurvedata/pigeon/csv.py +172 -0
recurvedata/pigeon/docs/datasources-example.json +82 -0
recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
recurvedata/pigeon/dumper/__init__.py +171 -0
recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
recurvedata/pigeon/dumper/base.py +141 -0
recurvedata/pigeon/dumper/cass.py +213 -0
recurvedata/pigeon/dumper/dbapi.py +346 -0
recurvedata/pigeon/dumper/es.py +112 -0
recurvedata/pigeon/dumper/ftp.py +64 -0
recurvedata/pigeon/dumper/mongodb.py +103 -0
recurvedata/pigeon/handler/__init__.py +4 -0
recurvedata/pigeon/handler/base.py +153 -0
recurvedata/pigeon/handler/csv_handler.py +290 -0
recurvedata/pigeon/loader/__init__.py +87 -0
recurvedata/pigeon/loader/base.py +83 -0
recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
recurvedata/pigeon/loader/csv_to_doris.py +215 -0
recurvedata/pigeon/loader/csv_to_es.py +51 -0
recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
recurvedata/pigeon/loader/csv_to_hive.py +468 -0
recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
recurvedata/pigeon/meta.py +116 -0
recurvedata/pigeon/row_factory.py +42 -0
recurvedata/pigeon/schema/__init__.py +124 -0
recurvedata/pigeon/schema/types.py +13 -0
recurvedata/pigeon/sync.py +283 -0
recurvedata/pigeon/transformer.py +146 -0
recurvedata/pigeon/utils/__init__.py +134 -0
recurvedata/pigeon/utils/bloomfilter.py +181 -0
recurvedata/pigeon/utils/date_time.py +323 -0
recurvedata/pigeon/utils/escape.py +15 -0
recurvedata/pigeon/utils/fs.py +266 -0
recurvedata/pigeon/utils/json.py +44 -0
recurvedata/pigeon/utils/keyed_tuple.py +85 -0
recurvedata/pigeon/utils/mp.py +156 -0
recurvedata/pigeon/utils/sql.py +328 -0
recurvedata/pigeon/utils/timing.py +155 -0
recurvedata/provider_manager.py +0 -0
recurvedata/providers/__init__.py +0 -0
recurvedata/providers/dbapi/__init__.py +0 -0
recurvedata/providers/flywheel/__init__.py +0 -0
recurvedata/providers/mysql/__init__.py +0 -0
recurvedata/schedulers/__init__.py +1 -0
recurvedata/schedulers/airflow.py +974 -0
recurvedata/schedulers/airflow_db_process.py +331 -0
recurvedata/schedulers/airflow_operators.py +61 -0
recurvedata/schedulers/airflow_plugin.py +9 -0
recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
recurvedata/schedulers/base.py +99 -0
recurvedata/schedulers/cli.py +228 -0
recurvedata/schedulers/client.py +56 -0
recurvedata/schedulers/consts.py +52 -0
recurvedata/schedulers/debug_celery.py +62 -0
recurvedata/schedulers/model.py +63 -0
recurvedata/schedulers/schemas.py +97 -0
recurvedata/schedulers/service.py +20 -0
recurvedata/schedulers/system_dags.py +59 -0
recurvedata/schedulers/task_status.py +279 -0
recurvedata/schedulers/utils.py +73 -0
recurvedata/schema/__init__.py +0 -0
recurvedata/schema/field.py +88 -0
recurvedata/schema/schema.py +55 -0
recurvedata/schema/types.py +17 -0
recurvedata/schema.py +0 -0
recurvedata/server/__init__.py +0 -0
recurvedata/server/app.py +7 -0
recurvedata/server/connector/__init__.py +0 -0
recurvedata/server/connector/api.py +79 -0
recurvedata/server/connector/schemas.py +28 -0
recurvedata/server/data_service/__init__.py +0 -0
recurvedata/server/data_service/api.py +126 -0
recurvedata/server/data_service/client.py +18 -0
recurvedata/server/data_service/consts.py +1 -0
recurvedata/server/data_service/schemas.py +68 -0
recurvedata/server/data_service/service.py +218 -0
recurvedata/server/dbt/__init__.py +0 -0
recurvedata/server/dbt/api.py +116 -0
recurvedata/server/error_code.py +49 -0
recurvedata/server/exceptions.py +19 -0
recurvedata/server/executor/__init__.py +0 -0
recurvedata/server/executor/api.py +37 -0
recurvedata/server/executor/schemas.py +30 -0
recurvedata/server/executor/service.py +220 -0
recurvedata/server/main.py +32 -0
recurvedata/server/schedulers/__init__.py +0 -0
recurvedata/server/schedulers/api.py +252 -0
recurvedata/server/schedulers/schemas.py +50 -0
recurvedata/server/schemas.py +50 -0
recurvedata/utils/__init__.py +15 -0
recurvedata/utils/_typer.py +61 -0
recurvedata/utils/attrdict.py +19 -0
recurvedata/utils/command_helper.py +20 -0
recurvedata/utils/compat.py +12 -0
recurvedata/utils/compression.py +203 -0
recurvedata/utils/crontab.py +42 -0
recurvedata/utils/crypto_util.py +305 -0
recurvedata/utils/dataclass.py +11 -0
recurvedata/utils/date_time.py +464 -0
recurvedata/utils/dispatch.py +114 -0
recurvedata/utils/email_util.py +104 -0
recurvedata/utils/files.py +386 -0
recurvedata/utils/helpers.py +170 -0
recurvedata/utils/httputil.py +117 -0
recurvedata/utils/imports.py +132 -0
recurvedata/utils/json.py +80 -0
recurvedata/utils/log.py +117 -0
recurvedata/utils/log_capture.py +153 -0
recurvedata/utils/mp.py +178 -0
recurvedata/utils/normalizer.py +102 -0
recurvedata/utils/redis_lock.py +474 -0
recurvedata/utils/registry.py +54 -0
recurvedata/utils/shell.py +15 -0
recurvedata/utils/singleton.py +33 -0
recurvedata/utils/sql.py +6 -0
recurvedata/utils/timeout.py +28 -0
recurvedata/utils/tracing.py +14 -0
recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0

recurvedata/operators/transfer_operator/dump_task_cass.py ADDED Viewed

@@ -0,0 +1,155 @@
+import copy
+try:
+    import arrow
+    from recurvedata.pigeon.dumper.cass import CassandraDumper
+    from recurvedata.pigeon.row_factory import ordered_dict_factory
+except ImportError:
+    pass
+from recurvedata.operators.transfer_operator import utils
+from recurvedata.operators.transfer_operator.task import DumpTask
+from recurvedata.utils import extract_dict
+class CassandraDumpTask(DumpTask):
+    enabled = False
+    worker_install_require = ["pigeon[cassandra]"]
+    ds_name_fields = ("data_source_name",)
+    def determine_partitions(self):
+        if self.config.partitions:
+            return self.config.partitions
+        if not self.config.incremental_by_time:
+            return None
+        if self.dag.is_once:
+            return None
+        start_date, end_date = self.get_schedule_time_range()
+        partitions = arrow.Arrow.range(self.config.time_granularity, start_date, end_date)
+        partitions = [x.datetime for x in partitions[:-1]]
+        return partitions
+    def execute_impl(self, *args, **kwargs):
+        ds = self.must_get_connection_by_name(self.config["data_source_name"])
+        hf = self.create_handler_factory()
+        dump_options = extract_dict(
+            self.rendered_config, keys=["table", "columns", "where", "partition_column", "concurrency"]
+        )
+        dump_options.update({"connector": ds.connector, "handler_factories": [hf]})
+        partitions = self.determine_partitions()
+        if partitions:
+            dump_options.update({"partitions": partitions})
+        dumper = CassandraDumper(**dump_options)
+        # if self.has_custom_transformer():
+        dumper.row_factory = ordered_dict_factory
+        return dumper.execute()
+    @classmethod
+    def config_schema(cls):
+        # dss = cls.get_connection_names_by_type('cassandra')
+        return {
+            "type": "object",
+            "properties": {
+                "data_source_name": {
+                    "type": "string",
+                    "title": "Data Source",
+                    "ui:field": "ProjectConnectionSelectorField",
+                    "ui:options": {
+                        "supportTypes": [
+                            "cassandra",
+                        ],
+                    },
+                    # 'default': cls.first_or_default(dss, ''),
+                },
+                "table": {
+                    "type": "string",
+                    "title": "Table Name",
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "columns": {
+                    "type": "string",
+                    "title": "Columns",
+                    "description": "要导出的列，用 `,` 分隔；默认导出所有列（*）",
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "where": {
+                    "type": "string",
+                    "title": "Where Clause",
+                    "description": "Where 条件",
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "partition_column": {
+                    "type": "string",
+                    "title": "Partition Column",
+                    "description": "分区键，通常名为 date",
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "partitions": {
+                    "type": "string",
+                    "title": "Partitions",
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "concurrency": {
+                    "type": "number",
+                    "ui:options": {"controls": False},
+                    "title": "Concurrency",
+                    "default": 1,
+                    "description": "并发数，1~20",
+                    "minimum": 1,
+                    "maximum": 20,
+                },
+                "transform": copy.deepcopy(utils.TRANSFORM),
+                "incremental_by_time": {
+                    "type": "boolean",
+                    "title": "Incremental By Time",
+                    "default": False,
+                    "description": "是否按时间进行增量同步，这个时间必须是分区键",
+                    "ui:widget": "BaseCheckbox",
+                    "ui:options": {
+                        "label": "Incremental By Time",
+                    },
+                },
+                "time_granularity": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "string",
+                    "title": "Time Granularity",
+                    "default": "day",
+                    "description": "分区键的时间粒度，用于生成分区值",
+                    "enum": ["day", "hour"],
+                    "enumNames": ["day", "hour"],
+                },
+                "time_auto_round": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "boolean",
+                    "title": "Round Time Resolution",
+                    "default": True,
+                    "description": "是否把数据时间范围 round 到合适的粒度。比如每天 01:23 同步上一个自然日的数据，"
+                    "则运行时间是 01:23，数据范围是 [T-1 00:00, T 00:00)；否则数据范围是 [T-1 01:23, T 01:23)。"
+                    "开启后，每天运行的任务，数据范围会 round 到 0 点，即自然日；"
+                    "每周运行的任务，会 round 到周一 0 点；"
+                    "每月运行的任务，会 round 到每月 1 日 0 点",
+                },
+            },
+            # NOTE：前端用的 vue-json-schema 有 bug，enum 字段必须被 required...
+            "required": ["data_source_name", "table", "time_granularity"],
+            # 处理表单联动，只有 incremental_by_time 为 True 时，才需要显示其他两个输入框
+        }

recurvedata/operators/transfer_operator/dump_task_dbapi.py ADDED Viewed

@@ -0,0 +1,209 @@
+import copy
+import jsonschema
+from recurvedata.connectors.service import list_sql_operator_types
+from recurvedata.core.translation import _l
+from recurvedata.operators.transfer_operator import utils
+from recurvedata.operators.transfer_operator.task import DumpTask
+from recurvedata.pigeon.dumper.dbapi import DBAPIDumper
+from recurvedata.pigeon.row_factory import ordered_dict_factory
+from recurvedata.pigeon.utils.sql import apply_where_safely
+from recurvedata.utils import date_time, extract_dict
+class DBAPIDumpTask(DumpTask):
+    ds_name_fields = ("data_source_name",)
+    worker_install_require = ["pigeon"]
+    @property
+    def time_column_tz(self):
+        return self.config.get("time_column_tz", "UTC")
+    def determine_time_range(self):
+        start_date, end_date = self.get_schedule_time_range()
+        if self.config.time_column_type == "date":
+            return start_date.date(), end_date.date()
+        # convert timezone
+        start_date = date_time.astimezone(start_date, tz=self.time_column_tz)
+        end_date = date_time.astimezone(end_date, tz=self.time_column_tz)
+        return start_date.replace(tzinfo=None), end_date.replace(tzinfo=None)
+    def derive_sql_query(self, connector, base_query: str):
+        base_query = base_query.strip().rstrip(";")
+        comment = self.get_query_comment_conf()
+        if not self.config.incremental_by_time or self.dag.is_once:
+            annotated_query = connector.add_leading_comment(base_query, comment)
+            return annotated_query
+        if not base_query:
+            base_query = f"SELECT * FROM {connector.quote_identifier(self.config.table)}"
+        annotated_query = connector.add_leading_comment(base_query, comment)
+        start, end = self.determine_time_range()
+        col = connector.quote_identifier(self.config.time_column)
+        if connector.is_phoenix():
+            where = f"{col} >= TIMESTAMP '{start}' AND {col} < TIMESTAMP '{end}'"
+        else:
+            where = f"{col} >= '{start}' AND {col} < '{end}'"
+        return apply_where_safely(annotated_query, where)
+    def execute_impl(self, *args, **kwargs):
+        ds = self.must_get_connection_by_name(self.config["data_source_name"])
+        hf = self.create_handler_factory()
+        dump_options = extract_dict(self.rendered_config, keys=["table", "splitby", "splits", "concurrency"])
+        dump_options.update(
+            {
+                "connector": ds.connector,
+                "sql": self.derive_sql_query(ds.connector, self.rendered_config.get("sql", "")),
+                "handler_factories": [hf],
+            }
+        )
+        if not dump_options.get("splitby"):
+            dump_options["splits"] = dump_options["concurrency"] = 1
+        dumper = DBAPIDumper(**dump_options)
+        dumper.row_factory = ordered_dict_factory
+        return dumper.execute()
+    @classmethod
+    def validate(cls, configuration):
+        conf = super().validate(configuration)
+        if not (conf.get("table") or conf.get("sql")):
+            raise jsonschema.ValidationError(message="either table or sql is required", path=("table", "sql"))
+        return conf
+    @classmethod
+    def config_schema(cls):
+        return {
+            "type": "object",
+            "properties": {
+                "data_source_name": {
+                    "type": "string",
+                    "title": _l("Data Source"),
+                    "description": _l("Database connection to extract data from"),
+                    "ui:field": "ProjectConnectionSelectorField",
+                    "ui:options": {
+                        "supportTypes": list_sql_operator_types(),
+                    },
+                },
+                "table": {
+                    "type": "string",
+                    "title": _l("Source Table"),
+                    "description": _l(
+                        "Table name including schema (if required). Either specify a table name or SQL query."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "sql": {
+                    "type": "string",
+                    "title": _l("Custom Query"),
+                    "description": _l(
+                        "Custom SELECT query with Jinja template support. Takes precedence over table name if both are specified."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "code",
+                        "lang": "sql",
+                        "sqlLang": "sql",
+                    },
+                },
+                "splitby": {
+                    "type": "string",
+                    "title": _l("Split Column"),
+                    "description": _l(
+                        "Column to partition data by for parallel processing. Must be indexed, sortable and non-null."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "splits": {
+                    "ui:hidden": "{{ !parentFormData.splitby }}",
+                    "type": "number",
+                    "ui:options": {"controls": False},
+                    "title": _l("Number of Splits"),
+                    "default": 1,
+                    "minimum": 1,
+                    "maximum": 2000,
+                },
+                "concurrency": {
+                    "ui:hidden": "{{ !parentFormData.splitby }}",
+                    "type": "number",
+                    "ui:options": {"controls": False},
+                    "title": _l("Parallel Threads"),
+                    "default": 1,
+                    "description": _l("Number of concurrent extraction threads (1-20)"),
+                    "minimum": 1,
+                    "maximum": 20,
+                },
+                "transform": copy.deepcopy(utils.TRANSFORM),
+                "incremental_by_time": {
+                    "type": "boolean",
+                    "title": _l("Enable Time-based Incremental Sync"),
+                    "default": False,
+                    "description": _l("Sync data incrementally based on a time column"),
+                    "ui:widget": "BaseCheckbox",
+                    "ui:options": {
+                        "label": _l("Enable Time-based Incremental Sync"),
+                    },
+                },
+                "time_column": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "string",
+                    "title": _l("Time Column Name"),
+                    "default": "snapshot_time",
+                    "description": _l(
+                        "Name of the time column used for incremental sync. Column should be indexed for better performance."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "time_column_tz": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "string",
+                    "title": _l("Time Column Timezone"),
+                    "default": "UTC",
+                    "enum": [
+                        "UTC",
+                        "Asia/Shanghai",
+                    ],
+                    "enumNames": [
+                        "UTC",
+                        "Asia/Shanghai",
+                    ],
+                },
+                "time_column_type": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "string",
+                    "title": _l("Timestamp Format"),
+                    "default": "datetime",
+                    "enum": ["datetime", "date"],
+                    "enumNames": ["datetime", "date"],
+                },
+                "time_auto_round": {
+                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
+                    "type": "boolean",
+                    "title": "Auto Round Time Range",
+                    "default": True,
+                    "description": _l(
+                        "Automatically round time ranges to appropriate intervals. For example:\n"
+                        "- Daily tasks running at 01:23 will sync previous day's data from 00:00 to 00:00\n"
+                        "- Weekly tasks will round to Monday 00:00\n"
+                        "- Monthly tasks will round to 1st day 00:00\n"
+                        "If disabled, exact execution times will be used (e.g. 01:23 to 01:23)"
+                    ),
+                },
+            },
+            "required": [
+                "data_source_name",
+            ],
+        }

recurvedata/operators/transfer_operator/dump_task_es.py ADDED Viewed

@@ -0,0 +1,113 @@
+import copy
+import json
+try:
+    from recurvedata.pigeon.dumper.es import ElasticSearchDumper
+except ImportError:
+    pass
+from recurvedata.core.translation import _l
+from recurvedata.operators.transfer_operator import utils
+from recurvedata.operators.transfer_operator.task import DumpTask
+from recurvedata.utils import extract_dict
+class ElasticSearchDumpTask(DumpTask):
+    ds_name_fields = ("data_source_name",)
+    worker_install_require = ["pigeon[elasticsearch]"]
+    def execute_impl(self, *args, **kwargs):
+        ds = self.must_get_connection_by_name(self.config["data_source_name"])
+        hf = self.create_handler_factory()
+        dump_options = extract_dict(self.rendered_config, keys=["index", "doc_type", "query", "fields", "meta_fields"])
+        if self.rendered_config.get("search_kwargs"):
+            search_kwargs = json.loads(self.rendered_config.get("search_kwargs"))
+            dump_options["search_kwargs"] = search_kwargs
+        dump_options.update({"connector": ds.connector, "handler_factories": [hf]})
+        dumper = ElasticSearchDumper(**dump_options)
+        return dumper.execute()
+    @classmethod
+    def config_schema(cls):
+        # get_choices_by_type = cls.get_connection_names_by_type
+        # dss = get_choices_by_type('elasticsearch')
+        return {
+            "type": "object",
+            "properties": {
+                "data_source_name": {
+                    "type": "string",
+                    "title": _l("Elasticsearch Data Source"),
+                    "ui:field": "ProjectConnectionSelectorField",
+                    "ui:options": {
+                        "supportTypes": [
+                            "elasticsearch",
+                        ],
+                    },
+                    # 'default': cls.first_or_default(dss, ''),
+                },
+                "index": {
+                    "type": "string",
+                    "title": _l("Elasticsearch Index"),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "doc_type": {
+                    "type": "string",
+                    "title": _l("Document Type"),
+                    "default": "_doc",
+                    "description": _l("The type of documents to query"),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "query": {
+                    "type": "string",
+                    "title": _l("Search Query"),
+                    "default": "*",
+                    "description": _l("Elasticsearch query string to filter documents. Supports Jinja templating."),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "fields": {
+                    "type": "string",
+                    "title": _l("Document Fields"),
+                    "description": _l(
+                        "Comma-separated list of document fields to retrieve. Leave empty to get all fields."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "meta_fields": {
+                    "type": "array",
+                    "uniqueItems": True,
+                    "items": {
+                        "type": "string",
+                        "enum": ["_index", "_type", "_id"],
+                        "enumNames": ["_index", "_type", "_id"],
+                    },
+                    "title": _l("Document Metadata Fields"),
+                    "description": _l("Additional metadata fields to include with each document."),
+                    "ui:widget": "SelectWidget",
+                },
+                "search_kwargs": {
+                    "type": "string",
+                    "title": _l("Advanced Search Options"),
+                    "description": _l(
+                        "Additional options for Elasticsearch scan operation in JSON format (e.g. size, scroll)."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "plain",
+                    },
+                },
+                "transform": copy.deepcopy(utils.TRANSFORM),
+            },
+            "required": ["data_source_name", "index"],
+        }

recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py ADDED Viewed

@@ -0,0 +1,114 @@
+import json
+import logging
+import urllib.parse
+try:
+    import pandas as pd
+    from recurvedata.pigeon.connector.feishu import FeishuBot
+except ImportError:
+    pass
+from recurvedata.core.translation import _l
+from recurvedata.operators.transfer_operator.dump_sheet_task_base import SheetDumpTaskBase
+logger = logging.getLogger(__name__)
+class FeishuSheetDumpTask(SheetDumpTaskBase):
+    _AUTO_REGISTER = True
+    ds_name_fields = ("feishu_bot",)
+    worker_install_require = ["pandas", "numpy", "pigeon[feishu]"]
+    custom_config_schema_properties = {
+        "feishu_bot": {
+            "type": "string",
+            "title": _l("Feishu Bot Connection"),
+            "description": _l(
+                "Select the Feishu bot connection that has permissions to access and read the target sheet"
+            ),
+            "ui:field": "ProjectConnectionSelectorField",
+            "ui:options": {
+                "supportTypes": [
+                    "feishu_bot",
+                ],
+            },
+            # 'default': cls.first_or_default(dss, ''),
+        },
+        "file_url": {
+            "type": "string",
+            "title": _l("Feishu Document URL"),
+            "description": _l(
+                "URL of the Feishu spreadsheet or document to read data from. For multi-sheet documents, the first sheet will be used by default unless a specific sheet ID is included in the URL"
+            ),
+            "ui:field": "CodeEditorWithReferencesField",
+            "ui:options": {
+                "type": "plain",
+            },
+        },
+        "cell_range": {
+            "type": "string",
+            "title": _l("Data Range"),
+            "description": _l(
+                "Optional range of cells to read in A1 notation (e.g. A1:D100). Leave empty to read all data from the sheet"
+            ),
+            "ui:field": "CodeEditorWithReferencesField",
+            "ui:options": {
+                "type": "plain",
+            },
+        },
+    }
+    custom_config_schema_required = ["feishu_bot", "file_url"]
+    def read_origin_df(self) -> "pd.DataFrame":
+        conf = self.rendered_config
+        if conf.extra_read_kwargs:
+            extra_read_kwargs = json.loads(conf.extra_read_kwargs)
+        else:
+            extra_read_kwargs = {}
+        ds = self.must_get_connection_by_name(conf.feishu_bot)
+        bot = FeishuBot(**ds.extra)
+        file_type, file_token, sheet = self.parse_feishu_sheets_url(conf.file_url)
+        logger.info(f"reading {conf.file_url}")
+        if file_type == "file":
+            df = bot.read_feishuexcel(file_token, **extra_read_kwargs)
+        else:  # sheets
+            if not sheet:
+                logger.info("sheet_id not found in url, use the first sheet as default")
+                sheet_ids, _ = bot.get_sheet_ids(file_token)
+                sheet = sheet_ids[0]
+            if conf.cell_range:
+                sheet = f"{sheet}!{conf.cell_range}"
+            df = bot.read_feishusheet(file_token, sheet, **extra_read_kwargs)
+        logger.info(f"original DataFrame shape {df.shape}, dtypes:\n{df.dtypes}")
+        logger.info(df.head())
+        return df
+    @staticmethod
+    def parse_feishu_sheets_url(url: str) -> tuple[str, str, str]:
+        rv = urllib.parse.urlparse(url)
+        if "/file/" in url:
+            file_type = "file"
+        elif "/sheets/" in url:
+            file_type = "sheets"
+        elif "/wiki/" in url:
+            file_type = "wiki"
+        else:
+            raise ValueError(f"unsupported url {url}")
+        file_token = rv.path.rsplit("/", 1)[-1]
+        sheet = urllib.parse.parse_qs(rv.query).get("sheet") or None
+        if sheet:
+            sheet = sheet[0]
+        bot = FeishuBot()
+        if file_type == "wiki":
+            obj_type, obj_token = bot.get_wiki_type_token(wiki_token=file_token)
+            if obj_type == "sheet":
+                file_type, file_token = obj_type, obj_token
+            else:
+                raise ValueError(f"unsupported url {url}")
+        return file_type, file_token, sheet