recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic. Click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import inspect
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
import jsonschema
|
|
7
|
+
|
|
8
|
+
from recurvedata.core.translation import _l
|
|
9
|
+
from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
|
|
10
|
+
from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
|
|
11
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
12
|
+
from recurvedata.operators.utils import file_factory as ff
|
|
13
|
+
from recurvedata.utils import unescape_backslash
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FTPDumpTask(DumpTask, HiveTextfileConverterMixin):
    """Dump task that fetches a file through an FTP connection and normalizes it.

    The flow implemented by :meth:`execute_impl` is:

    1. run ``FtpDumper`` with ``src=conf["filepath"]`` and ``dst=self.filename``;
    2. apply the built-in conversions selected by the configuration
       (:meth:`process_file`);
    3. apply the optional user-defined ``transform`` function
       (:meth:`transform_file`);
    4. make sure the final result lives at ``self.filename``.
    """

    ds_name_fields = ("data_source_name",)
    worker_install_require = ["pigeon"]

    def execute_impl(self, *args, **kwargs):
        """Run the dump and return whatever ``FtpDumper.execute()`` returns."""
        # Imported lazily so the module can be loaded without the pigeon dumper.
        from recurvedata.pigeon.dumper.ftp import FtpDumper

        conf = self.rendered_config

        ds = self.must_get_connection_by_name(conf["data_source_name"])
        dumper = FtpDumper(ds.connector, src=conf["filepath"], dst=self.filename)
        meta = dumper.execute()

        filename = self.process_file(conf)
        filename = self.transform_file(conf, filename)

        # The conversion / transform steps may have produced a different path;
        # the rest of the task expects the result at self.filename.
        if filename != self.filename:
            logger.info("renaming %s to %s", filename, self.filename)
            os.rename(filename, self.filename)

        # TODO: the pigeon loader should support different file formats
        self.convert_csv_to_hive_text_if_needed()
        return meta

    def process_file(self, conf):
        """Apply the built-in decompression and format conversions.

        Args:
            conf: rendered task configuration; ``decompress``, ``file_format``
                and (for CSV / JSONLines) ``encoding`` are read by key, so they
                must be present.

        Returns:
            Path of the processed file as returned by the last helper that ran
            (or ``self.filename`` when nothing ran).
        """
        filename = self.filename
        if conf["decompress"] == "Gzip":
            logger.info("decompressing %s using gzip", self.filename)
            filename = ff.gzip_decompress(self.filename, inplace=True)

        skip_head_lines = conf.get("skip_head_lines", 0)
        file_format = conf["file_format"]
        if file_format == "Excel":
            logger.info("converting Excel to CSV...")
            filename = ff.convert_excel_to_csv(filename, skiprows=skip_head_lines, inplace=True)
        elif file_format == "JSONLines":
            logger.info("converting JSON lines to CSV...")
            filename = ff.convert_jsonlines_to_csv(
                filename, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True
            )
        elif file_format == "CSV":
            logger.info("converting CSV dialect and encoding if necessary...")
            dialect_options = self._get_custom_csv_options(conf)
            filename = ff.convert_csv_dialect(
                filename,
                src_dialect_options=dialect_options,
                skiprows=skip_head_lines,
                src_encoding=conf["encoding"],
                inplace=True,
            )
        return filename

    def transform_file(self, conf, filename):
        """Run the user-defined ``transform`` function on ``filename``, if any.

        Args:
            conf: rendered task configuration; ``transform_func`` holds the
                Python source of the transformation.
            filename: path of the file produced by :meth:`process_file`.

        Returns:
            ``filename`` unchanged when no transform is configured or the code
            defines no ``transform``; otherwise the path returned by it.

        Raises:
            ValueError: if ``transform`` returns anything other than an
                absolute path string.
        """
        # `or ""` guards against the key being present with a None value.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if not transform_func_code:
            return filename

        func = validate_transform(transform_func_code)
        if not func:
            return filename

        logger.info("calling transform function with %s", (filename,))
        result_file = func(filename)
        if not (isinstance(result_file, str) and os.path.isabs(result_file)):
            # Exceptions do not %-interpolate their arguments, so format explicitly.
            raise ValueError("transform must return an absolute filepath, got %s instead" % (result_file,))
        logger.info("got %s", result_file)
        return result_file

    def _get_custom_csv_options(self, conf):
        """Build the source CSV dialect options from the configuration.

        ``csv_delimiter`` and ``csv_lineterminator`` are passed through
        ``unescape_backslash``; ``csv_quoting`` is mapped from its name to the
        matching ``csv.QUOTE_*`` constant (unknown names raise ``KeyError``).
        """
        rv = {
            "delimiter": unescape_backslash(conf["csv_delimiter"]),
            "lineterminator": unescape_backslash(conf["csv_lineterminator"]),
        }
        quoting = conf["csv_quoting"]
        rv["quoting"] = {
            "QUOTE_ALL": csv.QUOTE_ALL,
            "QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
            "QUOTE_NONE": csv.QUOTE_NONE,
            "QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
        }[quoting]
        return rv

    @classmethod
    def config_schema(cls):
        """Return the JSON schema (with ``ui:*`` hints) describing this task's configuration."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("FTP Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "ftp",
                        ],
                    },
                },
                "filepath": {
                    "type": "string",
                    "title": _l("Source File Path"),
                    "description": _l(
                        "Absolute path to the file on FTP server (e.g. /path/to/file.csv). Supports Jinja templating."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "decompress": {
                    "type": "string",
                    "title": _l("Decompression Method"),
                    "description": _l("Decompress downloaded file using specified method"),
                    "enum": ["None", "Gzip"],
                    "enumNames": ["None", "Gzip"],
                    "default": "None",
                },
                "file_format": {
                    "type": "string",
                    "title": _l("Input Format"),
                    "description": _l("Format of the source file to be converted to CSV"),
                    "enum": ["CSV", "Excel", "JSONLines"],
                    "enumNames": ["CSV", "Excel", "JSONLines"],
                    "default": "CSV",
                },
                "skip_head_lines": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Skip Header Rows"),
                    "description": _l("Number of rows to skip from the beginning of the file"),
                    "default": 0,
                    "minimum": 0,
                },
                "encoding": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("File Encoding"),
                    "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                    "default": "utf-8",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_delimiter": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Delimiter"),
                    "description": _l("Character used to separate fields in the CSV file"),
                    "default": ",",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_lineterminator": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Line Ending"),
                    "description": _l("Character sequence used to terminate lines"),
                    "enum": [r"\n", r"\r\n"],
                    "enumNames": [r"\n", r"\r\n"],
                    "default": r"\r\n",
                },
                "csv_quoting": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Quoting"),
                    "description": _l("Strategy for quoting fields in the CSV file"),
                    "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "default": "QUOTE_MINIMAL",
                },
                "transform_func": {
                    "type": "string",
                    "title": _l("Custom Transformation"),
                    "description": _l(
                        "Python function to transform the downloaded file. Must accept a filepath argument and return "
                        "the path to the transformed file. Runs after built-in transformations."
                    ),
                    "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["data_source_name", "filepath"],
        }

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` and, when present, the custom transform code.

        Delegates to the parent ``validate`` first, then runs
        ``validate_transform`` on a non-empty ``transform_func`` so that a bad
        signature is reported at validation time.

        Returns:
            The configuration object returned by the parent ``validate``.
        """
        conf = super().validate(configuration)

        # `or ""` guards against the key being present with a None value.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if transform_func_code:
            validate_transform(transform_func_code)
        return conf
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def validate_transform(raw_code):
    """Execute ``raw_code`` and return the ``transform`` callable it defines.

    Args:
        raw_code: Python source text expected to define ``transform(filename)``.

    Returns:
        The ``transform`` object from the executed namespace, or ``None`` when
        the code defines no (truthy) ``transform``.

    Raises:
        jsonschema.ValidationError: if ``transform`` is not callable, or its
            parameter list is anything other than exactly ``(filename)``.
    """
    # NOTE(review): exec() runs arbitrary code taken from the task configuration;
    # confirm that only trusted users can edit `transform_func`.
    compiled = compile(raw_code, "", "exec")
    namespace = {}
    exec(compiled, namespace)

    transform = namespace.get("transform")
    if not transform:
        return None

    if not callable(transform):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    # The function must take exactly one parameter, and it must be named `filename`.
    param_names = list(inspect.signature(transform).parameters)
    if param_names != ["filename"]:
        raise jsonschema.ValidationError(
            message="transform must accept and only accept filename as parameter", path=("transform_func",)
        )
    return transform
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
import pandas as pd
|
|
5
|
+
except ImportError:
|
|
6
|
+
pass
|
|
7
|
+
|
|
8
|
+
from recurvedata.core.translation import _l
|
|
9
|
+
from recurvedata.operators.transfer_operator.dump_sheet_task_base import SheetDumpTaskBase
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class GoogleSheetDumpTask(SheetDumpTaskBase):
    """Sheet dump task that reads its source DataFrame from a Google spreadsheet.

    The spreadsheet is accessed through the connector of the connection named
    by the ``google_service_account`` configuration field.
    """

    _AUTO_REGISTER = True
    ds_name_fields = ("google_service_account",)
    worker_install_require = [
        "gspread",
    ]

    # Task-specific form fields (JSON schema with ``ui:*`` hints).
    custom_config_schema_properties = {
        "google_service_account": {
            "type": "string",
            "title": _l("Service Account"),
            "description": _l("Google service account with permissions to access the spreadsheet"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {
                "supportTypes": [
                    "google_service_account",
                ],
            },
        },
        "file_url": {
            "type": "string",
            "title": _l("Spreadsheet URL"),
            "description": _l("URL of the Google spreadsheet (defaults to first sheet if no sheet ID specified)"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "cell_range": {
            "type": "string",
            "title": _l("Data Range"),
            "description": _l("Cell range in A1 notation (e.g. A1:B10). Reads entire sheet if empty"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
    }
    custom_config_schema_required = ["google_service_account", "file_url"]

    def read_origin_df(self) -> "pd.DataFrame":
        """Read the configured sheet (optionally a cell range) into a DataFrame.

        Returns:
            The DataFrame produced by the connector's ``read_sheet_to_df``.
        """
        cfg = self.rendered_config

        connection = self.must_get_connection_by_name(cfg.google_service_account)
        account = connection.recurve_connector
        file_url = cfg.file_url
        # Only the sheet id from the parsed URL is used; the spreadsheet id is discarded.
        _, gid = account.parse_sheet_url(file_url)
        logger.info(f"reading {file_url}, gid {gid}")

        worksheet = account.get_sheet(file_url, gid)
        frame = account.read_sheet_to_df(worksheet, cell_range=cfg.cell_range)
        logger.info(f"original DataFrame shape {frame.shape}, dtypes:\n{frame.dtypes}")
        logger.info(frame.head())
        return frame
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from bson import json_util
|
|
7
|
+
|
|
8
|
+
from recurvedata.pigeon.dumper.mongodb import MongoDBDumper
|
|
9
|
+
except ImportError:
|
|
10
|
+
pass
|
|
11
|
+
|
|
12
|
+
from recurvedata.core.translation import _l
|
|
13
|
+
from recurvedata.operators.transfer_operator import utils
|
|
14
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
15
|
+
from recurvedata.utils import date_time, extract_dict
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MongoDBDumpTask(DumpTask):
    """Dump task that exports documents from a MongoDB collection via ``MongoDBDumper``.

    ``bson.json_util`` and ``MongoDBDumper`` are imported at module level inside
    a ``try/except ImportError`` block, so :meth:`execute_impl` raises
    ``NameError`` when those optional packages are not installed.
    """

    ds_name_fields = ("data_source_name",)
    worker_install_require = ["pigeon[mongo]"]

    @property
    def time_column_tz(self):
        """Timezone name of the time column; ``"UTC"`` when not configured."""
        return self.config.get("time_column_tz", "UTC")

    def determine_time_range(self):
        """Return the schedule time range as naive datetimes in the column's timezone.

        The range from ``get_schedule_time_range()`` is converted to
        ``time_column_tz`` and then stripped of ``tzinfo``.
        """
        start_date, end_date = self.get_schedule_time_range()
        # convert timezone
        start_date = date_time.astimezone(start_date, tz=self.time_column_tz)
        end_date = date_time.astimezone(end_date, tz=self.time_column_tz)

        return start_date.replace(tzinfo=None), end_date.replace(tzinfo=None)

    def execute_impl(self, *args, **kwargs):
        """Build the dumper options from the configuration and run ``MongoDBDumper``.

        Returns:
            Whatever ``MongoDBDumper.execute()`` returns.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        hf = self.create_handler_factory()
        dump_options = extract_dict(self.rendered_config, keys=["collection", "filter", "projection"])
        dump_options.update({"connector": ds.connector, "handler_factories": [hf], "database": ds.database})

        # A projection of null or '' is treated as None, i.e. all fields are included.
        proj = dump_options.get("projection")
        dump_options["projection"] = json.loads(proj) if proj else None

        # `filter` is optional in the schema, so read it defensively like `projection`;
        # a missing or empty value means "match everything".
        raw_filter = dump_options.get("filter")
        flt = json_util.loads(raw_filter) if raw_filter else {}

        if not self.dag.is_once and self.config.incremental_by_time:
            # Restrict the query to the half-open schedule window [start, end).
            # NOTE(review): time_column is read from the unrendered config although the
            # field is a template-capable editor — confirm this is intended.
            start, end = self.determine_time_range()
            time_flt = {self.config.time_column: {"$gte": start, "$lt": end}}
            flt.update(time_flt)

        dump_options["filter"] = flt

        logger.info("Dump options: %s", dump_options)
        dumper = MongoDBDumper(**dump_options)
        return dumper.execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON schema (with ``ui:*`` hints) describing this task's configuration."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("MongoDB Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "mongodb",
                        ],
                    },
                },
                "collection": {
                    "type": "string",
                    "title": _l("MongoDB Collection"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "filter": {
                    "type": "string",
                    "title": _l("Query Filter"),
                    "default": "{}",
                    "description": _l(
                        "MongoDB query filter in JSON format. Will be deserialized using bson.json_util and passed to find() method. "
                        "Supports MongoDB query operators like $gt, $lt, $in etc. See MongoDB documentation for details."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                "projection": {
                    "type": "string",
                    "title": _l("Field Selection"),
                    "description": _l(
                        "Specify which fields to return in JSON format. Empty value returns all fields. Passed directly to MongoDB find() function."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                # Deep-copied so this schema never shares (or mutates) the module-level template.
                "transform": copy.deepcopy(utils.TRANSFORM),
                "incremental_by_time": {
                    "type": "boolean",
                    "title": _l("Enable Time-based Incremental Sync"),
                    "default": False,
                    "description": _l("Sync data incrementally based on a time column"),
                    "ui:widget": "BaseCheckbox",
                    "ui:options": {
                        "label": _l("Enable Time-based Incremental Sync"),
                    },
                },
                "time_column": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Name"),
                    "default": "snapshot_time",
                    "description": _l(
                        "Name of the time column used for incremental sync. Column should be indexed for better performance."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "time_column_tz": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Timezone"),
                    "default": "UTC",
                    "enum": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                    "enumNames": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                },
                "time_auto_round": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "boolean",
                    # Wrapped in _l() like every other title so it can be translated.
                    "title": _l("Auto Round Time Range"),
                    "default": True,
                    "description": _l(
                        "Automatically round time ranges to appropriate intervals. For example:\n"
                        "- Daily tasks running at 01:23 will sync previous day's data from 00:00 to 00:00\n"
                        "- Weekly tasks will round to Monday 00:00\n"
                        "- Monthly tasks will round to 1st day 00:00\n"
                        "If disabled, exact execution times will be used (e.g. 01:23 to 01:23)"
                    ),
                },
            },
            "required": ["data_source_name", "collection"],
        }
|