recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic. Click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import inspect
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
|
|
7
|
+
import jsonschema
|
|
8
|
+
|
|
9
|
+
from recurvedata.core.translation import _l
|
|
10
|
+
from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
|
|
11
|
+
from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
|
|
12
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
13
|
+
from recurvedata.operators.utils import file_factory as ff
|
|
14
|
+
from recurvedata.utils import unescape_backslash
|
|
15
|
+
from recurvedata.utils.files import merge_files
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class S3DumpTask(DumpTask, HiveTextfileConverterMixin):
    """Dump task that downloads one or more objects from S3, normalizes them
    to CSV, optionally applies a user-supplied transform, and leaves the
    merged result at ``self.filename``.

    The bucket comes from the task config (``bucket_name``) and falls back to
    the ``bucket`` entry of the connection's extra settings.
    """

    ds_name_fields = ("data_source_name",)

    def execute_impl(self, *args, **kwargs):
        """Download the configured keys and produce ``self.filename``.

        Returns:
            None. The output is the merged/converted file written at
            ``self.filename``.
        """
        # Work in a scratch directory next to the target file; recreate it so
        # leftovers from a previous (failed) run cannot leak into this one.
        tmp_dirname = f"{self.filename}_dir"
        if os.path.exists(tmp_dirname):
            shutil.rmtree(tmp_dirname)
        os.makedirs(tmp_dirname)

        try:
            conf = self.rendered_config.copy()

            ds = self.must_get_connection_by_name(conf["data_source_name"])
            # An explicit task-level bucket wins over the one configured on
            # the data source connection.
            ds_extra_bucket = ds.extra.get("bucket")
            config_bucket = conf.get("bucket_name")
            bucket = config_bucket if config_bucket else ds_extra_bucket

            conf_keys = conf["keys"]
            keys = self.get_keys(ds=ds, bucket=bucket, prefix=conf_keys)
            logger.info(f"[start] downloading keys:{keys} from s3")
            local_files = []
            for key in keys:
                # NOTE(review): keys sharing a basename would overwrite each
                # other inside tmp_dirname — presumably prefixes are flat;
                # confirm with connector.get_keys semantics.
                ds.connector.download(bucket, key, folder=tmp_dirname)
                local_files.append(os.path.join(tmp_dirname, os.path.basename(key)))
            logger.info(f"[finish] downloading files to {local_files}")

            filename = self.process_file(conf=conf, files=local_files)
            filename = self.transform_file(conf, filename)

            if filename != self.filename:
                logger.info("renaming %s to %s", filename, self.filename)
                os.rename(filename, self.filename)

            # TODO: the pigeon loader should support different file formats.
            self.convert_csv_to_hive_text_if_needed()
        finally:
            # Remove the scratch directory even when a download/convert step
            # fails (previously it was only cleaned up on success).
            shutil.rmtree(tmp_dirname, ignore_errors=True)
        return None

    @staticmethod
    def get_keys(ds, bucket, prefix):
        """List object keys under *prefix* in *bucket* via the connector."""
        keys = ds.connector.get_keys(bucket_name=bucket, prefix=prefix)
        return keys

    def process_file(self, conf, files):
        """Decompress and normalize each downloaded file to CSV in place,
        then merge them into ``self.filename``.

        Args:
            conf: rendered task configuration.
            files: local paths of the downloaded objects.

        Returns:
            The merged output path (``self.filename``).
        """
        filename = self.filename

        for f in files:
            if conf["decompress"] == "Gzip":
                logger.info("decompressing %s using gzip", f)
                ff.gzip_decompress(f, inplace=True)
            if conf["decompress"] == "Zip":
                # Fixed: this branch previously logged "using gzip".
                logger.info("decompressing %s using zip", f)
                ff.zip_decompress(f, inplace=True)

            skip_head_lines = conf.get("skip_head_lines", 0)
            if conf["file_format"] == "Excel":
                logger.info("converting Excel to CSV...")
                ff.convert_excel_to_csv(f, skiprows=skip_head_lines, inplace=True)
            if conf["file_format"] == "JSONLines":
                logger.info("converting JSON lines to CSV...")
                ff.convert_jsonlines_to_csv(f, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True)
            if conf["file_format"] == "CSV":
                logger.info("converting CSV dialect and encoding if necessary...")
                dialect_options = self._get_custom_csv_options(conf)
                ff.convert_csv_dialect(
                    f,
                    src_dialect_options=dialect_options,
                    skiprows=skip_head_lines,
                    src_encoding=conf["encoding"],
                    inplace=True,
                )
        if files:
            merge_files(files=files, filename=filename)
        return filename

    def transform_file(self, conf, filename):
        """Apply the user-defined ``transform`` hook to *filename*, if any.

        Returns the absolute path produced by the hook, or *filename*
        unchanged when no hook is configured.

        Raises:
            ValueError: if the hook does not return an absolute path string.
        """
        transform_func_code = conf.get("transform_func", "").strip()
        if not transform_func_code:
            return filename

        func = validate_transform(transform_func_code)
        if not func:
            return filename

        logger.info("calling transform function with %s", (filename,))
        result_file = func(filename)
        if result_file is None or not (isinstance(result_file, str) and os.path.isabs(result_file)):
            # Fixed: previously the %s placeholder was never interpolated
            # (the value was passed as a second ValueError argument).
            raise ValueError(f"transform must return an absolute filepath, got {result_file!r} instead")
        logger.info("got %s", result_file)
        return result_file

    @staticmethod
    def _get_custom_csv_options(conf):
        """Build csv dialect options (delimiter, line terminator, quoting)
        from the task configuration.

        Raises:
            KeyError: if ``csv_quoting`` is not one of the supported names.
        """
        rv = {
            "delimiter": unescape_backslash(conf["csv_delimiter"]),
            "lineterminator": unescape_backslash(conf["csv_lineterminator"]),
        }
        quoting = conf["csv_quoting"]
        # Map the symbolic name from the config onto the csv module constant.
        rv["quoting"] = {
            "QUOTE_ALL": csv.QUOTE_ALL,
            "QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
            "QUOTE_NONE": csv.QUOTE_NONE,
            "QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
        }[quoting]
        return rv

    @classmethod
    def config_schema(cls):
        """JSON schema describing the task configuration form."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("S3 Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "s3",
                        ],
                    },
                },
                "bucket_name": {
                    "type": "string",
                    "title": _l("S3 Bucket Name"),
                    "description": _l(
                        "Name of the S3 bucket to download from. Required if not configured in data source."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "keys": {
                    "type": "string",
                    "title": _l("S3 Object Keys"),
                    "description": _l("Object key or prefix pattern to download. Supports Jinja templating syntax."),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "decompress": {
                    "type": "string",
                    "title": _l("Decompression Method"),
                    "description": _l("Decompress downloaded file using specified method"),
                    "enum": ["None", "Gzip", "Zip"],
                    "enumNames": ["None", "Gzip", "Zip"],
                    "default": "None",
                },
                "file_format": {
                    "type": "string",
                    "title": _l("Input Format"),
                    "description": _l("Format of the source file to be converted to CSV"),
                    "enum": ["CSV", "Excel", "JSONLines"],
                    "enumNames": ["CSV", "Excel", "JSONLines"],
                    "default": "CSV",
                },
                "skip_head_lines": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Skip Header Rows"),
                    "description": _l("Number of rows to skip from the beginning of the file"),
                    "default": 0,
                    "minimum": 0,
                },
                "encoding": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("File Encoding"),
                    "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                    "default": "utf-8",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_delimiter": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Delimiter"),
                    "description": _l("Character used to separate fields in the CSV file"),
                    "default": ",",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_lineterminator": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Line Ending"),
                    "description": _l("Character sequence used to terminate lines"),
                    "enum": [r"\n", r"\r\n"],
                    "enumNames": [r"\n", r"\r\n"],
                    "default": r"\r\n",
                },
                "csv_quoting": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Quoting"),
                    "description": _l("Strategy for quoting fields in the CSV file"),
                    "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "default": "QUOTE_MINIMAL",
                },
                "transform_func": {
                    "type": "string",
                    "title": _l("Custom Transformation"),
                    "description": _l(
                        "Python function to transform the downloaded file. Must accept a filepath argument and return "
                        "the path to the transformed file. Runs after built-in transformations."
                    ),
                    "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["data_source_name", "keys"],
        }

    @classmethod
    def validate(cls, configuration):
        """Validate the config, including the optional transform hook code."""
        conf = super().validate(configuration)

        transform_func_code = conf.get("transform_func", "").strip()
        if transform_func_code:
            validate_transform(transform_func_code)
        return conf
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def validate_transform(raw_code):
    """Compile and execute *raw_code*, returning its ``transform`` function.

    The code is expected to define a callable named ``transform`` that accepts
    exactly one parameter named ``filename``.

    Args:
        raw_code: Python source text supplied via the task configuration.

    Returns:
        The ``transform`` callable, or ``None`` when the code defines no
        (truthy) ``transform`` name.

    Raises:
        jsonschema.ValidationError: if ``transform`` is not callable or has
            the wrong signature.

    NOTE(review): this intentionally exec()s task-author-supplied code; it is
    only safe because the configuration comes from trusted platform users.
    """
    # Use a descriptive pseudo-filename (was "") so tracebacks raised from
    # inside the user code point at something meaningful.
    code = compile(raw_code, "<transform_func>", "exec")
    ns = {}
    exec(code, ns)
    func = ns.get("transform")
    if not func:
        return None

    if not callable(func):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    sig = inspect.signature(func)
    if tuple(sig.parameters.keys()) != ("filename",):
        raise jsonschema.ValidationError(
            message="transform must accept and only accept filename as parameter", path=("transform_func",)
        )
    return func
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import inspect
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
import jsonschema
|
|
7
|
+
|
|
8
|
+
from recurvedata.core.translation import _l
|
|
9
|
+
from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
|
|
10
|
+
from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
|
|
11
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
12
|
+
from recurvedata.operators.utils import file_factory as ff
|
|
13
|
+
from recurvedata.utils import unescape_backslash
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SFTPDumpTask(DumpTask, HiveTextfileConverterMixin):
|
|
19
|
+
ds_name_fields = ("data_source_name",)
|
|
20
|
+
|
|
21
|
+
    def execute_impl(self, *args, **kwargs):
        """Download the configured remote file, normalize it to CSV and leave
        the result at ``self.filename``.

        Returns:
            None. The output is the file written at ``self.filename``.
        """
        conf = self.rendered_config

        # Fetch the remote file straight into the task's target path; the
        # conversion steps below then operate on that local copy.
        ds = self.must_get_connection_by_name(conf["data_source_name"])
        ds.connector.download_file(conf["filepath"], self.filename)

        filename = self.process_file(conf)
        filename = self.transform_file(conf, filename)

        # A custom transform may have written its output elsewhere; move it
        # back to the canonical location expected by downstream tasks.
        if filename != self.filename:
            logger.info("renaming %s to %s", filename, self.filename)
            os.rename(filename, self.filename)

        # TODO: the pigeon loader should support different file formats.
        self.convert_csv_to_hive_text_if_needed()
        return None
|
|
37
|
+
|
|
38
|
+
    def process_file(self, conf):
        """Decompress and convert the downloaded file to CSV, in place.

        Steps run in order: optional gzip decompression, then exactly one
        format conversion selected by ``conf["file_format"]``.

        Args:
            conf: rendered task configuration.

        Returns:
            Path of the processed file (each ``ff`` helper returns the
            resulting path; with ``inplace=True`` the file is rewritten
            in place).
        """
        filename = self.filename
        if conf["decompress"] == "Gzip":
            logger.info("decompressing %s using gzip", self.filename)
            filename = ff.gzip_decompress(self.filename, inplace=True)

        skip_head_lines = conf.get("skip_head_lines", 0)
        if conf["file_format"] == "Excel":
            logger.info("converting Excel to CSV...")
            filename = ff.convert_excel_to_csv(filename, skiprows=skip_head_lines, inplace=True)
        if conf["file_format"] == "JSONLines":
            logger.info("converting JSON lines to CSV...")
            filename = ff.convert_jsonlines_to_csv(
                filename, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True
            )
        if conf["file_format"] == "CSV":
            # Re-encode and rewrite the CSV with the configured dialect.
            logger.info("converting CSV dialect and encoding if necessary...")
            dialect_options = self._get_custom_csv_options(conf)
            filename = ff.convert_csv_dialect(
                filename,
                src_dialect_options=dialect_options,
                skiprows=skip_head_lines,
                src_encoding=conf["encoding"],
                inplace=True,
            )
        return filename
|
|
64
|
+
|
|
65
|
+
def transform_file(self, conf, filename):
|
|
66
|
+
transform_func_code = conf.get("transform_func", "").strip()
|
|
67
|
+
if not transform_func_code:
|
|
68
|
+
return filename
|
|
69
|
+
|
|
70
|
+
func = validate_transform(transform_func_code)
|
|
71
|
+
if not func:
|
|
72
|
+
return filename
|
|
73
|
+
|
|
74
|
+
logger.info("calling transform function with %s", (filename,))
|
|
75
|
+
result_file = func(filename)
|
|
76
|
+
if result_file is None or not (isinstance(result_file, str) and os.path.isabs(result_file)):
|
|
77
|
+
raise ValueError("transform must return an absolute filepath, got %s instead", result_file)
|
|
78
|
+
logger.info("got %s", result_file)
|
|
79
|
+
return result_file
|
|
80
|
+
|
|
81
|
+
def _get_custom_csv_options(self, conf):
|
|
82
|
+
rv = {
|
|
83
|
+
"delimiter": unescape_backslash(conf["csv_delimiter"]),
|
|
84
|
+
"lineterminator": unescape_backslash(conf["csv_lineterminator"]),
|
|
85
|
+
}
|
|
86
|
+
quoting = conf["csv_quoting"]
|
|
87
|
+
rv["quoting"] = {
|
|
88
|
+
"QUOTE_ALL": csv.QUOTE_ALL,
|
|
89
|
+
"QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
|
|
90
|
+
"QUOTE_NONE": csv.QUOTE_NONE,
|
|
91
|
+
"QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
|
|
92
|
+
}[quoting]
|
|
93
|
+
return rv
|
|
94
|
+
|
|
95
|
+
@classmethod
def config_schema(cls):
    """Return the JSON-schema definition of this task's configuration form.

    The "ui:*" keys are directives for the frontend form renderer
    (field widget selection, conditional visibility, editor options).
    """
    return {
        "type": "object",
        "properties": {
            # Which project SFTP connection to download from.
            "data_source_name": {
                "type": "string",
                "title": _l("SFTP Connection"),
                "ui:field": "ProjectConnectionSelectorField",
                "ui:options": {
                    "supportTypes": [
                        "sftp",
                    ],
                },
            },
            # Absolute remote path of the file to fetch; Jinja-templated.
            "filepath": {
                "type": "string",
                "title": _l("Source File Path"),
                "description": _l(
                    "Absolute path to the file on FTP server (e.g. /path/to/file.csv). Supports Jinja templating."
                ),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {
                    "type": "plain",
                },
            },
            # Optional decompression applied to the downloaded file.
            "decompress": {
                "type": "string",
                "title": _l("Decompression Method"),
                "description": _l("Decompress downloaded file using specified method"),
                "enum": ["None", "Gzip"],
                "enumNames": ["None", "Gzip"],
                "default": "None",
            },
            # Source format; non-CSV formats get converted to CSV.
            "file_format": {
                "type": "string",
                "title": _l("Input Format"),
                "description": _l("Format of the source file to be converted to CSV"),
                "enum": ["CSV", "Excel", "JSONLines"],
                "enumNames": ["CSV", "Excel", "JSONLines"],
                "default": "CSV",
            },
            "skip_head_lines": {
                "type": "number",
                "ui:options": {"controls": False},
                "title": _l("Skip Header Rows"),
                "description": _l("Number of rows to skip from the beginning of the file"),
                "default": 0,
                "minimum": 0,
            },
            # The options below are CSV-specific: each is hidden in the UI
            # unless file_format == "CSV".
            "encoding": {
                "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                "type": "string",
                "title": _l("File Encoding"),
                "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                "default": "utf-8",
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {
                    "type": "plain",
                },
            },
            "csv_delimiter": {
                "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                "type": "string",
                "title": _l("Field Delimiter"),
                "description": _l("Character used to separate fields in the CSV file"),
                "default": ",",
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {
                    "type": "plain",
                },
            },
            # Values are literal backslash sequences (raw strings); they are
            # unescaped later by _get_custom_csv_options().
            "csv_lineterminator": {
                "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                "type": "string",
                "title": _l("Line Ending"),
                "description": _l("Character sequence used to terminate lines"),
                "enum": [r"\n", r"\r\n"],
                "enumNames": [r"\n", r"\r\n"],
                "default": r"\r\n",
            },
            # Names of the csv module's quoting constants; mapped to the
            # actual constants by _get_custom_csv_options().
            "csv_quoting": {
                "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                "type": "string",
                "title": _l("Field Quoting"),
                "description": _l("Strategy for quoting fields in the CSV file"),
                "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                "default": "QUOTE_MINIMAL",
            },
            # Optional Python snippet defining transform(filename); it is
            # checked by validate_transform() during config validation.
            "transform_func": {
                "type": "string",
                "title": _l("Custom Transformation"),
                "description": _l(
                    "Python function to transform the downloaded file. Must accept a filepath argument and return "
                    "the path to the transformed file. Runs after built-in transformations."
                ),
                "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {
                    "type": "code",
                    "lang": "python",
                },
            },
        },
        "required": ["data_source_name", "filepath"],
    }
|
|
202
|
+
|
|
203
|
+
@classmethod
def validate(cls, configuration):
    """Run base-class validation, then check the optional transform snippet.

    validate_transform() raises a ValidationError when the snippet defines
    a malformed ``transform`` function; an empty/blank snippet is allowed.
    """
    conf = super().validate(configuration)

    snippet = conf.get("transform_func", "").strip()
    if snippet:
        validate_transform(snippet)
    return conf
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def validate_transform(raw_code):
    """Compile and validate a user-supplied transform snippet.

    The snippet must define a function named ``transform`` taking exactly
    one parameter named ``filename``.

    Returns:
        The ``transform`` callable, or ``None`` when the snippet defines
        no ``transform`` name at all.

    Raises:
        jsonschema.ValidationError: when ``transform`` is defined but is
            not callable or has the wrong signature.
    """
    # NOTE(security): exec() runs arbitrary code from the task config; this
    # is intentional for the feature, but snippets must only come from
    # trusted task authors, never untrusted end users.
    code = compile(raw_code, "<transform_func>", "exec")  # named for readable tracebacks
    ns = {}
    exec(code, ns)
    func = ns.get("transform")
    # BUG FIX: previously `if not func` silently ignored a falsy
    # non-callable (e.g. `transform = 0`), bypassing the callable check
    # below. Only "transform not defined" should return None.
    if func is None:
        return None

    if not callable(func):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    sig = inspect.signature(func)
    if tuple(sig.parameters.keys()) != ("filename",):
        raise jsonschema.ValidationError(
            message="transform must accept and only accept filename as parameter", path=("transform_func",)
        )
    return func
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
from recurvedata.pigeon.connector.aliyun_oss import OSSBucketConnector
|
|
6
|
+
from recurvedata.pigeon.utils import fs
|
|
7
|
+
except ImportError:
|
|
8
|
+
pass
|
|
9
|
+
|
|
10
|
+
from recurvedata.core.translation import _l
|
|
11
|
+
from recurvedata.operators.transfer_operator.task import LoadTask
|
|
12
|
+
from recurvedata.utils import extract_dict
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AliyunOSSLoadTask(LoadTask):
    """Load task that uploads the produced file to an Aliyun OSS bucket,
    optionally compressing it first."""

    ds_name_fields = ("data_source_name",)
    ds_types = ("oss",)
    should_write_header = True
    worker_install_require = ["pigeon[aliyun]"]

    def execute_impl(self, *args, **kwargs):
        """Upload ``self.filename`` to OSS, then remove local files."""
        if fs.is_file_empty(self.filename):
            logger.warning("file %s not exists or has no content, skip.", self.filename)
            return

        ds = self.must_get_connection_by_name(self.config["data_source_name"])

        opt_keys = ["access_key_id", "access_key_secret", "endpoint", "bucket_name", "proxies"]
        oss = OSSBucketConnector(**extract_dict(ds.extra, opt_keys))

        load_options = self.rendered_config.copy()
        # ROBUSTNESS FIX: "key" is not in the schema's required list, so it
        # may be absent; treat a missing key like an empty one instead of
        # raising KeyError.
        key = load_options.get("key") or ""

        # Compress the file before uploading, if configured.
        compress_mode = load_options["compress_mode"]
        if compress_mode != "None" and not key.endswith("/"):
            # Name the compressed file after the target object key so the
            # uploaded object matches what the user configured.
            # NOTE(review): with an empty key this yields target_filename ==
            # the directory path itself — confirm compress_file handles it.
            target_filename = os.path.join(os.path.dirname(self.filename), os.path.basename(key))
        else:
            target_filename = None
        file_upload, ext = self.compress_file(
            filename=self.filename, target_filename=target_filename, compress_mode=compress_mode
        )
        if compress_mode != "None" and not key.endswith(("/", ext)):
            # Append the compression extension so the object key reflects
            # the actual content.
            key = f"{key}{ext}"

        # Derive the key/folder arguments expected by oss.upload() from the
        # configured key.
        upload_conf = {
            "filename": file_upload,
            # ROBUSTNESS FIX: fall back to the schema default (True) when
            # "overwrite" is absent from the rendered config.
            "overwrite": load_options.get("overwrite", True),
        }
        if key.endswith("/"):
            upload_conf["folder"] = key  # trailing slash: upload into a folder
        elif key:
            upload_conf["key"] = key  # explicit object key
        else:
            upload_conf["key"] = os.path.basename(file_upload)  # default to the file's own name

        logger.info("uploading...")
        oss.upload(**upload_conf)
        return fs.remove_files_safely([self.filename, file_upload])

    @classmethod
    def config_schema(cls):
        """Return the JSON-schema definition of this task's configuration form."""
        schema = {
            "type": "object",
            "properties": {
                # Which project OSS connection to upload through.
                "data_source_name": {
                    "type": "string",
                    "title": _l("OSS Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": cls.ds_types,
                    },
                },
                # Object key or folder path (trailing slash) in the bucket;
                # optional — execute_impl falls back to the local file name.
                "key": {
                    "type": "string",
                    "title": _l("Upload Path"),
                    "description": _l(
                        "Target path in the bucket. Can be an object key or folder path (ending with /). "
                        "Supports Jinja templating."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "compress_mode": {
                    "type": "string",
                    "title": _l("Compression Method"),
                    "description": _l("Compress file before uploading using specified method"),
                    "enum": ["None", "Gzip", "Zip"],
                    "enumNames": ["None", "Gzip", "Zip"],
                    "default": "None",
                },
                "overwrite": {
                    "type": "boolean",
                    "title": _l("Overwrite Existing"),
                    "description": _l("Whether to overwrite if target object already exists"),
                    "default": True,
                },
            },
            "required": ["compress_mode", "data_source_name"],
        }
        return schema
|