recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0

recurvedata/operators/transfer_operator/dump_task_oss.py
@@ -0,0 +1,285 @@

import csv
import inspect
import logging
import os
import shutil
from copy import deepcopy

import jsonschema

from recurvedata.core.translation import _l
from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
from recurvedata.operators.transfer_operator.task import DumpTask
from recurvedata.operators.utils import file_factory as ff
from recurvedata.utils import unescape_backslash
from recurvedata.utils.files import merge_files

logger = logging.getLogger(__name__)


class AliyunOSSDumpTask(DumpTask):
    ds_name_fields = ("data_source_name",)

    def execute_impl(self, *args, **kwargs):
        tmp_dirname = f"{self.filename}_dir"
        if os.path.exists(tmp_dirname):
            shutil.rmtree(tmp_dirname)
        os.makedirs(tmp_dirname)

        conf = self.rendered_config.copy()

        ds = self.must_get_connection_by_name(conf["data_source_name"])

        object_path = conf["object_path"]
        delimiter = "/" if object_path.endswith("/") else ""
        keys = self.get_keys(ds, prefix=object_path, delimiter=delimiter)
        logger.info(f"[start] downloading keys:{keys} from oss")
        local_files = []
        for key in keys:
            ds.connector.download(key, folder=tmp_dirname)
            local_files.append(os.path.join(tmp_dirname, os.path.basename(key)))
        logger.info(f"[finish] downloading files to {local_files}")

        filename = self.process_file(conf=conf, files=local_files)
        filename = self.transform_file(conf, filename)

        if filename != self.filename:
            logger.info("renaming %s to %s", filename, self.filename)
            os.rename(filename, self.filename)

        shutil.rmtree(tmp_dirname)
        return None

    @staticmethod
    def get_keys(ds, prefix, delimiter=""):
        keys = ds.connector.get_keys(prefix=prefix, delimiter=delimiter)
        return keys

    @staticmethod
    def _infer_compression(filename: str, default_compression: str) -> str:
        """infer compression method from filename"""
        ext = os.path.splitext(filename)[1].lower()
        compression_map = {".gz": "Gzip", ".zip": "Zip"}
        return compression_map.get(ext, default_compression)

    @staticmethod
    def _infer_file_format(filename: str, default_format: str) -> str:
        """infer file format from filename"""
        ext = os.path.splitext(filename)[1].lower()
        format_map = {
            ".xlsx": "Excel",
            ".xls": "Excel",
        }
        return format_map.get(ext, default_format)

    def process_file(self, conf, files):
        filename = self.filename

        for f in files:
            compression = self._infer_compression(f, conf["decompress"])

            file_format = self._infer_file_format(f, conf["file_format"])

            if compression == "Gzip":
                logger.info("decompressing %s using gzip", f)
                ff.gzip_decompress(f, inplace=True)
            elif compression == "Zip":
                logger.info("decompressing %s using zip", f)
                ff.zip_decompress(f, inplace=True)

            skip_head_lines = conf.get("skip_head_lines", 0)

            if file_format == "Excel":
                logger.info("converting Excel to CSV...")
                ff.convert_excel_to_csv(f, skiprows=skip_head_lines, inplace=True)
            elif file_format == "JSONLines":
                logger.info("converting JSON lines to CSV...")
                ff.convert_jsonlines_to_csv(f, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True)
            elif file_format == "CSV":
                logger.info("converting CSV dialect and encoding if necessary...")
                dialect_options = self._get_custom_csv_options(conf)
                src_dialect_options = deepcopy(dialect_options)
                src_dialect_options.pop("quoting")
                src_dialect_options.pop("doublequote")
                src_dialect_options.pop("escapechar")
                ff.convert_csv_dialect(
                    f,
                    src_dialect_options=src_dialect_options,
                    dst_dialect_options=dialect_options,
                    skiprows=skip_head_lines,
                    src_encoding=conf["encoding"],
                    inplace=True,
                )

        if files:
            merge_files(files=files, filename=filename)
        return filename

    def transform_file(self, conf, filename):
        transform_func_code = conf.get("transform_func", "").strip()
        if not transform_func_code:
            return filename

        func = self._validate_transform(transform_func_code)
        if not func:
            return filename

        logger.info("calling transform function with %s", (filename,))
        result_file = func(filename)
        if result_file is None or not (isinstance(result_file, str) and os.path.isabs(result_file)):
            raise ValueError("transform must return an absolute filepath, got %s instead", result_file)
        logger.info("got %s", result_file)
        return result_file

    @staticmethod
    def _get_custom_csv_options(conf):
        rv = {
            "delimiter": unescape_backslash(conf["csv_delimiter"]),
            "lineterminator": unescape_backslash(conf["csv_lineterminator"]),
            "quotechar": '"',
            "doublequote": False,
            "escapechar": "'",
        }
        quoting = conf["csv_quoting"]
        rv["quoting"] = {
            "QUOTE_ALL": csv.QUOTE_ALL,
            "QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
            "QUOTE_NONE": csv.QUOTE_NONE,
            "QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
        }[quoting]
        return rv

    @classmethod
    def config_schema(cls):
        # get_choices_by_type = cls.get_connection_names_by_type
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("Aliyun OSS Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "oss",
                        ],
                    },
                },
                "object_path": {
                    "type": "string",
                    "title": _l("OSS Object Path"),
                    "description": _l("Object path or prefix pattern to download. Supports Jinja templating syntax."),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "decompress": {
                    "type": "string",
                    "title": _l("Decompression Method"),
                    "description": _l("Decompress downloaded file using specified method"),
                    "enum": ["None", "Gzip", "Zip"],
                    "enumNames": ["None", "Gzip", "Zip"],
                    "default": "None",
                },
                "file_format": {
                    "type": "string",
                    "title": _l("Input Format"),
                    "description": _l("Format of the source file to be converted to CSV"),
                    "enum": ["CSV", "Excel", "JSONLines"],
                    "enumNames": ["CSV", "Excel", "JSONLines"],
                    "default": "CSV",
                },
                "skip_head_lines": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Skip Header Rows"),
                    "description": _l("Number of rows to skip from the beginning of the file"),
                    "default": 0,
                    "minimum": 0,
                },
                "encoding": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("File Encoding"),
                    "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                    "default": "utf-8",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_delimiter": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Delimiter"),
                    "description": _l("Character used to separate fields in the CSV file"),
                    "default": ",",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_lineterminator": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Line Ending"),
                    "description": _l("Character sequence used to terminate lines"),
                    "enum": [r"\n", r"\r\n"],
                    "enumNames": [r"\n", r"\r\n"],
                    "default": r"\r\n",
                },
                "csv_quoting": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Quoting"),
                    "description": _l("Strategy for quoting fields in the CSV file"),
                    "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "default": "QUOTE_MINIMAL",
                },
                "transform_func": {
                    "type": "string",
                    "title": _l("Custom Transformation"),
                    "description": _l(
                        "Python function to transform the downloaded file. Must accept a filepath argument and return "
                        "the path to the transformed file. Runs after built-in transformations."
                    ),
                    "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["data_source_name", "object_path"],
        }

    @staticmethod
    def _validate_transform(raw_code):
        code = compile(raw_code, "", "exec")
        ns = {}
        exec(code, ns)
        func = ns.get("transform")
        if not func:
            return None

        if not callable(func):
            raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

        sig = inspect.signature(func)
        if tuple(sig.parameters.keys()) != ("filename",):
            raise jsonschema.ValidationError(
                message="transform must accept and only accept filename as parameter", path=("transform_func",)
            )
        return func

    @classmethod
    def validate(cls, configuration):
        conf = super().validate(configuration)

        transform_func_code = conf.get("transform_func", "").strip()
        if transform_func_code:
            cls._validate_transform(transform_func_code)
        return conf
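
For reference, the transform_func option above is checked by _validate_transform and transform_file: the code must define a function named transform that accepts exactly one filename parameter and returns an absolute path to the transformed file. A minimal illustrative hook consistent with those checks (the output filename below is an arbitrary choice, not part of the package) might look like:

import os

def transform(filename):
    # Illustrative only: drop blank lines from the dumped CSV and write the
    # result next to the input file. The task only verifies that the function
    # takes a single `filename` argument and returns an absolute filepath.
    out_path = os.path.abspath(filename + ".transformed")
    with open(filename, "r", encoding="utf-8") as src, open(out_path, "w", encoding="utf-8") as dst:
        for line in src:
            if line.strip():
                dst.write(line)
    return out_path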
recurvedata/operators/transfer_operator/dump_task_python.py
@@ -0,0 +1,212 @@

import copy
import inspect
import logging
import re

import jsonschema

from recurvedata.config import RECURVE_EXECUTOR_PYENV_NAME
from recurvedata.core.translation import _l
from recurvedata.operators.python_operator.operator import PythonRequirementsMixin
from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
from recurvedata.operators.transfer_operator.task import DumpTask

logger = logging.getLogger(__name__)

_SOURCE_SKELETON = _l(
    '''
def execute(filename: str, *args, **kwargs):
    """
    The execute function must be implemented as the entry point for ReOrc.

    Args:
        filename: Required. Output data to this file in CSV format. This file will be used as input for the Loader.

    Data Source Parameters:
        For database configurations, use ReOrc's Data Sources instead of hardcoding credentials in the code.
        When defining the execute function, use special parameter names to specify required data sources.
        ReOrc will pass corresponding pigeon connector objects that can be used for database interactions.

        Parameter naming convention:
        - Must have 'datasource_' prefix, e.g. datasource_xxx
        - Example: datasource_mysql='my_mysql_default'
          At runtime, ReOrc will pass a pigeon.connector.MysqlConnector object

    Example usage:
        def execute(filename, datasource_mysql='my_mysql_default'):
            df = datasource_mysql.get_pandas_df('SELECT * FROM my_database.my_table')
            df.to_csv(filename, header=False)
    """
    pass
'''
)


# FIXME: record all supported template variables, find a way to keep consistent with `get_template_context` method
_TEMPLATE_VARIABLES = {
    "dt",
    "yesterday",
    "yesterday_dt",
    "tomorrow",
    "tomorrow_dt",
    "logical_date",
    "data_interval_start",
    "data_interval_end",
    "data_interval_start_dt",
    "data_interval_end_dt",
}


class PythonCodeRunner(object):
    # test page: https://regex101.com/r/p8YCQc/1
    _JINJA2_VAR_PATTERN = re.compile(r"^{{\s*([^\d\W]\w*)\s*}}$")

    def __init__(self, source):
        self.source = source

        self.__namespace = {}
        self.__parameters = {}
        self.__datasource_params = {}
        self.__jinja2_variables_params = {}
        self.__ready_for_execution = False
        self.__compiled = False

    @property
    def entrypoint(self):
        if not self.__compiled:
            raise ValueError("entrypoint is not ready, inspect first")
        return self.__namespace.get("execute")

    def inspect(self):
        logger.info("compiling source code\n%s", self.source)
        code = compile(self.source, "", "exec")
        exec(code, self.__namespace)
        self.__compiled = True
        entrypoint = self.entrypoint

        if not (entrypoint and inspect.isfunction(entrypoint)):
            raise jsonschema.ValidationError(message="execute function is required", path=("source",))

        sig = inspect.signature(entrypoint)
        for name, param in sig.parameters.items():
            value = param.default
            logger.info("found parameter %s=%s", name, value)

            # special naming for data source parameters: `datasource_xxx`
            if self.is_datasource_param(name):
                if self._is_empty(value):
                    raise jsonschema.ValidationError(message=f"{name} must be known data source name", path=("source",))
                ds = DumpTask.get_connection_by_name(value)
                if not ds:
                    raise jsonschema.ValidationError(message=f"Unknown data source {repr(name)}", path=("source",))
                self.__datasource_params[name] = value

            # jinja2 template `{{ dt }}`, no Jinja2 rendering, directly replace
            elif self.is_jinja2_variable(value):
                variable = self._JINJA2_VAR_PATTERN.search(value).groups()[0]
                # unsupported variables
                if variable not in _TEMPLATE_VARIABLES:
                    raise jsonschema.ValidationError(
                        message=f"Unsupport template variable {repr(value)}", path=("source",)
                    )
                self.__jinja2_variables_params[name] = variable

            else:
                # keep default value, data source and template variable parameters are injected at runtime by calling `bind_parameters`
                self.__parameters[name] = value
        self.__parameters.update(self.__datasource_params)
        self.__parameters.update(self.__jinja2_variables_params)

    def is_datasource_param(self, name: str) -> bool:
        return name.startswith("datasource_")

    def is_jinja2_variable(self, name: str) -> bool:
        return isinstance(name, str) and self._JINJA2_VAR_PATTERN.match(name)

    @staticmethod
    def _is_empty(obj) -> bool:
        return obj is inspect.Signature.empty

    def bind_parameters(self, filename, template_context, **kwargs):
        params = copy.deepcopy(kwargs)
        params["filename"] = filename

        logger.info("binding data source connectors %s", self.__datasource_params)
        for param_name, ds_name in self.__datasource_params.items():
            params[param_name] = DumpTask.get_connection_by_name(ds_name).connector

        logger.info("binding jinja2 variables %s", self.__jinja2_variables_params)
        for param_name, variable in self.__jinja2_variables_params.items():
            params[param_name] = template_context[variable]

        # bind other parameters, or override default parameters
        for k, v in params.items():
            if k in self.__parameters:
                self.__parameters[k] = v

        # check if there are any parameters not passed
        for name, value in self.__parameters.items():
            if name not in ["args", "kwargs"] and self._is_empty(value):
                raise TypeError(f"parameter {repr(name)} is not bound")

        logger.info("bounded parameters %s", self.__parameters)
        self.__ready_for_execution = True

    def execute(self):
        if not self.__ready_for_execution:
            raise RuntimeError("must call inspect and bind_parameters before calling execute")
        logger.info("calling entrypoint %s with parameters %s...", self.entrypoint, self.__parameters)
        self.entrypoint(**self.__parameters)
        logger.info("done.")


class PythonDumpTask(DumpTask, HiveTextfileConverterMixin, PythonRequirementsMixin):
    no_template_fields = ("source",)

    def execute_impl(self, *args, **kwargs):
        config = self.rendered_config.copy()
        runner = PythonCodeRunner(config["source"])

        # Get and install requirements if any
        py_conn_configs = self.client.get_py_conn_configs()
        if py_conn_configs and isinstance(py_conn_configs, dict):
            requirements = "\n".join(py_conn_configs.get("requirements", []))
            self._install_requirements(requirements, RECURVE_EXECUTOR_PYENV_NAME)

        runner.inspect()
        context = self.get_template_context()
        runner.bind_parameters(filename=self.filename, template_context=context)
        runner.execute()

        self.convert_csv_to_hive_text_if_needed()
        return None

    @classmethod
    def config_schema(cls):
        return {
            "type": "object",
            "properties": {
                "source": {
                    "type": "string",
                    "title": _l("Python Source Code"),
                    "description": _l(
                        "Python code that extracts data and writes to a CSV file. Must implement an execute() function that takes a filename parameter. Note: The Load step must specify a Create Table DDL when using PythonDump."
                    ),
                    "default": _SOURCE_SKELETON,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["source"],
        }

    @classmethod
    def validate(cls, configuration):
        config = super().validate(configuration)

        runner = PythonCodeRunner(config["source"])
        runner.inspect()
        return config
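
As a usage sketch of the conventions above: a source string can declare a datasource_-prefixed parameter (bound to a pigeon connector) and a parameter whose default is a supported template variable such as "{{ dt }}" (bound from the template context). The connection name, table, and paths below are placeholders, and inspect() only succeeds in an executor environment where DumpTask.get_connection_by_name can resolve the named data source; PythonDumpTask.execute_impl drives the runner in essentially this sequence:

SOURCE = '''
def execute(filename, datasource_mysql="my_mysql_default", dt="{{ dt }}"):
    # datasource_mysql is replaced with a pigeon connector at runtime;
    # dt is replaced with the rendered template value for the run date.
    df = datasource_mysql.get_pandas_df(f"SELECT * FROM my_database.my_table WHERE created_date = '{dt}'")
    df.to_csv(filename, header=False)
'''

runner = PythonCodeRunner(SOURCE)
runner.inspect()  # compiles the source and validates the execute() signature
runner.bind_parameters(filename="/tmp/out.csv", template_context={"dt": "2024-01-01"})
runner.execute()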