recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib has been flagged as potentially problematic; see the registry's advisory page for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import urllib.parse
|
|
5
|
+
from functools import cached_property
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
7
|
+
|
|
8
|
+
import jsonschema
|
|
9
|
+
|
|
10
|
+
from recurvedata.consts import ETLExecutionStatus
|
|
11
|
+
from recurvedata.core.templating import Renderer
|
|
12
|
+
from recurvedata.operators.base import Configurable
|
|
13
|
+
from recurvedata.operators.context import context
|
|
14
|
+
from recurvedata.operators.models import DagBase, NodeBase
|
|
15
|
+
from recurvedata.utils.attrdict import AttrDict
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from recurvedata.executors.client import ExecutorClient
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LineageTaskMixin(object):
    """Mixin adding best-effort lineage extraction/persistence to a task.

    Lineage handling is auxiliary: ``process_lineage`` never propagates an
    exception, so a lineage failure cannot fail the task itself.
    """

    # todo: move to utils
    def process_lineage(self):
        """Parse and persist lineage, logging (but swallowing) any error."""
        try:
            lineage = self.parse_lineage()
            self.save_lineage(lineage)
        except Exception as e:
            # lineage_fail_notify(self)
            logger.exception(f"failed to process lineage, error: {e}")

    def parse_lineage(self):
        """Return the lineage payload for this task; subclasses override.

        The base implementation returns ``None`` (no lineage).
        """
        pass

    def save_lineage(self, lineage):
        """Persist a non-empty lineage payload.

        Fix: the original body called ``self.save_lineage(self, lineage)`` —
        an accidental self-call with an extra positional argument that raised
        ``TypeError`` (and, with the arity corrected, would have recursed
        forever) whenever ``lineage`` was truthy. Persistence through the
        worker SDK is not implemented yet (see todo below), so until it is we
        only log the parsed lineage instead of crashing.
        """
        if not lineage:
            return

        # todo: worker sdk — persist `lineage` once the client API exists
        logger.info("lineage parsed but persistence is not implemented yet: %s", lineage)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BaseTask(Configurable, LineageTaskMixin):
    """Base class for all operator tasks.

    Holds the DAG/node/execution-date triple, renders the node configuration
    through jinja, and drives the execute lifecycle (start -> hooks ->
    ``execute_impl`` -> finish bookkeeping -> re-raise on error).
    """

    no_template_fields = ()  # config keys that must NOT be rendered with jinja
    ds_name_fields = ()  # config keys (possibly dotted paths) that hold data-source names

    def __init__(self, dag: DagBase, node: NodeBase, execution_date: datetime.datetime, variables: dict = None):
        """Bind the task to a DAG node for one execution date.

        ``variables`` are extra jinja context entries; ``None`` means none.
        ``task_instance_id`` stays 0 until ``on_task_start`` registers the run.
        """
        self.dag: DagBase = dag
        self.node: NodeBase = node
        self.execution_date: datetime.datetime = execution_date
        self.variables: dict = variables or {}

        # raw (un-rendered) node configuration with attribute access
        self.config = AttrDict(self.node.configuration)
        self.task_instance_id: int = 0

    @classmethod
    def validate(cls, configuration: dict) -> dict:
        """Validate ``configuration`` and check data-source references resolve.

        Raises ``jsonschema.ValidationError`` when a name in
        ``ds_name_fields`` does not match a known connection.

        NOTE(review): this indexes ``configuration[name]`` directly, so a
        dotted path in ``ds_name_fields`` (which ``get_ds_name_field_values``
        supports) would raise ``KeyError`` here — confirm whether dotted
        fields are meant to be validated too.
        """
        config = super().validate(configuration)

        # validate data sources
        for name in cls.ds_name_fields:
            ds = context.get_connection_by_name(connection_name=configuration[name])
            if not ds:
                raise jsonschema.ValidationError(
                    message=f"Unknown data source {repr(configuration[name])}", path=(name,)
                )
        return config

    @classmethod
    def get_ds_name_field_values(cls, rendered_config: dict) -> list[str]:
        """Collect the distinct data-source names referenced by this config.

        A field is looked up first as a literal key; if absent and the field
        contains ``"."`` it is resolved as a dotted path into nested dicts.
        Only string leaf values are collected for dotted paths.
        """
        res = set()
        for field in cls.ds_name_fields:
            if field in rendered_config:
                ds_name = rendered_config[field]
                res.add(ds_name)
            elif "." in field:
                tmp_rendered_config = rendered_config
                for sub_field in field.split("."):
                    if sub_field not in tmp_rendered_config:
                        break
                    tmp_rendered_config = tmp_rendered_config[sub_field]
                else:
                    # full path resolved; keep it only if the leaf is a string
                    if isinstance(tmp_rendered_config, str):
                        ds_name = tmp_rendered_config
                        res.add(ds_name)
        return list(res)

    @cached_property
    def rendered_config(self) -> AttrDict:
        # cached: rendered once per task instance (see set_execution_date)
        return self.render_config()

    def render_config(self) -> AttrDict:
        """Render the raw config through jinja and return a new AttrDict.

        Values that are ``None``, listed in ``no_template_fields``, or not
        str/dict/list/tuple are passed through untouched.
        """
        result = {}
        env = Renderer()
        ctx = self.get_template_context()

        for k, v in self.config.items():
            if v is None or k in self.__class__.no_template_fields or not isinstance(v, (str, dict, list, tuple)):
                result[k] = v
            else:
                result[k] = env.render_template(v, ctx)
        return AttrDict(result)

    def get_template_context(self) -> dict[str, Any]:
        """Build the jinja context: schedule-derived values plus user variables.

        User ``variables`` override same-named schedule context entries.
        """
        ctx = Renderer.init_context(self.execution_date, self.dag.schedule_interval)
        ctx.update(self.variables)
        return ctx

    def execute(self, *args, **kwargs):
        """Run the task lifecycle and re-raise any ``execute_impl`` error.

        Order: on_task_start -> before_execute_hook -> execute_impl ->
        after_execute_hook -> on_task_finish -> raise (if failed). The hooks
        and finish bookkeeping run even when ``execute_impl`` fails.
        """
        # TODO: create new task instance, send request to server or message queue?

        self.on_task_start()

        self.before_execute_hook()

        error = None
        meta = None
        error_stack = None

        logger.info("task configuration: %s", json.dumps(self.rendered_config, indent=2, ensure_ascii=False))
        try:
            meta = self.execute_impl(*args, **kwargs)
        except Exception as exc:
            error = exc
            error_stack = exc.__repr__()
            self.on_execute_impl_error(exc)

        self.after_execute_hook()

        self.on_task_finish(meta, error, error_stack)  # todo: try except?

        if error is not None:
            raise error

    def on_task_start(self):
        """Register the run with the execution context; records the instance id."""
        self.task_instance_id = context.init_task_instance_on_task_start(self)

    def on_task_finish(self, meta: Any, error: Exception, error_stack: str):
        """Report final status and (JSON-serialized) meta to the context.

        ``meta`` is serialized via its ``to_json()``; serialization failures
        are logged and the meta dropped. Status is FAILED iff ``error_stack``
        is truthy.
        """
        try:
            if meta:
                meta = meta.to_json()
        except Exception as e:
            logger.debug(f"failed to get json from meta {meta}, error: {e}")
            meta = None
        if error_stack:
            task_status = ETLExecutionStatus.FAILED
        else:
            task_status = ETLExecutionStatus.SUCCESS
        context.update_task_instance_on_task_finish(self, self.task_instance_id, task_status, meta, error, error_stack)

    def before_execute_hook(self):
        """Hook run just before ``execute_impl``; subclasses may override."""
        pass

    def after_execute_hook(self):
        """Hook run after ``execute_impl`` (even on failure); subclasses may override."""
        pass

    def on_execute_impl_error(self, exc: Exception):
        """callback function to be called if `execute_impl` throws exceptions"""
        pass

    def execute_impl(self, *args, **kwargs):
        """The actual work; subclasses must implement."""
        raise NotImplementedError

    def get_query_comment_conf(self) -> str:
        """Build the 'Source/Owner/Node' comment string appended to queries."""
        query_config = {
            "Source": "Recurve",
            "Owner": self.dag.owner,
            "Node": self.node_url,
        }
        return ", ".join(["{}: {}".format(k, v) for k, v in query_config.items()])

    def set_execution_date(self, execution_date):
        """Replace the execution date, freezing the already-rendered config.

        ``rendered_config`` depends on ``self.execution_date``; touching the
        cached property here forces the render with the OLD execution date
        before the new one is swapped in.
        """
        if execution_date == self.execution_date:
            return
        _ = self.rendered_config
        # rendered_config depends on self.execution_date: render with the old
        # execution_date first, then replace self.execution_date
        self.execution_date = execution_date

    @property
    def node_url(self) -> str:
        """Deep link to this node in the web UI (opens the node drawer)."""
        # https://dev-test.recurve.test.recurvedata.com/datawork/workflow?p_id=257942399102349312&wf_id=258282502478635008&open_drawer=true&node_key=D2f0I
        host = context.client.base_url  # todo: correct it
        query_string = urllib.parse.urlencode(
            {"p_id": self.dag.project_id, "job_id": self.dag.id, "node_key": self.node.node_key, "open_drawer": "true"}
        )
        return f"{host}/datawork/workspace/job?{query_string}"

    # add proxy methods to avoid importing context everywhere

    @staticmethod
    def get_connection_by_name(name: str):
        return context.get_connection_by_name(name)

    @staticmethod
    def must_get_connection_by_name(name: str):
        return context.must_get_connection_by_name(name)

    @staticmethod
    def get_connection_names_by_type(connection_type: Union[str, list[str]]) -> list[str]:
        return context.get_connection_names_by_type(connection_type)

    @property
    def stage(self) -> Optional[str]:
        # no stage by default; subclasses may override
        return

    @property
    def client(self) -> "ExecutorClient":
        # proxy to the context's executor client
        return context.client
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from recurvedata.operators.transfer_operator.dump_aliyun_sls import AliyunSLSDumpTask
|
|
2
|
+
from recurvedata.operators.transfer_operator.dump_task_dbapi import DBAPIDumpTask
|
|
3
|
+
from recurvedata.operators.transfer_operator.dump_task_es import ElasticSearchDumpTask
|
|
4
|
+
from recurvedata.operators.transfer_operator.dump_task_feishu_sheet import FeishuSheetDumpTask
|
|
5
|
+
from recurvedata.operators.transfer_operator.dump_task_ftp import FTPDumpTask
|
|
6
|
+
from recurvedata.operators.transfer_operator.dump_task_google_sheet import GoogleSheetDumpTask
|
|
7
|
+
|
|
8
|
+
# from recurvedata.operators.transfer_operator.dump_task_cass import CassandraDumpTask
|
|
9
|
+
from recurvedata.operators.transfer_operator.dump_task_mongodb import MongoDBDumpTask
|
|
10
|
+
from recurvedata.operators.transfer_operator.dump_task_oss import AliyunOSSDumpTask
|
|
11
|
+
from recurvedata.operators.transfer_operator.dump_task_python import PythonDumpTask
|
|
12
|
+
from recurvedata.operators.transfer_operator.dump_task_s3 import S3DumpTask
|
|
13
|
+
from recurvedata.operators.transfer_operator.dump_task_sftp import SFTPDumpTask
|
|
14
|
+
from recurvedata.operators.transfer_operator.load_task_aliyun_oss import AliyunOSSLoadTask
|
|
15
|
+
from recurvedata.operators.transfer_operator.load_task_azure_blob import AzureBlobStorageLoadTask
|
|
16
|
+
from recurvedata.operators.transfer_operator.load_task_clickhouse import ClickHouseLoadTask
|
|
17
|
+
|
|
18
|
+
# from recurvedata.operators.transfer_operator.load_task_filebrowser import FileBrowserLoadTask
|
|
19
|
+
# from recurvedata.operators.transfer_operator.load_task_hive import HiveLoadTask
|
|
20
|
+
# from recurvedata.operators.transfer_operator.load_task_owncloud import OwnCloudLoadTask
|
|
21
|
+
# from recurvedata.operators.transfer_operator.load_task_recurve_data_prep import DataPrepLoadTask
|
|
22
|
+
# from recurvedata.operators.transfer_operator.load_task_yicrowds import YiCrowdsLoadTask
|
|
23
|
+
# from recurvedata.operators.transfer_operator.load_task_azure_synapse import AzureSynapseLoadTask
|
|
24
|
+
# from recurvedata.operators.transfer_operator.load_task_email import EmailLoadTask
|
|
25
|
+
from recurvedata.operators.transfer_operator.load_task_doris import DorisLoadTask
|
|
26
|
+
from recurvedata.operators.transfer_operator.load_task_es import ElasticSearchLoadTask
|
|
27
|
+
from recurvedata.operators.transfer_operator.load_task_ftp import FTPLoadTask
|
|
28
|
+
from recurvedata.operators.transfer_operator.load_task_google_bigquery import GoogleBigqueryLoadTask
|
|
29
|
+
from recurvedata.operators.transfer_operator.load_task_google_cloud_storage import GoogleCloudStorageLoadTask
|
|
30
|
+
from recurvedata.operators.transfer_operator.load_task_google_sheet import GoogleSheetLoadTask
|
|
31
|
+
from recurvedata.operators.transfer_operator.load_task_microsoft_fabric import MicrosoftFabricLoadTask
|
|
32
|
+
from recurvedata.operators.transfer_operator.load_task_mssql import MsSQLLoadTask
|
|
33
|
+
from recurvedata.operators.transfer_operator.load_task_mysql import MySQLLoadTask
|
|
34
|
+
from recurvedata.operators.transfer_operator.load_task_postgresql import PostgresqlLoadTask
|
|
35
|
+
from recurvedata.operators.transfer_operator.load_task_qcloud_cos import TencentCOSLoadTask
|
|
36
|
+
from recurvedata.operators.transfer_operator.load_task_redshift import RedshiftLoadTask
|
|
37
|
+
from recurvedata.operators.transfer_operator.load_task_s3 import S3LoadTask
|
|
38
|
+
from recurvedata.operators.transfer_operator.load_task_sftp import SFTPLoadTask
|
|
39
|
+
from recurvedata.operators.transfer_operator.load_task_starrocks import StarRocksLoadTask
|
|
40
|
+
from recurvedata.operators.transfer_operator.operator import TransferOperator
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
|
|
3
|
+
from recurvedata.core.translation import _l
|
|
4
|
+
from recurvedata.operators.transfer_operator import utils
|
|
5
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
6
|
+
from recurvedata.pigeon.dumper.aliyun_sls import AliyunSLSDumper
|
|
7
|
+
from recurvedata.utils import extract_dict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AliyunSLSDumpTask(DumpTask):
    """Dump task that pulls log records from Aliyun SLS via ``AliyunSLSDumper``.

    The connection named by ``data_source_name`` supplies the endpoint and
    access-key credentials; the remaining options come from the rendered
    task configuration.
    """

    ds_name_fields = ("data_source_name",)

    def execute_impl(self, *args, **kwargs):
        """Assemble dumper options from config + connection and run the dump."""
        # NOTE: the data-source name is read from the raw config (not rendered)
        connection = self.must_get_connection_by_name(self.config["data_source_name"])
        factory = self.create_handler_factory()

        options = extract_dict(
            self.rendered_config, keys=["project", "logstore", "query", "start_time", "end_time", "fields"]
        )
        credentials = connection.data
        options.update(
            {
                "endpoint": credentials.get("endpoint"),
                "access_key_id": credentials.get("access_key_id"),
                "access_key_secret": credentials.get("access_key_secret"),
                "handler_factories": [factory],
            }
        )
        return AliyunSLSDumper(**options).execute()

    @classmethod
    def config_schema(cls):
        """JSON schema describing the task's configuration form."""
        connection_field = {
            "type": "string",
            "title": _l("Aliyun Access Key"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {
                "supportTypes": ["aliyun_access_key"],
            },
        }
        query_field = {
            "type": "string",
            "title": _l("Query"),
            "description": _l("Query to retrieve logs from Aliyun SLS."),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "sql",
            },
        }
        start_field = {
            "type": "string",
            "description": _l(
                "Start time of the data to retrieve, supports Jinja templating for dynamic. Format: %Y-%m-%d %H:%M:%S"
            ),
            "title": _l("Start Time"),
            "default": "{{ data_interval_start }}",
        }
        end_field = {
            "type": "string",
            "description": _l(
                "End time of the data to retrieve, supports Jinja templating for dynamic. Format: %Y-%m-%d %H:%M:%S"
            ),
            "title": _l("End Time"),
            "default": "{{ data_interval_end }}",
        }
        fields_field = {
            "type": "string",
            "title": _l("Fields"),
            "description": _l("Comma-separated list of fields to retrieve. Leave empty to get all fields."),
        }
        return {
            "type": "object",
            "properties": {
                "data_source_name": connection_field,
                "project": {"type": "string", "title": _l("Project Name")},
                "logstore": {"type": "string", "title": _l("Logstore Name")},
                "query": query_field,
                "start_time": start_field,
                "end_time": end_field,
                "fields": fields_field,
                "transform": copy.deepcopy(utils.TRANSFORM),
            },
            "required": ["data_source_name", "project", "logstore", "start_time", "end_time"],
        }
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
import jsonschema
|
|
7
|
+
|
|
8
|
+
from recurvedata.pigeon.handler.csv_handler import CSVFileHandler
|
|
9
|
+
from recurvedata.pigeon.utils import ensure_str_list, fs
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
except ImportError:
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
from recurvedata.core.translation import _l
|
|
18
|
+
from recurvedata.operators.transfer_operator.task import DumpTask
|
|
19
|
+
from recurvedata.operators.utils import infer_schema_from_dataframe, parse_to_date
|
|
20
|
+
from recurvedata.utils.attrdict import AttrDict
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)

# Default body shown in the "Custom Transform" config field: a no-op
# transform the user can edit in place. Kept as a plain string because it is
# rendered into a code editor, not executed here.
_transform_default_value = """\
import pandas as pd


def transform(df: pd.DataFrame) -> pd.DataFrame:
    return df
"""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SheetDumpTaskBase(DumpTask):
    """Base class for dump tasks that load tabular ("sheet") data.

    Subclasses implement :meth:`read_origin_df` to produce a pandas
    DataFrame; this base class then applies the configured built-in
    transformations, validations and an optional user-supplied transform
    function, and finally writes the result to ``self.filename`` as CSV
    together with an inferred schema file.

    Subclasses customize the configuration form via
    ``custom_config_schema_properties`` / ``custom_config_schema_required``.
    """

    # Abstract base: concrete subclasses register themselves instead.
    _AUTO_REGISTER = False

    # Configuration options shared by every sheet-style dump task.
    common_config_schema_properties = {
        "extra_read_kwargs": {
            "type": "string",
            "title": _l("Additional Read Parameters"),
            "description": _l(
                "Additional parameters to pass to pandas read_csv or read_excel functions in JSON format"
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "type_mapping": {
            "type": "string",
            "title": _l("Column Type Mapping"),
            "description": _l(
                'Specify data types for columns using format {"column_name": "data_type"}. '
                "This mapping is passed to DataFrame.astype() - see "
                '<a target="_blank" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html">'
                "pandas documentation</a> for supported types."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "date_columns": {
            "type": "string",
            "title": _l("Date Format Columns"),
            "description": _l("Comma-separated list of column names to parse as dates"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "fillna_to_null": {
            "type": "boolean",
            "title": _l("Convert NaN to NULL"),
            "default": True,
        },
        "order_by": {
            "type": "string",
            "title": _l("Sort Order"),
            "description": _l(
                "Comma-separated list of columns to sort rows by. Original order is preserved if not specified."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "column_name_mapping": {
            "type": "string",
            "title": _l("Rename Columns"),
            "description": _l('Map old column names to new names using format {"old_name": "new_name"}'),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "result_columns": {
            "type": "string",
            "title": _l("Output Columns"),
            "description": _l(
                "Comma-separated list of columns to include in output and their order. All columns included if not specified."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "primary_keys": {
            "type": "string",
            "title": _l("Unique Key Columns"),
            "description": _l(
                "Comma-separated list of columns that should contain unique values. "
                "Task will fail if duplicates are found. Leave empty to skip uniqueness check."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "not_nullable_columns": {
            "type": "string",
            "title": _l("Required Columns"),
            "description": _l(
                "Comma-separated list of columns that must not contain NULL values. "
                "Task will fail if NULL values are found in these columns."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "transform_func": {
            "type": "string",
            "title": _l("Custom Transform"),
            "description": _l(
                "Optional Python function to transform the DataFrame. Must accept and return a pandas DataFrame. "
                "This transformation is applied after all other processing steps."
            ),
            "default": _transform_default_value,
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "python",
            },
        },
    }

    # Subclass hook: extra schema properties (override common ones on clash).
    custom_config_schema_properties = {}

    # Subclass hook: additional required configuration keys.
    custom_config_schema_required = []

    @classmethod
    def config_schema(cls):
        """Build the task's JSON schema.

        Subclass-specific properties take precedence; common properties are
        only added for keys the subclass did not define itself.
        """
        schema = {
            "type": "object",
            "properties": {},
            "required": cls.custom_config_schema_required,
        }
        schema["properties"].update(cls.custom_config_schema_properties)
        for k, v in cls.common_config_schema_properties.items():
            if k not in schema["properties"]:
                schema["properties"][k] = v
        return schema

    def execute_impl(self, *args, **kwargs):
        """Read the source data, post-process it, and export it to CSV."""
        conf = self.rendered_config
        df = self.read_origin_df()

        # Order matters: declarative transforms, then validations on the
        # transformed data, then the user's custom function last.
        df = self.apply_builtin_transform(conf, df)
        df = self.apply_validations(conf, df)
        df = self.apply_custom_transform_func(conf, df)

        self.df_to_csv(df)

    def read_origin_df(self) -> "pd.DataFrame":
        """Load the raw data as a DataFrame. Subclasses must implement this."""
        raise NotImplementedError

    def df_to_csv(self, df: "pd.DataFrame"):
        """Write ``df`` row-by-row through the CSV handler and dump a schema
        file (inferred from the DataFrame) next to the CSV output."""
        logger.info(f"result DataFrame shape {df.shape}, dtypes:\n{df.dtypes}")
        logger.info(df.head())

        handler: CSVFileHandler = self.create_handler_factory().create_handler()
        for row in df.itertuples(index=False):
            handler.handle(row)
        handler.close()
        # The handler may have written to its own path; move it into place.
        if handler.filename != self.filename and os.path.exists(handler.filename):
            os.rename(handler.filename, self.filename)
        logger.info(f"exported {len(df)} rows into {self.filename}")

        schema = infer_schema_from_dataframe(df)
        schema_filename = fs.schema_filename(self.filename)
        schema.dump(schema_filename)
        logger.info(f"saving schema to {schema_filename}")

    @staticmethod
    def apply_builtin_transform(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Apply the declarative transformations configured on the task.

        Steps (each optional, controlled by ``conf``): dtype conversion,
        date parsing, NaN -> None conversion, sorting, column renaming and
        column selection/reordering. Returns the transformed DataFrame.
        """
        logger.info("apply_builtin_transform...")
        if conf.type_mapping:
            logger.info(f" * convert dtypes with {conf.type_mapping}")
            df = df.astype(json.loads(conf.type_mapping))

        if conf.date_columns:
            cols = ensure_str_list(conf.date_columns)
            logger.info(f" * parse {cols} to date")
            for col in cols:
                df[col] = df[col].map(parse_to_date)

        if conf.fillna_to_null:
            logger.info(" * fillna with None")
            # Normalize every missing marker (NaN/NaT) to Python None so the
            # CSV layer emits NULLs instead of "nan" strings.
            df = df.fillna(np.nan).replace([np.nan], [None])

        if conf.order_by:
            cols = ensure_str_list(conf.order_by)
            # Fixed: was logger.info(" * sort by {cols") — missing f-prefix
            # and closing brace, which logged the literal placeholder text.
            logger.info(f" * sort by {cols}")
            df = df.sort_values(by=cols)

        if conf.column_name_mapping:
            logger.info(f" * apply column name mapping {conf.column_name_mapping}")
            df = df.rename(json.loads(conf.column_name_mapping), axis=1)

        if conf.result_columns:
            cols = ensure_str_list(conf.result_columns)
            logger.info(f" * change result columns with {cols}")
            df = df[cols]

        return df

    @staticmethod
    def apply_validations(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Fail the task when configured data-quality checks are violated.

        Raises:
            ValueError: if duplicate primary-key rows exist, or if any
                configured not-nullable column contains NULLs.
        """
        logger.info("apply_validations...")
        if conf.primary_keys:
            logger.info(" * checking duplication...")
            duplicate = df[df.duplicated(subset=ensure_str_list(conf.primary_keys))]
            if not duplicate.empty:
                logger.error(f"duplicate rows: {duplicate}")
                raise ValueError("duplication detected")

        if conf.not_nullable_columns:
            cols = ensure_str_list(conf.not_nullable_columns)
            logger.info(f" * checking null to columns {cols}...")
            # Collect all offending columns before failing, so the error
            # message reports every problem at once.
            null_cols = []
            for col in cols:
                if df[col].isnull().values.any():
                    null_cols.append(col)
            if null_cols:
                logger.error(f"{null_cols} contains null values")
                raise ValueError(f"{null_cols} contains null values")

        return df

    @staticmethod
    def apply_custom_transform_func(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Run the user-supplied ``transform`` function, if configured.

        Raises:
            ValueError: if the function does not return a pandas DataFrame.
        """
        if not conf.transform_func:
            return df
        func = validate_transform(conf.transform_func)
        if not func:
            return df

        logger.info("apply transform function...")
        df = func(df)
        if not isinstance(df, pd.DataFrame):
            # Fixed grammar in error message ("an Pandas" -> "a pandas").
            raise ValueError(f"transform function must return a pandas DataFrame object, got {type(df)} instead")
        return df

    @classmethod
    def validate(cls, configuration):
        """Validate the config against the schema, then statically check the
        custom transform snippet (if any) so bad code fails at save time."""
        conf = super().validate(configuration)

        transform_func_code = conf.get("transform_func", "").strip()
        if transform_func_code:
            validate_transform(transform_func_code)
        return conf
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def validate_transform(raw_code):
    """Compile and sanity-check a user-supplied transform snippet.

    Executes ``raw_code`` and looks up a module-level ``transform`` object,
    which must be a callable accepting exactly one parameter named ``df``.

    NOTE(security): this exec()s the configured snippet. The code comes from
    trusted task configuration, not end-user input — do not route untrusted
    strings through here.

    Args:
        raw_code: Python source text expected to define ``transform``.

    Returns:
        The ``transform`` callable, or ``None`` when the snippet does not
        define ``transform`` at all.

    Raises:
        SyntaxError: if ``raw_code`` does not compile.
        jsonschema.ValidationError: if ``transform`` is defined but is not
            callable, or its signature is not exactly ``(df)``.
    """
    # Name the pseudo-file so SyntaxError tracebacks point somewhere useful
    # (previously compiled with an empty filename).
    code = compile(raw_code, "<transform_func>", "exec")
    ns = {}
    exec(code, ns)
    # Use an explicit None check: a defined-but-falsy non-callable (e.g.
    # ``transform = 0``) should be rejected below, not silently ignored.
    func = ns.get("transform")
    if func is None:
        return None

    if not callable(func):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    sig = inspect.signature(func)
    if tuple(sig.parameters.keys()) != ("df",):
        raise jsonschema.ValidationError(
            message="transform must accept and only accept df as parameter", path=("transform_func",)
        )
    return func
|