recurvedata-lib 0.1.487 (py2.py3-none-any.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/operators/sensor_operator/operator.py
@@ -0,0 +1,172 @@
+import logging
+import os
+from tempfile import NamedTemporaryFile
+
+from recurvedata.core.translation import _l
+from recurvedata.operators.operator import BaseOperator
+from recurvedata.operators.task import BaseTask
+from recurvedata.utils.mp import robust_run_subprocess
+
+logger = logging.getLogger(__name__)
+
+AIRFLOW_PYTHON_PATH = "python"  # system python path
+
+
+class SensorTask(BaseTask):
+    """
+    The sensor operator creates a sensor that checks the status of an upstream
+    task and waits until that task has succeeded.
+    It uses the Airflow ExternalTaskSensor to check the status of the upstream task.
+    For a modeling pipeline,
+    the node_key is the node_key of the selected model.
+    For an advanced pipeline:
+    - for a normal operator such as SQLOperator, the node_key is the node_key of the SQLOperator.
+    - for a modeling pipeline (which is a LinkModelPipelineOperator), the node_key is "{node_key of the LinkModelPipelineOperator}.{node_key of the model}"
+
+    """
+
+    # todo: same schedule interval dependency
+
+    @classmethod
+    def config_schema(cls):
+        return {
+            "type": "object",
+            "properties": {
+                "project_id": {
+                    "type": "string",
+                    "title": _l("Project Name"),
+                    "description": _l("Project Name containing the external task"),
+                    "ui:field": "SensorOperatorProjectSelectField",
+                },
+                "job_id": {
+                    "type": "string",
+                    "title": _l("Job Name"),
+                    "description": _l("Job Name of the external task"),
+                    "ui:field": "SensorOperatorJobSelectField",
+                },
+                "node_key": {
+                    "type": "string",
+                    "title": _l("Node Name"),
+                    "description": _l("Node Name of the external task"),
+                    "ui:field": "SensorOperatorNodeSelectField",
+                },
+                "wait_time": {
+                    "type": "integer",
+                    "title": _l("Wait Time"),
+                    "description": _l("Wait time in seconds between checks"),
+                    "ui:options": {
+                        "min": 0,
+                        "step": 1,
+                    },
+                },
+                "timeout": {
+                    "type": "integer",
+                    "title": _l("Timeout"),
+                    "description": _l("Timeout"),
+                    "default": 60,
+                    "ui:options": {
+                        "min": 0,
+                        "step": 1,
+                    },
+                },
+            },
+            "required": ["project_id", "job_id", "node_key"],
+        }
+
+    def generate_airflow_operator_code(self):
+        config = self.rendered_config
+        external_job_id = config["job_id"]
+        external_node_key = config["node_key"]
+        execution_delta = config.get("wait_time", 0)
+        timeout = config.get("timeout", 3600)
+        dag_name = self.dag.name
+
+        return f"""
+import sys
+from recurvedata.operators.sensor_operator.airflow_utils import format_external_dag_id, format_external_task_id, data_interval_end_to_data_interval_start
+from airflow.sensors.external_task import ExternalTaskSensor
+import datetime
+import logging
+from recurvedata.schedulers.consts import is_dev_run_job
+
+logger = logging.getLogger()
+external_dag_id = format_external_dag_id({external_job_id!r})
+external_task_id = format_external_task_id({external_node_key!r})
+
+external_dag = get_dag_from_db(external_dag_id)
+if not external_dag:
+    raise ValueError("External DAG not found")
+external_task = external_dag.get_task(external_task_id)
+if not external_task:
+    raise ValueError("External Task not found")
+
+data_interval_end = context["data_interval_end"]
+external_data_interval_end = data_interval_end - datetime.timedelta(seconds={execution_delta})
+external_data_interval_start = data_interval_end_to_data_interval_start(external_dag, external_data_interval_end)
+
+logger.debug("external_data_interval_start " + str(external_data_interval_start))
+
+tmp_task_id = "tmp_task_id_for_external_task_sensor"
+operator = ExternalTaskSensor(
+    dag=dag,
+    task_id=tmp_task_id,
+    external_dag_id=external_dag_id,
+    external_task_id=external_task_id,
+    execution_date_fn=lambda *args, **kwargs: external_data_interval_start,
+    execution_timeout=datetime.timedelta(seconds={timeout}),
+)
+if is_dev_run_job({dag_name!r}):
+    logger.info(f"dag_name: {dag_name!r}")
+    logger.info("skip: SensorOperator is not working in dev mode")
+    sys.exit(0)
+
+"""
+
+    def generate_airflow_code(self) -> str:
+        config = self.rendered_config
+        timeout = config.get("timeout", 3600)
+        operator_code = self.generate_airflow_operator_code()
+        return """
+import os
+import time
+from recurvedata.operators.sensor_operator.airflow_utils import prepare_airflow_env, get_dag_from_db, \
+    build_execute_context
+from recurvedata.utils.timeout import timeout
+
+prepare_airflow_env()
+
+dag_id = os.environ.get("AIRFLOW_CTX_DAG_ID")
+task_id = os.environ.get("AIRFLOW_CTX_TASK_ID")
+run_id = os.environ.get("AIRFLOW_CTX_DAG_RUN_ID")
+
+dag = get_dag_from_db(dag_id)
+task = dag.get_task(task_id)
+context = build_execute_context(dag, task, run_id)
+
+{operator_code}
+
+with timeout({timeout}):
+    operator.execute(context)
+""".format(
+            operator_code=operator_code,
+            timeout=timeout,
+        )
+
+    def __run_airflow_operator(self, filename: str):
+        script_path = os.path.abspath(filename)
+        env = os.environ.copy()
+        output, ret_code = robust_run_subprocess([AIRFLOW_PYTHON_PATH, script_path], _logger=logger, env=env)
+        if ret_code:
+            raise RuntimeError(f"Airflow Error:\n{output}")
+
+    def execute_impl(self, *args, **kwargs):
+        code = self.generate_airflow_code()
+        prefix = f"reorc_sensor_operator_{self.dag.id}_{self.node.id}_"
+        with NamedTemporaryFile(mode="w+t", prefix=prefix, suffix=".py") as tmp_file:
+            tmp_file.write(code)
+            tmp_file.flush()
+            self.__run_airflow_operator(tmp_file.name)
+
+
+class SensorOperator(BaseOperator):
+    task_cls = SensorTask
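For reference, a task configuration that satisfies the schema above might look like the following minimal sketch (the project, job, and node identifiers are hypothetical):

    # Hypothetical SensorTask configuration (identifiers made up for illustration).
    config = {
        "project_id": "analytics",               # project containing the external task
        "job_id": "daily_ingest",                # job of the external task
        # For a model inside a LinkModelPipelineOperator, node_key is
        # "<node_key of the link operator>.<node_key of the model>":
        "node_key": "link_models.orders_daily",
        "wait_time": 300,                        # used as the offset (in seconds) applied to the external data interval
        "timeout": 3600,                         # execution timeout of the ExternalTaskSensor, in seconds
    }

generate_airflow_code() renders such a config into a standalone script that rebuilds the Airflow context, instantiates the ExternalTaskSensor shown above, and runs it in a subprocess via robust_run_subprocess.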
recurvedata/operators/spark_operator/__init__.py
@@ -0,0 +1 @@
+from recurvedata.operators.spark_operator.operator import SparkOperator
recurvedata/operators/spark_operator/operator.py
@@ -0,0 +1,200 @@
+import json
+import logging
+import os
+from functools import lru_cache
+from importlib import resources
+from subprocess import PIPE, STDOUT, Popen
+from tempfile import NamedTemporaryFile
+from textwrap import dedent
+
+from recurvedata.core.translation import _l
+from recurvedata.operators.operator import BaseOperator
+from recurvedata.operators.task import BaseTask
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache()
+def get_sample_code():
+    return resources.files("recurvedata.operators.spark_operator").joinpath("spark_sample.py").read_text()
+
+
+class SparkTask(BaseTask):
+    @classmethod
+    def config_schema(cls):
+        return {
+            "type": "object",
+            "properties": {
+                "spark_source": {
+                    "type": "string",
+                    "title": _l("Spark Environment"),
+                    "ui:field": "ProjectConnectionSelectorField",
+                    "ui:options": {
+                        "supportTypes": [
+                            "spark",
+                        ],
+                    },
+                    "description": _l("Select the Spark environment and version to use for this task"),
+                },
+                "env": {
+                    "type": "string",
+                    "title": _l("Environment Variables"),
+                    "default": "{}",
+                    "description": _l(
+                        'Additional environment variables in JSON format (e.g. {"HADOOP_CONF_DIR": "/etc/hadoop/conf"})'
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "code",
+                        "lang": "json",
+                    },
+                },
+                "execution_config": {
+                    "type": "string",
+                    "title": _l("Spark Configuration"),
+                    "default": dedent(
+                        """\
+                        {
+                            "master": "yarn",
+                            "executor-memory": "4g",
+                            "num-executors": "10",
+                            "executor-cores": "2",
+                            "queue": "default",
+                            "conf": {
+                                "spark.dynamicAllocation.enabled": "False"
+                            }
+                        }
+                        """
+                    ),
+                    "description": _l(
+                        "Spark execution parameters and configurations. See "
+                        "<a target='_blank' href='https://spark.apache.org/docs/latest/configuration.html'>"
+                        "Spark Docs</a> for available options"
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "code",
+                        "lang": "json",
+                    },
+                },
+                "code": {
+                    "type": "string",
+                    "title": _l("Spark Code"),
+                    "default": get_sample_code(),
+                    "description": _l(
+                        "PySpark code to execute. The default template shows how to create a SparkSession "
+                        "(Spark 2.3+). Supports Jinja templating for dynamic code generation."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "code",
+                        "lang": "python",
+                    },
+                },
+            },
+            "required": ["spark_source", "env", "execution_config", "code"],
+        }
+
+    def __create_env(self, source_env, extra_env):
+        env = os.environ.copy()
+        env.update(source_env)
+        env.update(extra_env)
+        return env
+
+    def __create_bash_command(self, script_path, submitter, execution_config, source_conf: dict):
+        execution_conf_list = []
+        conf_names = []
+        for k, v in execution_config.items():
+            if k == "conf":
+                for k2, v2 in v.items():
+                    execution_conf_list.append(f"--conf {k2}={v2}")
+                    conf_names.append(k2)
+            else:
+                execution_conf_list.append(f"--{k} {v}")
+        for k, v in source_conf.items():
+            if k in conf_names:
+                continue
+            execution_conf_list.append(f"--conf {k}={v}")
+        execution_conf_str = " ".join(execution_conf_list)
+        bash_command = submitter + " " + execution_conf_str + " " + script_path
+        return bash_command
+
+    def __execute_command(self, bash_command, env):
+        logger.info("Running command: %s", bash_command)
+        sub_process = Popen(["bash", "-c", bash_command], stdout=PIPE, stderr=STDOUT, env=env)
+        logger.info("Output:")
+        for raw_line in iter(sub_process.stdout.readline, b""):
+            line = raw_line.decode("utf8").rstrip()
+            logger.info(line)
+        sub_process.wait()
+        logger.info("Node exited with return code %s", sub_process.returncode)
+        if sub_process.returncode:
+            raise Exception("Spark node failed")
+
+    @classmethod
+    def __filter_empty_value_in_dict(cls, dct: dict):
+        if not dct:
+            return dct
+        return {k: v for (k, v) in dct.items() if (v is not None and v != "" and v != {})}
+
+    @classmethod
+    def _merge_dict(cls, priority_dct: dict, other_dct: dict):
+        """
+        Merge two dicts recursively, giving precedence to priority_dct.
+        """
+        if not (other_dct and priority_dct):
+            return priority_dct or other_dct
+        result_dct = {}
+        for key in set(list(priority_dct.keys()) + list(other_dct.keys())):
+            if key not in priority_dct:
+                result_dct[key] = other_dct[key]
+                continue
+            if key not in other_dct:
+                result_dct[key] = priority_dct[key]
+                continue
+            if isinstance(priority_dct[key], dict) and isinstance(other_dct[key], dict):
+                result_dct[key] = cls._merge_dict(priority_dct[key], other_dct[key])
+                continue
+            result_dct[key] = priority_dct[key]
+        return result_dct
+
+    def __execute_spark_code(self, config):
+        spark_source = self.must_get_connection_by_name(config.spark_source)
+        submitter = spark_source.extra.get("submitter")
+        source_conf = spark_source.extra.get("conf", {})
+        source_env = self.__filter_empty_value_in_dict(
+            spark_source.extra.get("env")
+        )  # some empty values may have been saved from the page
+        execution_config = self._merge_dict(
+            self.__filter_empty_value_in_dict(json.loads(config.execution_config)),
+            self.__filter_empty_value_in_dict(spark_source.extra.get("execution_config")),
+        )
+        extra_env = json.loads(config.env)  # user input, don't filter empty values
+        code = config.code
+
+        prefix = f"recurve_pyspark_{self.dag.dag_id}_{self.node.id}_"
+        with NamedTemporaryFile(mode="w+t", prefix=prefix, suffix=".py") as tmp_file:
+            tmp_file.write(code)
+            tmp_file.flush()
+            logger.info(code)
+            script_path = os.path.abspath(tmp_file.name)
+            bash_command = self.__create_bash_command(script_path, submitter, execution_config, source_conf)
+            env = self.__create_env(source_env, extra_env)
+            self.__execute_command(bash_command, env)
+
+    def execute_impl(self, *args, **kwargs):
+        config = self.rendered_config
+        self.__execute_spark_code(config)
+        return None
+
+
+class SparkOperator(BaseOperator):
+    task_cls = SparkTask
+
+    @classmethod
+    def validate(cls, configuration):
+        config = super().validate(configuration)
+        # execution_config = json.loads(config['execution_config'])
+        # if execution_config['master'] != 'yarn':
+        #     raise jsonschema.ValidationError(message='master should be yarn')
+        return config
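To make the command assembly concrete, here is a minimal sketch of how __create_bash_command combines a task-level execution_config with the connection-level conf (the submitter and all values are hypothetical; task-level "conf" entries win over connection-level ones):

    # Hypothetical inputs, for illustration only.
    execution_config = {
        "master": "yarn",
        "executor-memory": "4g",
        "conf": {"spark.dynamicAllocation.enabled": "False"},
    }
    source_conf = {
        "spark.dynamicAllocation.enabled": "True",  # skipped: already set at the task level
        "spark.yarn.queue": "etl",                  # appended: not set at the task level
    }
    # With submitter == "spark-submit", the resulting command is roughly:
    # spark-submit --master yarn --executor-memory 4g \
    #     --conf spark.dynamicAllocation.enabled=False \
    #     --conf spark.yarn.queue=etl /tmp/recurve_pyspark_<dag>_<node>_XXXX.py

The same precedence applies one step earlier in _merge_dict, where the task's execution_config JSON is merged over the connection's stored execution_config.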
recurvedata/operators/spark_operator/spark_sample.py
@@ -0,0 +1,47 @@
+try:
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+    # Initialize a Spark session
+    spark = SparkSession.builder.appName("PySpark SQL Example").getOrCreate()
+
+    # Define the schema
+    schema = StructType(
+        [
+            StructField("name", StringType(), True),
+            StructField("age", IntegerType(), True),
+            StructField("city", StringType(), True),
+        ]
+    )
+
+    # Create a DataFrame manually
+    data = [
+        ("Alice", 34, "New York"),
+        ("Bob", 45, "Los Angeles"),
+        ("Cathy", 29, "Chicago"),
+        ("David", 31, "New York"),
+        ("Emma", 42, "San Francisco"),
+    ]
+
+    df = spark.createDataFrame(data, schema)
+
+    # Show the DataFrame
+    df.show()
+
+    # Register the DataFrame as a temporary view
+    df.createOrReplaceTempView("people")
+
+    # Perform a SQL query
+    result_df = spark.sql("SELECT * FROM people WHERE age > 30")
+
+    # Show the result of the SQL query
+    result_df.show()
+
+    # Write the result to a CSV file
+    result_df.write.csv("output.csv", header=True)
+
+    # Stop the Spark session
+    spark.stop()
+
+except ImportError:
+    pass
recurvedata/operators/sql_operator/__init__.py
@@ -0,0 +1 @@
+from recurvedata.operators.sql_operator.operator import SQLOperator
recurvedata/operators/sql_operator/operator.py
@@ -0,0 +1,90 @@
+from typing import Any
+
+import jsonschema
+
+from recurvedata.connectors.service import list_sql_operator_types
+from recurvedata.core.translation import _l
+from recurvedata.operators.operator import BaseOperator
+from recurvedata.operators.task import BaseTask
+from recurvedata.operators.utils import lineage
+
+
+class SQLTask(BaseTask):
+    no_template_fields = ("autocommit", "data_source_name")
+
+    @classmethod
+    def config_schema(cls):
+        # get_names_by_type = cls.get_connection_names_by_type
+        return {
+            "type": "object",
+            "properties": {
+                "data_source_name": {
+                    "type": "string",
+                    "title": _l("Data Source"),
+                    "ui:field": "ProjectConnectionSelectorField",
+                    "ui:options": {"supportTypes": list_sql_operator_types()},
+                },
+                # "database": {
+                #     "type": "string",
+                #     "title": _l("Database"),
+                #     "ui:field": "CodeEditorWithReferencesField",
+                #     "ui:options": {
+                #         "type": "plain",
+                #     },
+                # },
+                "sql": {
+                    "type": "string",
+                    "title": _l("SQL Query"),
+                    "description": _l(
+                        "Execute single or multiple SQL statements. "
+                        "Supports Jinja templating for variables and dynamic queries."
+                    ),
+                    "ui:field": "CodeEditorWithReferencesField",
+                    "ui:options": {
+                        "type": "code",
+                        "lang": "sql",
+                        "sqlLang": "sql",
+                    },
+                },
+            },
+            "required": ["data_source_name", "sql"],
+        }
+
+    @classmethod
+    def validate(cls, configuration: dict[str, Any]) -> dict[str, Any]:
+        config = super().validate(configuration)
+
+        ds = cls.must_get_connection_by_name(configuration["data_source_name"])
+        if not ds.is_dbapi:
+            raise jsonschema.ValidationError(message="only DBAPI is supported", path=("data_source_name",))
+        return config
+
+    def execute_impl(self, *args, **kwargs):
+        config = self.rendered_config
+        ds = self.get_connection_by_name(config.data_source_name)
+        connector = ds.connector
+        # if config.database:
+        #     connector.database = config.database
+
+        queries = config.sql
+        if connector.is_hive():
+            # Set spark.app.name to help locate the specific Recurve task in the YARN UI
+            queries = f"SET spark.app.name=recurve.{self.dag.name}.{self.node.name};\n{queries}"
+
+        comment = self.get_query_comment_conf()
+        annotated_queries = connector.add_leading_comment(queries, comment)
+
+        connector.execute(annotated_queries, autocommit=config.get("autocommit", False))
+        return None
+
+    def parse_lineage(self):
+        config = self.rendered_config
+        ds = self.get_connection_by_name(config.data_source_name)
+        if not lineage.supported_recurve_ds_type(ds.ds_type):
+            return
+        res = lineage.parse_lineage(config.sql, config.database, ds.name, ds.ds_type)
+        return res
+
+
+class SQLOperator(BaseOperator):
+    task_cls = SQLTask
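For reference, a configuration accepted by the schema above might look like this minimal sketch (the connection name and the Jinja variable are hypothetical):

    # Hypothetical SQLTask configuration, for illustration only.
    config = {
        "data_source_name": "warehouse_pg",  # must resolve to a DBAPI-type connection
        "sql": "DELETE FROM daily_stats WHERE dt = '{{ run_date }}';",
    }

For a Hive-backed connection, execute_impl prepends

    SET spark.app.name=recurve.<dag name>.<node name>;

so the task can be located in the YARN UI, then asks the connector to add the leading query comment and executes the annotated statements.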