recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/schedulers/airflow.py (new file)

```diff
@@ -0,0 +1,974 @@
+import copy
+import datetime
+import inspect
+import json
+import logging
+import os
+from functools import lru_cache
+from typing import Any, Callable, Generator, Union
+
+import pendulum
+from airflow.models import DAG, BaseOperator, DagRun, TaskInstance
+from airflow.operators.empty import EmptyOperator
+from airflow.operators.latest_only import LatestOnlyOperator
+from airflow.operators.python import ShortCircuitOperator
+from airflow.utils.task_group import TaskGroup
+from airflow.utils.trigger_rule import TriggerRule
+from slugify import slugify
+
+from recurvedata.config import RECURVE_EXECUTOR_CLI, RECURVE_EXECUTOR_DBT_CLI
+from recurvedata.consts import Operator
+from recurvedata.schedulers.airflow_operators import LinkNodeBashOperator, RecurveBashOperator, SkipSelfBashOperator
+from recurvedata.schedulers.base import DagSchema, SchedulerBase
+from recurvedata.schedulers.consts import (
+    DEFAULT_RETRY_DELAY,
+    DEFAULT_RETRY_NUMBER,
+    WORK_DIR,
+    format_recurve_env_key,
+    is_dev_run_job,
+)
+from recurvedata.schedulers.model import LinkNodeItem, LinkWorkflowItem, SchedulerNode
+from recurvedata.schedulers.utils import format_dag_id
+from recurvedata.utils.crontab import get_schedule
+from recurvedata.utils.dataclass import init_dataclass_from_dict
+from recurvedata.utils.date_time import ensure_datetime, now_aware
+from recurvedata.utils.helpers import extract_dict
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache()
+def _get_function_param_names(function: Callable) -> list[str]:
+    sig = inspect.signature(function)
+    return list(sig.parameters.keys())
+
+
+AIRFLOW_DAG_INIT_PARAMS = _get_function_param_names(DAG.__init__)
+
+
+class AirflowScheduler(SchedulerBase):
+    def __init__(self, sharding_size: int = 1, sharding_key: int = 0):
+        """Initialize the Airflow scheduler.
+
+        Args:
+            sharding_size: Number of shards to split DAGs across
+            sharding_key: Which shard this scheduler instance handles
+        """
+        logger.debug(f"Initializing AirflowScheduler with sharding_size={sharding_size}, sharding_key={sharding_key}")
+
+        # Temporarily removed sharding key extraction from environment due to DAG leakage issues
+        # job_id = self.extract_sharding_key_from_environment()
+        # if job_id:
+        #     logger.info(
+        #         f"Switching sharding_size from {sharding_size} to {job_id}, sharding_key from {sharding_key} to 0"
+        #     )
+        #     sharding_size = job_id
+        #     sharding_key = 0
+
+        super().__init__(sharding_size, sharding_key)
+        self.link_node_dict = {}
+        self.link_workflow_dict = {}
+
+    @staticmethod
+    def extract_sharding_key_from_environment() -> int | None:
+        """Extract sharding key from environment variables.
+
+        When Airflow worker runs 'airflow task run {dag_id} {task_id}',
+        the dag_id is written to environment variables which we can use
+        to determine the sharding key.
+
+        Returns:
+            Extracted sharding key as integer if found, None otherwise
+        """
+        dag_id = os.environ.get("_AIRFLOW_PARSING_CONTEXT_DAG_ID")
+        if not dag_id:
+            job_id = os.environ.get("RECURVE_AUTOGEN_SINGLE_SHARDING_SIZE")
+            return int(job_id) if job_id else None
+
+        job_id = dag_id.rsplit(".")[-1]
+        return int(job_id) if job_id.isdigit() else None
+
+    def list_scheduler_dag(self) -> Generator[DagSchema, None, None]:
+        """Get all DAG information from SDK that matches sharding criteria.
+
+        Yields:
+            DagSchema objects for each matching DAG
+        """
+        response = self.client.list_jobs(sharding_size=self.sharding_size, sharding_key=self.sharding_key)
+
+        # Build link node dictionary
+        self.link_node_dict = {
+            node.node_id: init_dataclass_from_dict(LinkNodeItem, node.model_dump()) for node in response.link_nodes
+        }
+
+        # Process link workflows
+        for workflow in response.link_workflows:
+            workflow_item: LinkWorkflowItem = init_dataclass_from_dict(LinkWorkflowItem, workflow.model_dump())
+
+            # Process link nodes within workflow
+            processed_link_nodes = []
+            for node_dict in workflow_item.link_nodes:
+                node_item: LinkNodeItem = init_dataclass_from_dict(LinkNodeItem, node_dict)
+                node_item.node_id = workflow_item.node_id
+                node_item.link_wf_id = workflow_item.link_wf_id
+                processed_link_nodes.append(node_item)
+
+            workflow_item.link_nodes = processed_link_nodes
+            self.link_workflow_dict[workflow.node_id] = workflow_item
+
+        # Yield DAG schemas
+        for job in response.jobs:
+            dag_schema: DagSchema = init_dataclass_from_dict(DagSchema, job.model_dump())
+            yield dag_schema
+
+    def execute(self) -> dict[str, DAG]:
+        """Execute scheduler to create Airflow DAGs.
+
+        Returns:
+            Dictionary mapping DAG IDs to DAG objects
+        """
+        dag_dict = {}
+        for dag_schema in self.list_scheduler_dag():
+            airflow_dag = self.create_dag(dag_schema)
+            if airflow_dag:
+                dag_dict[airflow_dag.dag_id] = airflow_dag
+        return dag_dict
+
+    @staticmethod
+    def dag_date_2_airflow_date(
+        scheduler_interval: str, dag_date: Union[datetime.datetime], timezone: str, is_end_date: bool = False
+    ) -> datetime.datetime:
+        """Convert DAG date to Airflow date with timezone handling.
+
+        Args:
+            scheduler_interval: DAG schedule interval
+            dag_date: Date to convert
+            timezone: Target timezone
+            is_end_date: Whether this is an end date requiring special handling
+
+        Returns:
+            Converted datetime with proper timezone
+        """
+        if not dag_date:
+            return dag_date
+
+        # Add timezone
+        dag_date = ensure_datetime(dag_date).replace(tzinfo=pendulum.timezone(timezone))
+
+        # Handle manual/once-off DAGs
+        if scheduler_interval == "@once" or not scheduler_interval:
+            return dag_date
+
+        # Calculate execution dates
+        next_execution_date = get_schedule(schedule_interval=scheduler_interval, dttm=dag_date, is_next=True)
+        current_execution_date = get_schedule(
+            schedule_interval=scheduler_interval, dttm=next_execution_date, is_next=False
+        )
+
+        if not is_end_date:
+            if current_execution_date != dag_date:
+                return current_execution_date
+
+        previous_execution_date = get_schedule(
+            schedule_interval=scheduler_interval, dttm=current_execution_date, is_next=False
+        )
+        return previous_execution_date
+
+    @staticmethod
+    def _cal_retry_number(dag_schema: DagSchema) -> int:
+        """Calculate retry number for a DAG.
+
+        Args:
+            dag_schema: DAG schema to calculate retries for
+
+        Returns:
+            Number of retries to configure
+        """
+        if is_dev_run_job(dag_schema.name):
+            return 0
+        if dag_schema.retries is not None:
+            return dag_schema.retries
+        return DEFAULT_RETRY_NUMBER
+
+    @staticmethod
+    def _cal_retry_delay(dag_schema: DagSchema) -> datetime.timedelta:
+        """Calculate retry delay for a DAG.
+
+        Args:
+            dag_schema: DAG schema to calculate retries for
+
+        Returns:
+            Retry delay to configure
+        """
+        return datetime.timedelta(seconds=3600)
+        if dag_schema.retry_delay is not None:
+            return datetime.timedelta(seconds=dag_schema.retry_delay)
+        return datetime.timedelta(seconds=DEFAULT_RETRY_DELAY)
+
+    def create_dag_impl(self, dag_schema: DagSchema) -> DAG | None:
+        """Create Airflow DAG from schema.
+
+        Args:
+            dag_schema: Schema defining the DAG
+
+        Returns:
+            Created Airflow DAG object or None if creation fails
+        """
+        # Calculate dates
+        airflow_end_date = self.dag_date_2_airflow_date(
+            dag_schema.schedule_interval, dag_schema.end_date, dag_schema.timezone, is_end_date=True
+        )
+
+        airflow_start_date = (
+            self.dag_date_2_airflow_date(dag_schema.schedule_interval, dag_schema.start_date, dag_schema.timezone)
+            or now_aware()
+        )
+
+        # Set up default arguments
+        default_args = {
+            "owner": dag_schema.owner_username or self.DEFAULT_DAG_OWNER,
+            "start_date": airflow_start_date,
+            "end_date": airflow_end_date,
+            "depends_on_past": False,
+            "retries": self._cal_retry_number(dag_schema),
+            "retry_delay": self._cal_retry_delay(dag_schema),
+        }
+
+        # Process Airflow-specific arguments
+        airflow_args = dag_schema.scheduler_settings or {}
+        if airflow_args:
+            custom_defaults = airflow_args.pop("default_args", None)
+            if custom_defaults:
+                for key, value in custom_defaults.items():
+                    if key in ("execution_timeout", "retry_delay"):
+                        custom_defaults[key] = datetime.timedelta(seconds=value)
+                    else:
+                        custom_defaults[key] = value
+                default_args.update(custom_defaults)
+
+            # Remove reserved keys
+            for reserved in ["dag_id", "default_args", "schedule_interval"]:
+                airflow_args.pop(reserved, None)
+
+        airflow_args = self._clean_airflow_args(airflow_args) or {}
+
+        # Determine schedule interval
+        schedule_interval = None if dag_schema.schedule_type == "manual" else dag_schema.schedule_interval
+
+        # Create DAG
+        dag = DAG(
+            dag_id=self.format_dag_id(dag_schema),
+            default_args=default_args,
+            schedule=schedule_interval,
+            start_date=airflow_start_date,
+            end_date=airflow_end_date,
+            dag_display_name=dag_schema.name,
+            on_success_callback=self._on_finished_callback,
+            on_failure_callback=self._on_finished_callback,
+            **airflow_args,
+        )
+
+        # Add Recurve metadata
+        dag._is_generated_by_recurve = True
+        dag.job_id = dag_schema.job_id
+
+        # Set up DAG structure
+        self.setup_graph(dag, dag_schema)
+
+        return dag
+
+    def setup_graph(self, dag: DAG, recurve_dag: DagSchema):
+        """Set up the DAG graph structure.
+
+        Args:
+            dag: Airflow DAG to configure
+            recurve_dag: Schema defining the DAG structure
+        """
+        operator_dict = {}
+
+        # Create operators for each node
+        for node_dict in recurve_dag.nodes:
+            node: SchedulerNode = init_dataclass_from_dict(SchedulerNode, node_dict)
+            node.id = int(node.id)
+
+            try:
+                operators = self.convert_node_to_operators(dag, recurve_dag, node)
+            except Exception as exc:
+                logger.exception(f"Failed to create node {dag.dag_id} {node.id}: {exc}")
+                continue
+
+            if not operators:
+                continue
+
+            # Add Recurve metadata to operators
+            doc_metadata = {
+                "recurve_node_id": node.id,
+                "recurve_node_key": node.node_key,
+            }
+
+            for operator in operators:
+                if isinstance(operator, TaskGroup):
+                    for sub_op in operator:
+                        sub_doc = json.loads(sub_op.doc_json) if sub_op.doc_json else {}
+                        sub_doc.update(doc_metadata)
+                        sub_op.doc_json = json.dumps(sub_doc)
+                else:
+                    operator.doc_json = json.dumps(doc_metadata)
+
+            operator_dict[node_dict["node_key"]] = operators
+
+        # Set up dependencies
+        already_set = set()
+        for upstream_key, downstream_key in recurve_dag.graph:
+            edge = (upstream_key, downstream_key)
+            if edge in already_set:
+                continue
+
+            if not (operator_dict.get(upstream_key) and operator_dict.get(downstream_key)):
+                continue
+
+            upstream = operator_dict[upstream_key][-1]
+            downstream = operator_dict[downstream_key][0]
+            upstream.set_downstream(downstream)
+            already_set.add(edge)
+
+    def convert_node_to_operators(self, dag: DAG, recurve_dag: DagSchema, node: SchedulerNode) -> list[BaseOperator]:
+        """Convert a DAG node to Airflow operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            recurve_dag: Schema defining the DAG
+            node: Node to convert
+
+        Returns:
+            List of created operators or None if conversion fails
+        """
+        # Prepare environment
+        bash_env = self._prepare_bash_env(recurve_dag, node)
+        kwargs = {
+            "env": bash_env,
+            "executor_config": {"workflow_version": recurve_dag.workflow_version},
+        }
+
+        # Handle link operators
+        if Operator.is_link(node.operator):
+            if node.id in self.link_workflow_dict:
+                return self.convert_link_workflow_node_to_operators(dag, node, **kwargs)
+            return self.convert_link_node_to_operators(dag, node, self.link_node_dict.get(node.id), **kwargs)
+
+        # Get node-specific Airflow args
+        node_airflow_args = self.get_node_airflow_args(node)
+        kwargs.update(node_airflow_args)
+
+        operators = []
+
+        # Add latest-only operator if needed
+        if dag.schedule_interval != "@once" and node.latest_only:
+            task_id = self.format_task_id(node, "latest_only")
+            latest_only = LatestOnlyOperator(task_id=task_id, dag=dag)
+            operators.append(latest_only)
+
+        # Add skip operator if needed
+        if node.skip_downstream:
+            skip_task = ShortCircuitOperator(
+                dag=dag, task_id=self.format_task_id(node, "skip_downstream"), python_callable=lambda: False
+            )
+            operators.append(skip_task)
+
+        # Add main operator
+        task_id = self.format_task_id(node)
+        main_operator = self._create_operator(dag, node, task_id, **kwargs)
+        operators.append(main_operator)
+
+        # Add empty node after skip_self operator to ensure proper trigger rule handling
+        # Only add empty node if skip_downstream is False, to avoid conflicts
+        if node.skip_self and not node.skip_downstream:
+            empty_task_id = self.format_task_id(node, "skip_self")
+            empty_operator = EmptyOperator(task_id=empty_task_id, trigger_rule=TriggerRule.NONE_FAILED, dag=dag)
+            operators.append(empty_operator)
+
+        # Set up dependencies
+        for upstream, downstream in zip(operators[:-1], operators[1:]):
+            upstream.set_downstream(downstream)
+
+        return operators
+
+    @staticmethod
+    def _prepare_bash_env(recurve_dag: DagSchema, node: SchedulerNode) -> dict[str, Any]:
+        """Prepare bash environment variables for operators.
+
+        Args:
+            recurve_dag: DAG schema
+            node: Node to prepare environment for
+
+        Returns:
+            Dictionary of environment variables
+        """
+        env = {
+            "AIRFLOW_RETRY_NUMBER": "{{ task_instance.try_number }}",
+            "AIRFLOW_MAX_RETRY_NUMBER": "{{ task_instance.max_tries }}",
+            "AIRFLOW_DATA_INTERVAL_END": "{{ task_instance.dag_run.data_interval_end.isoformat() }}",
+            format_recurve_env_key("workflow_version"): recurve_dag.workflow_version,
+            format_recurve_env_key("node_key"): node.node_key,
+            format_recurve_env_key("job_run_conf"): "{{ dag_run.conf | tojson }}",
+        }
+
+        # Copy relevant environment variables
+        for key, value in os.environ.items():
+            if key.startswith("RECURVE__"):
+                env[key] = value
+            elif key.startswith("AIRFLOW__") and node.operator == "SensorOperator":
+                env[key] = value
+            elif key in (
+                "AIRFLOW_CTX_DAG_RUN_ID",
+                "AIRFLOW_CTX_TRY_NUMBER",
+                "AIRFLOW_CTX_EXECUTION_DATE",
+                "PATH",
+                "PYENV_ROOT",
+            ):
+                env[key] = value
+
+        return env
+
+    def _create_operator(
+        self, dag: DAG, node: SchedulerNode, task_id: str, stage: str = None, **kwargs
+    ) -> BaseOperator:
+        """Create an Airflow operator for a node.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to create operator for
+            task_id: ID for the task
+            stage: Optional stage name
+            **kwargs: Additional operator arguments
+
+        Returns:
+            Created operator
+        """
+        cmd = self.format_command(dag, node, stage)
+        operator_class = SkipSelfBashOperator if node.skip_self else RecurveBashOperator
+
+        return operator_class(task_id=task_id, bash_command=cmd, dag=dag, task_display_name=node.name, **kwargs)
+
+    @staticmethod
+    def format_command(dag: DAG, node: SchedulerNode, stage: str) -> str:
+        """Format command string for bash operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to create command for
+            stage: Optional stage name
+
+        Returns:
+            Formatted command string
+        """
+        node_slug = f"{slugify(node.name)}.{node.id}"
+
+        # Determine execution date template
+        if dag.schedule_interval == "@once" or not dag.schedule_interval:
+            execution_date = "logical_date"
+        else:
+            execution_date = "data_interval_end if data_interval_end is not none else logical_date"
+
+        # Build command options
+        options = [
+            f"--dag_slug '{dag.dag_id}'",
+            f"--node_slug '{node_slug}'",
+            "--execution_date '{{ %s }}'" % execution_date,
+        ]
+
+        if stage is not None:
+            options.append(f"--stage {stage}")
+
+        # Build full command
+        if node.operator == Operator.DBTOperator:
+            return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_DBT_CLI} execute {" ".join(options)}'
+        return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_CLI} execute {" ".join(options)}'
+
+    @staticmethod
+    def format_dag_id(row: DagSchema) -> str:
+        """Format DAG ID from schema.
+
+        Args:
+            row: DAG schema
+
+        Returns:
+            Formatted DAG ID
+        """
+        return format_dag_id(row.job_id)
+
+    @staticmethod
+    def format_task_id(node: SchedulerNode, suffix=None) -> str:
+        """Format task ID for a node.
+
+        WARNING: This function should not be modified arbitrarily as it affects
+        existing task IDs.
+
+        Args:
+            node: Node to format ID for
+            suffix: Optional suffix to append
+
+        Returns:
+            Formatted task ID
+        """
+        task_id = f"{node.node_key}"
+        if suffix:
+            task_id = f"{task_id}-{suffix}"
+        return task_id
+
+    @staticmethod
+    def format_link_node_task_id(node: SchedulerNode, suffix=None) -> str:
+        """Format task ID for a link node.
+
+        WARNING: This function should not be modified arbitrarily as it affects
+        existing task IDs.
+
+        Args:
+            node: Node to format ID for
+            suffix: Optional suffix to append
+
+        Returns:
+            Formatted task ID
+        """
+        task_id = f"{node.node_key}"
+        if suffix:
+            task_id = f"{task_id}-{suffix}"
+        return task_id
+
+    @staticmethod
+    def get_node_airflow_args(node: SchedulerNode) -> dict:
+        """Get Airflow arguments for a node.
+
+        Args:
+            node: Node to get arguments for
+
+        Returns:
+            Dictionary of Airflow arguments
+        """
+        scheduler_settings = node.scheduler_settings or {}
+
+        # Get explicit Airflow args
+        if "airflow_args" in scheduler_settings:
+            airflow_args = json.loads(scheduler_settings["airflow_args"])
+        else:
+            airflow_args = {}
+
+        # Process other Airflow settings
+        for key, value in scheduler_settings.items():
+            if key == "airflow_args" or not key.startswith("airflow"):
+                continue
+
+            key = key.lstrip("airflow_")
+
+            # Convert time values to timedelta
+            if key in ["execution_timeout", "retry_delay", "sla"] and isinstance(value, (int, float)):
+                value = datetime.timedelta(seconds=value)
+
+            airflow_args[key] = value
+
+        return airflow_args
+
+    @staticmethod
+    def _clean_airflow_args(airflow_args: dict[str, Any] | None) -> dict[str, Any] | None:
+        """Clean Airflow arguments to only include valid parameters.
+
+        Args:
+            airflow_args: Arguments to clean
+
+        Returns:
+            Cleaned arguments dictionary
+        """
+        if not airflow_args:
+            return airflow_args
+        return extract_dict(airflow_args, list(AIRFLOW_DAG_INIT_PARAMS))
+
+    def __create_link_operator(
+        self,
+        dag: DAG,
+        node: SchedulerNode,
+        link_node: SchedulerNode,
+        link_item: LinkNodeItem,
+        task_id: str,
+        stage: str = None,
+        is_workflow: bool = False,
+        **kwargs,
+    ) -> LinkNodeBashOperator:
+        """Create a link node operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Parent node
+            link_node: Link node to create operator for
+            link_item: Link node details
+            task_id: ID for the task
+            stage: Optional stage name
+            is_workflow: Whether this is part of a workflow
+            **kwargs: Additional operator arguments
+
+        Returns:
+            Created link node operator
+        """
+        cmd = self.format_link_node_command(dag, node, link_item, stage, is_workflow)
+        operator_class = SkipSelfBashOperator if link_node.skip_self else LinkNodeBashOperator
+
+        return operator_class(
+            task_id=task_id,
+            bash_command=cmd,
+            dag=dag,
+            task_display_name=f"{node.name}.{link_item.link_node_name}",
+            **kwargs,
+        )
+
+    @staticmethod
+    def format_link_node_command(
+        dag: DAG, node: SchedulerNode, link_detail: LinkNodeItem, stage: str, is_workflow: bool
+    ) -> str:
+        """Format command for link node operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Parent node
+            link_detail: Link node details
+            stage: Optional stage name
+            is_workflow: Whether this is part of a workflow
+
+        Returns:
+            Formatted command string
+        """
+        node_slug = f"{slugify(node.name)}.{node.id}"
+        execution_date = "logical_date" if dag.schedule_interval == "@once" else "data_interval_end"
+
+        # Build command options
+        options = [
+            f"--dag_slug '{dag.dag_id}'",
+            f"--node_slug '{node_slug}'",
+            "--execution_date '{{ %s }}'" % execution_date,
+            f"--link_workflow_id {link_detail.link_wf_id}",
+            f"--link_node_id {link_detail.link_node_id}",
+        ]
+
+        if stage is not None:
+            options.append(f"--stage {stage}")
+
+        if is_workflow:
+            options.append("--is_link_workflow")
+
+        # Build full command
+        if link_detail.link_operator == Operator.DBTOperator:
+            return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_DBT_CLI} execute {" ".join(options)}'
+        return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_CLI} execute {" ".join(options)}'
+
+    def convert_link_workflow_node_to_operators(self, dag: DAG, node: SchedulerNode, **kwargs) -> list[BaseOperator]:
+        """Convert a link workflow node to operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators or None if conversion fails
+        """
+        link_workflow_item: LinkWorkflowItem = self.link_workflow_dict.get(node.id)
+        if not link_workflow_item:
+            return []
+
+        operators = []
+
+        # Add latest-only operator if needed
+        if dag.schedule_interval != "@once" and node.latest_only:
+            task_id = self.format_task_id(node, "latest_only")
+            latest_only = LatestOnlyOperator(task_id=task_id, dag=dag)
+            operators.append(latest_only)
+
+        # Add skip operator if needed
+        if node.skip_downstream:
+            skip_task = ShortCircuitOperator(
+                task_id=self.format_task_id(node, "skip_downstream"), python_callable=lambda: False, dag=dag
+            )
+            operators.append(skip_task)
+
+        # Save original node properties
+        node_original_name = node.name
+        node_original_key = node.node_key
+
+        has_inner_skip_downstream = False
+        has_inner_latest_only = False
+        link_end_task_id = self.format_task_id(node, "link_end")
+        latest_only_task_id = self.format_task_id(node, "latest_only2")
+
+        # Create task group
+        group_id = f"{node.node_key}"
+        with TaskGroup(group_id=group_id, dag=dag) as task_group:
+            operator_dict = {}
+
+            # Process each link node
+            for link_item in link_workflow_item.link_nodes:
+                link_plan_id = str(link_item.plan_id) if link_item.plan_id else dag.dag_id
+                if link_plan_id != dag.dag_id:
+                    logger.warning(
+                        f"Link node {link_item.link_node_key} is not in the same plan as the current DAG, link_plan_id: {link_plan_id}, dag_id: {dag.dag_id}"
+                    )
+                    continue
+
+                node.node_key = link_item.link_node_key
+
+                # Prepare environment
+                tmp_kwargs = copy.deepcopy(kwargs)
+                tmp_env = tmp_kwargs.get("env", {})
+                tmp_env.update(
+                    {
+                        format_recurve_env_key("link_workflow_version"): link_item.link_wf_version,
+                        format_recurve_env_key("link_node_key"): link_item.link_node_key,
+                        format_recurve_env_key("node_key"): f"{group_id}.{node.node_key}",
+                    }
+                )
+                tmp_kwargs["env"] = tmp_env
+
+                # Update executor config
+                tmp_executor_config = kwargs.get("executor_config", {})
+                tmp_executor_config.update(
+                    {
+                        "link_workflow_id": link_item.link_wf_id,
+                        "link_workflow_version": link_item.link_wf_version,
+                    }
+                )
+                tmp_kwargs["executor_config"] = tmp_executor_config
+
+                # Create operators
+                tmp_ops = self._convert_link_node_to_operators(
+                    dag, node, link_item, is_workflow=True, workflow_skip_self=node.skip_self, **tmp_kwargs
+                )
+                operator_dict[link_item.link_node_key] = tmp_ops
+
+                # Track special operators
+                for op in tmp_ops:
+                    if isinstance(op, ShortCircuitOperator):
+                        has_inner_skip_downstream = True
+                    if isinstance(op, LatestOnlyOperator):
+                        has_inner_latest_only = True
+
+            # Set up dependencies within group
+            for upstream_key, downstream_key in link_workflow_item.link_graph:
+                if not (operator_dict.get(upstream_key) and operator_dict.get(downstream_key)):
+                    continue
+
+                upstream = operator_dict[upstream_key][-1]
+                downstream = operator_dict[downstream_key][0]
+                upstream.set_downstream(downstream)
+
+        operators.append(task_group)
+
+        # Add end task if needed
+        if (has_inner_skip_downstream or has_inner_latest_only) and not node.skip_downstream:
+            operators.append(
+                EmptyOperator(
+                    task_id=link_end_task_id,
+                    trigger_rule=TriggerRule.NONE_FAILED,
+                    dag=dag,
+                )
+            )
+
+        # Add second latest-only operator if needed
+        if node.latest_only and has_inner_skip_downstream:
+            latest_only2 = LatestOnlyOperator(task_id=latest_only_task_id, dag=dag)
+            operators.append(latest_only2)
+
+        # Set up dependencies between operators
+        for upstream, downstream in zip(operators[:-1], operators[1:]):
+            upstream.set_downstream(downstream)
+
+        # Restore original node properties
+        node.name = node_original_name
+        node.node_key = node_original_key
+
+        return operators
+
+    def convert_link_node_to_operators(self, dag: DAG, node: SchedulerNode, link_item: LinkNodeItem, **kwargs) -> list:
+        """Convert a link node to operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            link_item: Link node details
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators
+        """
+        operators = []
+        parent_node_key = node.node_key
+
+        with TaskGroup(group_id=node.node_key, dag=dag) as task_group:
+            node.node_key = link_item.link_node_key
+
+            # Prepare environment
+            tmp_kwargs = copy.deepcopy(kwargs)
+            tmp_env = tmp_kwargs.get("env", {})
+            tmp_env.update(
+                {
+                    format_recurve_env_key("link_workflow_version"): link_item.link_wf_version,
+                    format_recurve_env_key("link_node_key"): link_item.link_node_key,
+                    format_recurve_env_key("node_key"): f"{parent_node_key}.{link_item.link_node_key}",
+                }
+            )
+            tmp_kwargs["env"] = tmp_env
+
+            # Update executor config
+            tmp_executor_config = tmp_kwargs.get("executor_config", {})
+            tmp_executor_config.update(
+                {
+                    "link_workflow_id": link_item.link_wf_id,
+                    "link_workflow_version": link_item.link_wf_version,
+                }
+            )
+            tmp_kwargs["executor_config"] = tmp_executor_config
+
+            self._convert_link_node_to_operators(dag, node, link_item, **tmp_kwargs)
+        operators.append(task_group)
+
+        return operators
+
+    def _convert_link_node_to_operators(
+        self,
+        dag: DAG,
+        node: SchedulerNode,
+        link_item: LinkNodeItem,
+        is_workflow: bool = False,
+        workflow_skip_self: bool = False,
+        **kwargs,
+    ) -> list[BaseOperator]:
+        """Internal helper to convert link node to operators.
+
+        Creates a sequence of operators for a link node, handling latest-only checks,
+        skip conditions, and the main link node operation.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            link_item: Link node details
+            is_workflow: Whether this is part of a workflow
+            workflow_skip_self: Whether workflow has skip_self enabled
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators or None if conversion fails
+        """
+        operators = []
+        # if not link_item:
+        #     task_id = self.format_task_id(node)
+        #     operators.append(None)  # TODO: Add fallback operator
+        #     return operators
+
+        # Determine node execution properties based on workflow context
+        if not is_workflow:
+            skip_downstream = node.skip_downstream
+            latest_only = node.latest_only or link_item.link_latest_only
+            skip_self = node.skip_self or link_item.link_skip_self or link_item.link_skip_downstream
+        else:
+            skip_downstream = link_item.link_skip_downstream
+            latest_only = link_item.link_latest_only
+            skip_self = link_item.link_skip_self or workflow_skip_self
+
+        # Create link node with inherited properties
+        link_node = SchedulerNode(
+            operator=link_item.link_operator,
+            node_key=link_item.link_node_key,
+            name=link_item.link_node_name,
+            id=link_item.link_node_id,
+            scheduler_settings=link_item.link_scheduler_settings,
+            skip_self=skip_self,
+            skip_downstream=skip_downstream,
+            latest_only=latest_only,
+        )
+
+        # Merge Airflow arguments from parent and link nodes
+        parent_airflow_args = self.get_node_airflow_args(node)
+        link_airflow_args = self.get_node_airflow_args(link_node)
+        if parent_airflow_args:
+            link_airflow_args.update(parent_airflow_args)
+        kwargs.update(link_airflow_args)
+
+        # Add latest-only check for scheduled DAGs
+        if dag.schedule_interval != "@once" and link_node.latest_only:
+            latest_only_task_id = self.format_task_id(node, "latest_only")
+            latest_only_op = LatestOnlyOperator(task_id=latest_only_task_id, dag=dag)
+            operators.append(latest_only_op)
+
+        # Add skip operator if downstream tasks should be skipped
+        if link_node.skip_downstream:
+            skip_task_id = self.format_task_id(node, "skip_downstream")
+            skip_args = {"ignore_downstream_trigger_rules": False} if is_workflow else {}
+
+            skip_op = ShortCircuitOperator(task_id=skip_task_id, python_callable=lambda: False, dag=dag, **skip_args)
+            operators.append(skip_op)
+
+        # Create main link operator
+        main_task_id = self.format_task_id(node)
+        if Operator.is_link(link_node.operator):
+            # Prevent nested link operators
+            main_op = self.__create_link_operator(
+                dag=dag,
+                node=node,
+                link_node=link_node,
+                link_item=link_item,
+                task_id=main_task_id,
+                is_workflow=is_workflow,
+            )
+        else:
+            # Add workflow metadata to executor config
+            executor_config = copy.deepcopy(kwargs)
+            executor_config["executor_config"].update(
+                {
+                    "link_workflow_id": link_item.link_wf_id,
+                    "link_workflow_version": link_item.link_wf_version,
+                }
+            )
+
+            main_op = self.__create_link_operator(
+                dag=dag,
+                node=node,
+                link_node=link_node,
+                link_item=link_item,
+                task_id=main_task_id,
+                is_workflow=is_workflow,
+                **executor_config,
+            )
+        operators.append(main_op)
+
+        # Add empty node after skip_self operator to ensure proper trigger rule handling
+        # Only add empty node if skip_downstream is False, to avoid conflicts
+        if link_node.skip_self and not link_node.skip_downstream:
+            empty_task_id = self.format_task_id(node, "skip_self")
+            empty_operator = EmptyOperator(task_id=empty_task_id, trigger_rule=TriggerRule.NONE_FAILED, dag=dag)
+            operators.append(empty_operator)
+
+        # Set up dependencies between operators
+        for upstream_op, downstream_op in zip(operators[:-1], operators[1:]):
+            upstream_op.set_downstream(downstream_op)
+
+        return operators
+
+    def _on_finished_callback(self, callback_context):
+        dag_run: DagRun = callback_context["dag_run"]
+        tis: list[TaskInstance] = dag_run.get_task_instances()
+        task_info_map = {}
+        for ti in tis:
+            task_info_map[ti.task_id] = {
+                "state": ti.state,
+                "task_display_name": ti.task_display_name,
+            }
+
+        job_run_result = {
+            "job_id": getattr(callback_context["dag"], "job_id", None),  # from Recurve metadata
+            "run_id": callback_context["run_id"],
+            "task_info_map": task_info_map,
+            "state": dag_run.get_state(),
+            "data_interval_end": dag_run.data_interval_end.isoformat(),
+        }
+        self.client.on_job_run_finished(job_run_result)
+
+
+if __name__ == "__main__":
+    scheduler = AirflowScheduler()
+    globals().update(scheduler.execute())
```
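The `__main__` block at the end of the module shows how the generated DAGs are published into module globals. As a minimal sketch of how such a loader would typically sit in an Airflow DAG folder (the file name and sharding values below are assumptions, not part of this release; everything else mirrors the module's own `__main__` block):

```python
# dags/recurve_generated_dags.py -- hypothetical loader file; mirrors the
# packaged module's __main__ block rather than anything shipped in the wheel.
from recurvedata.schedulers.airflow import AirflowScheduler

# execute() returns a {dag_id: DAG} mapping; assigning the DAGs into module
# globals is what makes them discoverable by Airflow's DAG parser.
scheduler = AirflowScheduler(sharding_size=1, sharding_key=0)
globals().update(scheduler.execute())
```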