recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/schedulers/airflow.py
@@ -0,0 +1,974 @@
+import copy
+import datetime
+import inspect
+import json
+import logging
+import os
+from functools import lru_cache
+from typing import Any, Callable, Generator
+
+import pendulum
+from airflow.models import DAG, BaseOperator, DagRun, TaskInstance
+from airflow.operators.empty import EmptyOperator
+from airflow.operators.latest_only import LatestOnlyOperator
+from airflow.operators.python import ShortCircuitOperator
+from airflow.utils.task_group import TaskGroup
+from airflow.utils.trigger_rule import TriggerRule
+from slugify import slugify
+
+from recurvedata.config import RECURVE_EXECUTOR_CLI, RECURVE_EXECUTOR_DBT_CLI
+from recurvedata.consts import Operator
+from recurvedata.schedulers.airflow_operators import LinkNodeBashOperator, RecurveBashOperator, SkipSelfBashOperator
+from recurvedata.schedulers.base import DagSchema, SchedulerBase
+from recurvedata.schedulers.consts import (
+    DEFAULT_RETRY_DELAY,
+    DEFAULT_RETRY_NUMBER,
+    WORK_DIR,
+    format_recurve_env_key,
+    is_dev_run_job,
+)
+from recurvedata.schedulers.model import LinkNodeItem, LinkWorkflowItem, SchedulerNode
+from recurvedata.schedulers.utils import format_dag_id
+from recurvedata.utils.crontab import get_schedule
+from recurvedata.utils.dataclass import init_dataclass_from_dict
+from recurvedata.utils.date_time import ensure_datetime, now_aware
+from recurvedata.utils.helpers import extract_dict
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache()
+def _get_function_param_names(function: Callable) -> list[str]:
+    sig = inspect.signature(function)
+    return list(sig.parameters.keys())
+
+
+AIRFLOW_DAG_INIT_PARAMS = _get_function_param_names(DAG.__init__)
+
+
+class AirflowScheduler(SchedulerBase):
+    def __init__(self, sharding_size: int = 1, sharding_key: int = 0):
+        """Initialize the Airflow scheduler.
+
+        Args:
+            sharding_size: Number of shards to split DAGs across
+            sharding_key: Which shard this scheduler instance handles
+        """
+        logger.debug(f"Initializing AirflowScheduler with sharding_size={sharding_size}, sharding_key={sharding_key}")
+
+        # Temporarily removed sharding key extraction from environment due to DAG leakage issues
+        # job_id = self.extract_sharding_key_from_environment()
+        # if job_id:
+        #     logger.info(
+        #         f"Switching sharding_size from {sharding_size} to {job_id}, sharding_key from {sharding_key} to 0"
+        #     )
+        #     sharding_size = job_id
+        #     sharding_key = 0
+
+        super().__init__(sharding_size, sharding_key)
+        self.link_node_dict = {}
+        self.link_workflow_dict = {}
+
+    @staticmethod
+    def extract_sharding_key_from_environment() -> int | None:
+        """Extract the sharding key from environment variables.
+
+        When an Airflow worker runs 'airflow task run {dag_id} {task_id}',
+        the dag_id is written to an environment variable, which we can use
+        to determine the sharding key.
+
+        Returns:
+            Extracted sharding key as an integer if found, None otherwise
+        """
+        dag_id = os.environ.get("_AIRFLOW_PARSING_CONTEXT_DAG_ID")
+        if not dag_id:
+            job_id = os.environ.get("RECURVE_AUTOGEN_SINGLE_SHARDING_SIZE")
+            return int(job_id) if job_id else None
+
+        job_id = dag_id.rsplit(".", 1)[-1]
+        return int(job_id) if job_id.isdigit() else None
+
+    def list_scheduler_dag(self) -> Generator[DagSchema, None, None]:
+        """Get all DAG information from the SDK that matches the sharding criteria.
+
+        Yields:
+            DagSchema objects for each matching DAG
+        """
+        response = self.client.list_jobs(sharding_size=self.sharding_size, sharding_key=self.sharding_key)
+
+        # Build link node dictionary
+        self.link_node_dict = {
+            node.node_id: init_dataclass_from_dict(LinkNodeItem, node.model_dump()) for node in response.link_nodes
+        }
+
+        # Process link workflows
+        for workflow in response.link_workflows:
+            workflow_item: LinkWorkflowItem = init_dataclass_from_dict(LinkWorkflowItem, workflow.model_dump())
+
+            # Process link nodes within workflow
+            processed_link_nodes = []
+            for node_dict in workflow_item.link_nodes:
+                node_item: LinkNodeItem = init_dataclass_from_dict(LinkNodeItem, node_dict)
+                node_item.node_id = workflow_item.node_id
+                node_item.link_wf_id = workflow_item.link_wf_id
+                processed_link_nodes.append(node_item)
+
+            workflow_item.link_nodes = processed_link_nodes
+            self.link_workflow_dict[workflow.node_id] = workflow_item
+
+        # Yield DAG schemas
+        for job in response.jobs:
+            dag_schema: DagSchema = init_dataclass_from_dict(DagSchema, job.model_dump())
+            yield dag_schema
+
+    def execute(self) -> dict[str, DAG]:
+        """Execute the scheduler to create Airflow DAGs.
+
+        Returns:
+            Dictionary mapping DAG IDs to DAG objects
+        """
+        dag_dict = {}
+        for dag_schema in self.list_scheduler_dag():
+            airflow_dag = self.create_dag(dag_schema)
+            if airflow_dag:
+                dag_dict[airflow_dag.dag_id] = airflow_dag
+        return dag_dict
+
+    @staticmethod
+    def dag_date_2_airflow_date(
+        scheduler_interval: str, dag_date: datetime.datetime | None, timezone: str, is_end_date: bool = False
+    ) -> datetime.datetime | None:
+        """Convert DAG date to Airflow date with timezone handling.
+
+        Args:
+            scheduler_interval: DAG schedule interval
+            dag_date: Date to convert
+            timezone: Target timezone
+            is_end_date: Whether this is an end date requiring special handling
+
+        Returns:
+            Converted datetime with proper timezone
+        """
+        if not dag_date:
+            return dag_date
+
+        # Add timezone
+        dag_date = ensure_datetime(dag_date).replace(tzinfo=pendulum.timezone(timezone))
+
+        # Handle manual/once-off DAGs
+        if scheduler_interval == "@once" or not scheduler_interval:
+            return dag_date
+
+        # Calculate execution dates
+        next_execution_date = get_schedule(schedule_interval=scheduler_interval, dttm=dag_date, is_next=True)
+        current_execution_date = get_schedule(
+            schedule_interval=scheduler_interval, dttm=next_execution_date, is_next=False
+        )
+
+        if not is_end_date:
+            if current_execution_date != dag_date:
+                return current_execution_date
+
+        previous_execution_date = get_schedule(
+            schedule_interval=scheduler_interval, dttm=current_execution_date, is_next=False
+        )
+        return previous_execution_date
+
+    @staticmethod
+    def _cal_retry_number(dag_schema: DagSchema) -> int:
+        """Calculate the retry number for a DAG.
+
+        Args:
+            dag_schema: DAG schema to calculate retries for
+
+        Returns:
+            Number of retries to configure
+        """
+        if is_dev_run_job(dag_schema.name):
+            return 0
+        if dag_schema.retries is not None:
+            return dag_schema.retries
+        return DEFAULT_RETRY_NUMBER
+
+    @staticmethod
+    def _cal_retry_delay(dag_schema: DagSchema) -> datetime.timedelta:
+        """Calculate the retry delay for a DAG.
+
+        Args:
+            dag_schema: DAG schema to calculate the retry delay for
+
+        Returns:
+            Retry delay to configure
+        """
+        if dag_schema.retry_delay is not None:
+            return datetime.timedelta(seconds=dag_schema.retry_delay)
+        return datetime.timedelta(seconds=DEFAULT_RETRY_DELAY)
+
+    def create_dag_impl(self, dag_schema: DagSchema) -> DAG | None:
+        """Create Airflow DAG from schema.
+
+        Args:
+            dag_schema: Schema defining the DAG
+
+        Returns:
+            Created Airflow DAG object or None if creation fails
+        """
+        # Calculate dates
+        airflow_end_date = self.dag_date_2_airflow_date(
+            dag_schema.schedule_interval, dag_schema.end_date, dag_schema.timezone, is_end_date=True
+        )
+
+        airflow_start_date = (
+            self.dag_date_2_airflow_date(dag_schema.schedule_interval, dag_schema.start_date, dag_schema.timezone)
+            or now_aware()
+        )
+
+        # Set up default arguments
+        default_args = {
+            "owner": dag_schema.owner_username or self.DEFAULT_DAG_OWNER,
+            "start_date": airflow_start_date,
+            "end_date": airflow_end_date,
+            "depends_on_past": False,
+            "retries": self._cal_retry_number(dag_schema),
+            "retry_delay": self._cal_retry_delay(dag_schema),
+        }
+
+        # Process Airflow-specific arguments
+        airflow_args = dag_schema.scheduler_settings or {}
+        if airflow_args:
+            custom_defaults = airflow_args.pop("default_args", None)
+            if custom_defaults:
+                for key, value in custom_defaults.items():
+                    if key in ("execution_timeout", "retry_delay"):
+                        custom_defaults[key] = datetime.timedelta(seconds=value)
+                default_args.update(custom_defaults)
+
+            # Remove reserved keys
+            for reserved in ["dag_id", "default_args", "schedule_interval"]:
+                airflow_args.pop(reserved, None)
+
+        airflow_args = self._clean_airflow_args(airflow_args) or {}
+
+        # Determine schedule interval
+        schedule_interval = None if dag_schema.schedule_type == "manual" else dag_schema.schedule_interval
+
+        # Create DAG
+        dag = DAG(
+            dag_id=self.format_dag_id(dag_schema),
+            default_args=default_args,
+            schedule=schedule_interval,
+            start_date=airflow_start_date,
+            end_date=airflow_end_date,
+            dag_display_name=dag_schema.name,
+            on_success_callback=self._on_finished_callback,
+            on_failure_callback=self._on_finished_callback,
+            **airflow_args,
+        )
+
+        # Add Recurve metadata
+        dag._is_generated_by_recurve = True
+        dag.job_id = dag_schema.job_id
+
+        # Set up DAG structure
+        self.setup_graph(dag, dag_schema)
+
+        return dag
+
+    def setup_graph(self, dag: DAG, recurve_dag: DagSchema):
+        """Set up the DAG graph structure.
+
+        Args:
+            dag: Airflow DAG to configure
+            recurve_dag: Schema defining the DAG structure
+        """
+        operator_dict = {}
+
+        # Create operators for each node
+        for node_dict in recurve_dag.nodes:
+            node: SchedulerNode = init_dataclass_from_dict(SchedulerNode, node_dict)
+            node.id = int(node.id)
+
+            try:
+                operators = self.convert_node_to_operators(dag, recurve_dag, node)
+            except Exception as exc:
+                logger.exception(f"Failed to create node {dag.dag_id} {node.id}: {exc}")
+                continue
+
+            if not operators:
+                continue
+
+            # Add Recurve metadata to operators
+            doc_metadata = {
+                "recurve_node_id": node.id,
+                "recurve_node_key": node.node_key,
+            }
+
+            for operator in operators:
+                if isinstance(operator, TaskGroup):
+                    for sub_op in operator:
+                        sub_doc = json.loads(sub_op.doc_json) if sub_op.doc_json else {}
+                        sub_doc.update(doc_metadata)
+                        sub_op.doc_json = json.dumps(sub_doc)
+                else:
+                    operator.doc_json = json.dumps(doc_metadata)
+
+            operator_dict[node_dict["node_key"]] = operators
+
+        # Set up dependencies
+        already_set = set()
+        for upstream_key, downstream_key in recurve_dag.graph:
+            edge = (upstream_key, downstream_key)
+            if edge in already_set:
+                continue
+
+            if not (operator_dict.get(upstream_key) and operator_dict.get(downstream_key)):
+                continue
+
+            upstream = operator_dict[upstream_key][-1]
+            downstream = operator_dict[downstream_key][0]
+            upstream.set_downstream(downstream)
+            already_set.add(edge)
+
+    def convert_node_to_operators(self, dag: DAG, recurve_dag: DagSchema, node: SchedulerNode) -> list[BaseOperator]:
+        """Convert a DAG node to Airflow operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            recurve_dag: Schema defining the DAG
+            node: Node to convert
+
+        Returns:
+            List of created operators (possibly empty)
+        """
+        # Prepare environment
+        bash_env = self._prepare_bash_env(recurve_dag, node)
+        kwargs = {
+            "env": bash_env,
+            "executor_config": {"workflow_version": recurve_dag.workflow_version},
+        }
+
+        # Handle link operators
+        if Operator.is_link(node.operator):
+            if node.id in self.link_workflow_dict:
+                return self.convert_link_workflow_node_to_operators(dag, node, **kwargs)
+            return self.convert_link_node_to_operators(dag, node, self.link_node_dict.get(node.id), **kwargs)
+
+        # Get node-specific Airflow args
+        node_airflow_args = self.get_node_airflow_args(node)
+        kwargs.update(node_airflow_args)
+
+        operators = []
+
+        # Add latest-only operator if needed
+        if dag.schedule_interval != "@once" and node.latest_only:
+            task_id = self.format_task_id(node, "latest_only")
+            latest_only = LatestOnlyOperator(task_id=task_id, dag=dag)
+            operators.append(latest_only)
+
+        # Add skip operator if needed
+        if node.skip_downstream:
+            skip_task = ShortCircuitOperator(
+                dag=dag, task_id=self.format_task_id(node, "skip_downstream"), python_callable=lambda: False
+            )
+            operators.append(skip_task)
+
+        # Add main operator
+        task_id = self.format_task_id(node)
+        main_operator = self._create_operator(dag, node, task_id, **kwargs)
+        operators.append(main_operator)
+
+        # Add empty node after skip_self operator to ensure proper trigger rule handling
+        # Only add empty node if skip_downstream is False, to avoid conflicts
+        if node.skip_self and not node.skip_downstream:
+            empty_task_id = self.format_task_id(node, "skip_self")
+            empty_operator = EmptyOperator(task_id=empty_task_id, trigger_rule=TriggerRule.NONE_FAILED, dag=dag)
+            operators.append(empty_operator)
+
+        # Set up dependencies
+        for upstream, downstream in zip(operators[:-1], operators[1:]):
+            upstream.set_downstream(downstream)
+
+        return operators
+
+    @staticmethod
+    def _prepare_bash_env(recurve_dag: DagSchema, node: SchedulerNode) -> dict[str, Any]:
+        """Prepare bash environment variables for operators.
+
+        Args:
+            recurve_dag: DAG schema
+            node: Node to prepare environment for
+
+        Returns:
+            Dictionary of environment variables
+        """
+        env = {
+            "AIRFLOW_RETRY_NUMBER": "{{ task_instance.try_number }}",
+            "AIRFLOW_MAX_RETRY_NUMBER": "{{ task_instance.max_tries }}",
+            "AIRFLOW_DATA_INTERVAL_END": "{{ task_instance.dag_run.data_interval_end.isoformat() }}",
+            format_recurve_env_key("workflow_version"): recurve_dag.workflow_version,
+            format_recurve_env_key("node_key"): node.node_key,
+            format_recurve_env_key("job_run_conf"): "{{ dag_run.conf | tojson }}",
+        }
+
+        # Copy relevant environment variables
+        for key, value in os.environ.items():
+            if key.startswith("RECURVE__"):
+                env[key] = value
+            elif key.startswith("AIRFLOW__") and node.operator == "SensorOperator":
+                env[key] = value
+            elif key in (
+                "AIRFLOW_CTX_DAG_RUN_ID",
+                "AIRFLOW_CTX_TRY_NUMBER",
+                "AIRFLOW_CTX_EXECUTION_DATE",
+                "PATH",
+                "PYENV_ROOT",
+            ):
+                env[key] = value
+
+        return env
+
+    def _create_operator(
+        self, dag: DAG, node: SchedulerNode, task_id: str, stage: str | None = None, **kwargs
+    ) -> BaseOperator:
+        """Create an Airflow operator for a node.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to create operator for
+            task_id: ID for the task
+            stage: Optional stage name
+            **kwargs: Additional operator arguments
+
+        Returns:
+            Created operator
+        """
+        cmd = self.format_command(dag, node, stage)
+        operator_class = SkipSelfBashOperator if node.skip_self else RecurveBashOperator
+
+        return operator_class(task_id=task_id, bash_command=cmd, dag=dag, task_display_name=node.name, **kwargs)
+
+    @staticmethod
+    def format_command(dag: DAG, node: SchedulerNode, stage: str | None) -> str:
+        """Format command string for bash operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to create command for
+            stage: Optional stage name
+
+        Returns:
+            Formatted command string
+        """
+        node_slug = f"{slugify(node.name)}.{node.id}"
+
+        # Determine execution date template
+        if dag.schedule_interval == "@once" or not dag.schedule_interval:
+            execution_date = "logical_date"
+        else:
+            execution_date = "data_interval_end if data_interval_end is not none else logical_date"
+
+        # Build command options
+        options = [
+            f"--dag_slug '{dag.dag_id}'",
+            f"--node_slug '{node_slug}'",
+            "--execution_date '{{ %s }}'" % execution_date,
+        ]
+
+        if stage is not None:
+            options.append(f"--stage {stage}")
+
+        # Build full command
+        if node.operator == Operator.DBTOperator:
+            return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_DBT_CLI} execute {" ".join(options)}'
+        return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_CLI} execute {" ".join(options)}'
+
+    @staticmethod
+    def format_dag_id(row: DagSchema) -> str:
+        """Format DAG ID from schema.
+
+        Args:
+            row: DAG schema
+
+        Returns:
+            Formatted DAG ID
+        """
+        return format_dag_id(row.job_id)
+
+    @staticmethod
+    def format_task_id(node: SchedulerNode, suffix=None) -> str:
+        """Format task ID for a node.
+
+        WARNING: This function should not be modified arbitrarily as it affects
+        existing task IDs.
+
+        Args:
+            node: Node to format ID for
+            suffix: Optional suffix to append
+
+        Returns:
+            Formatted task ID
+        """
+        task_id = f"{node.node_key}"
+        if suffix:
+            task_id = f"{task_id}-{suffix}"
+        return task_id
+
+    @staticmethod
+    def format_link_node_task_id(node: SchedulerNode, suffix=None) -> str:
+        """Format task ID for a link node.
+
+        WARNING: This function should not be modified arbitrarily as it affects
+        existing task IDs.
+
+        Args:
+            node: Node to format ID for
+            suffix: Optional suffix to append
+
+        Returns:
+            Formatted task ID
+        """
+        task_id = f"{node.node_key}"
+        if suffix:
+            task_id = f"{task_id}-{suffix}"
+        return task_id
+
+    @staticmethod
+    def get_node_airflow_args(node: SchedulerNode) -> dict:
+        """Get Airflow arguments for a node.
+
+        Args:
+            node: Node to get arguments for
+
+        Returns:
+            Dictionary of Airflow arguments
+        """
+        scheduler_settings = node.scheduler_settings or {}
+
+        # Get explicit Airflow args
+        if "airflow_args" in scheduler_settings:
+            airflow_args = json.loads(scheduler_settings["airflow_args"])
+        else:
+            airflow_args = {}
+
+        # Process other Airflow settings
+        for key, value in scheduler_settings.items():
+            if key == "airflow_args" or not key.startswith("airflow"):
+                continue
+
+            # Strip the "airflow_" prefix (str.lstrip strips characters, not a prefix)
+            key = key.removeprefix("airflow_")
+
+            # Convert time values to timedelta
+            if key in ["execution_timeout", "retry_delay", "sla"] and isinstance(value, (int, float)):
+                value = datetime.timedelta(seconds=value)
+
+            airflow_args[key] = value
+
+        return airflow_args
+
+    @staticmethod
+    def _clean_airflow_args(airflow_args: dict[str, Any] | None) -> dict[str, Any] | None:
+        """Clean Airflow arguments to only include valid parameters.
+
+        Args:
+            airflow_args: Arguments to clean
+
+        Returns:
+            Cleaned arguments dictionary
+        """
+        if not airflow_args:
+            return airflow_args
+        return extract_dict(airflow_args, list(AIRFLOW_DAG_INIT_PARAMS))
+
+    def __create_link_operator(
+        self,
+        dag: DAG,
+        node: SchedulerNode,
+        link_node: SchedulerNode,
+        link_item: LinkNodeItem,
+        task_id: str,
+        stage: str | None = None,
+        is_workflow: bool = False,
+        **kwargs,
+    ) -> LinkNodeBashOperator:
+        """Create a link node operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Parent node
+            link_node: Link node to create operator for
+            link_item: Link node details
+            task_id: ID for the task
+            stage: Optional stage name
+            is_workflow: Whether this is part of a workflow
+            **kwargs: Additional operator arguments
+
+        Returns:
+            Created link node operator
+        """
+        cmd = self.format_link_node_command(dag, node, link_item, stage, is_workflow)
+        operator_class = SkipSelfBashOperator if link_node.skip_self else LinkNodeBashOperator
+
+        return operator_class(
+            task_id=task_id,
+            bash_command=cmd,
+            dag=dag,
+            task_display_name=f"{node.name}.{link_item.link_node_name}",
+            **kwargs,
+        )
+
+    @staticmethod
+    def format_link_node_command(
+        dag: DAG, node: SchedulerNode, link_detail: LinkNodeItem, stage: str | None, is_workflow: bool
+    ) -> str:
+        """Format command for link node operator.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Parent node
+            link_detail: Link node details
+            stage: Optional stage name
+            is_workflow: Whether this is part of a workflow
+
+        Returns:
+            Formatted command string
+        """
+        node_slug = f"{slugify(node.name)}.{node.id}"
+        execution_date = "logical_date" if dag.schedule_interval == "@once" else "data_interval_end"
+
+        # Build command options
+        options = [
+            f"--dag_slug '{dag.dag_id}'",
+            f"--node_slug '{node_slug}'",
+            "--execution_date '{{ %s }}'" % execution_date,
+            f"--link_workflow_id {link_detail.link_wf_id}",
+            f"--link_node_id {link_detail.link_node_id}",
+        ]
+
+        if stage is not None:
+            options.append(f"--stage {stage}")
+
+        if is_workflow:
+            options.append("--is_link_workflow")
+
+        # Build full command
+        if link_detail.link_operator == Operator.DBTOperator:
+            return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_DBT_CLI} execute {" ".join(options)}'
+        return f'cd {WORK_DIR} && {RECURVE_EXECUTOR_CLI} execute {" ".join(options)}'
+
+    def convert_link_workflow_node_to_operators(self, dag: DAG, node: SchedulerNode, **kwargs) -> list[BaseOperator]:
+        """Convert a link workflow node to operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators (empty if the workflow is unknown)
+        """
+        link_workflow_item: LinkWorkflowItem = self.link_workflow_dict.get(node.id)
+        if not link_workflow_item:
+            return []
+
+        operators = []
+
+        # Add latest-only operator if needed
+        if dag.schedule_interval != "@once" and node.latest_only:
+            task_id = self.format_task_id(node, "latest_only")
+            latest_only = LatestOnlyOperator(task_id=task_id, dag=dag)
+            operators.append(latest_only)
+
+        # Add skip operator if needed
+        if node.skip_downstream:
+            skip_task = ShortCircuitOperator(
+                task_id=self.format_task_id(node, "skip_downstream"), python_callable=lambda: False, dag=dag
+            )
+            operators.append(skip_task)
+
+        # Save original node properties
+        node_original_name = node.name
+        node_original_key = node.node_key
+
+        has_inner_skip_downstream = False
+        has_inner_latest_only = False
+        link_end_task_id = self.format_task_id(node, "link_end")
+        latest_only_task_id = self.format_task_id(node, "latest_only2")
+
+        # Create task group
+        group_id = f"{node.node_key}"
+        with TaskGroup(group_id=group_id, dag=dag) as task_group:
+            operator_dict = {}
+
+            # Process each link node
+            for link_item in link_workflow_item.link_nodes:
+                link_plan_id = str(link_item.plan_id) if link_item.plan_id else dag.dag_id
+                if link_plan_id != dag.dag_id:
+                    logger.warning(
+                        f"Link node {link_item.link_node_key} is not in the same plan as the current DAG, link_plan_id: {link_plan_id}, dag_id: {dag.dag_id}"
+                    )
+                    continue
+
+                node.node_key = link_item.link_node_key
+
+                # Prepare environment
+                tmp_kwargs = copy.deepcopy(kwargs)
+                tmp_env = tmp_kwargs.get("env", {})
+                tmp_env.update(
+                    {
+                        format_recurve_env_key("link_workflow_version"): link_item.link_wf_version,
+                        format_recurve_env_key("link_node_key"): link_item.link_node_key,
+                        format_recurve_env_key("node_key"): f"{group_id}.{node.node_key}",
+                    }
+                )
+                tmp_kwargs["env"] = tmp_env
+
+                # Update executor config (read from the copied kwargs so the shared
+                # executor_config is not mutated across link nodes)
+                tmp_executor_config = tmp_kwargs.get("executor_config", {})
+                tmp_executor_config.update(
+                    {
+                        "link_workflow_id": link_item.link_wf_id,
+                        "link_workflow_version": link_item.link_wf_version,
+                    }
+                )
+                tmp_kwargs["executor_config"] = tmp_executor_config
+
+                # Create operators
+                tmp_ops = self._convert_link_node_to_operators(
+                    dag, node, link_item, is_workflow=True, workflow_skip_self=node.skip_self, **tmp_kwargs
+                )
+                operator_dict[link_item.link_node_key] = tmp_ops
+
+                # Track special operators
+                for op in tmp_ops:
+                    if isinstance(op, ShortCircuitOperator):
+                        has_inner_skip_downstream = True
+                    if isinstance(op, LatestOnlyOperator):
+                        has_inner_latest_only = True
+
+            # Set up dependencies within group
+            for upstream_key, downstream_key in link_workflow_item.link_graph:
+                if not (operator_dict.get(upstream_key) and operator_dict.get(downstream_key)):
+                    continue
+
+                upstream = operator_dict[upstream_key][-1]
+                downstream = operator_dict[downstream_key][0]
+                upstream.set_downstream(downstream)
+
+        operators.append(task_group)
+
+        # Add end task if needed
+        if (has_inner_skip_downstream or has_inner_latest_only) and not node.skip_downstream:
+            operators.append(
+                EmptyOperator(
+                    task_id=link_end_task_id,
+                    trigger_rule=TriggerRule.NONE_FAILED,
+                    dag=dag,
+                )
+            )
+
+        # Add second latest-only operator if needed
+        if node.latest_only and has_inner_skip_downstream:
+            latest_only2 = LatestOnlyOperator(task_id=latest_only_task_id, dag=dag)
+            operators.append(latest_only2)
+
+        # Set up dependencies between operators
+        for upstream, downstream in zip(operators[:-1], operators[1:]):
+            upstream.set_downstream(downstream)
+
+        # Restore original node properties
+        node.name = node_original_name
+        node.node_key = node_original_key
+
+        return operators
+
+    def convert_link_node_to_operators(self, dag: DAG, node: SchedulerNode, link_item: LinkNodeItem, **kwargs) -> list:
+        """Convert a link node to operators.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            link_item: Link node details
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators
+        """
+        operators = []
+        parent_node_key = node.node_key
+
+        with TaskGroup(group_id=node.node_key, dag=dag) as task_group:
+            node.node_key = link_item.link_node_key
+
+            # Prepare environment
+            tmp_kwargs = copy.deepcopy(kwargs)
+            tmp_env = tmp_kwargs.get("env", {})
+            tmp_env.update(
+                {
+                    format_recurve_env_key("link_workflow_version"): link_item.link_wf_version,
+                    format_recurve_env_key("link_node_key"): link_item.link_node_key,
+                    format_recurve_env_key("node_key"): f"{parent_node_key}.{link_item.link_node_key}",
+                }
+            )
+            tmp_kwargs["env"] = tmp_env
+
+            # Update executor config
+            tmp_executor_config = tmp_kwargs.get("executor_config", {})
+            tmp_executor_config.update(
+                {
+                    "link_workflow_id": link_item.link_wf_id,
+                    "link_workflow_version": link_item.link_wf_version,
+                }
+            )
+            tmp_kwargs["executor_config"] = tmp_executor_config
+
+            self._convert_link_node_to_operators(dag, node, link_item, **tmp_kwargs)
+
+        operators.append(task_group)
+        return operators
+
+    def _convert_link_node_to_operators(
+        self,
+        dag: DAG,
+        node: SchedulerNode,
+        link_item: LinkNodeItem,
+        is_workflow: bool = False,
+        workflow_skip_self: bool = False,
+        **kwargs,
+    ) -> list[BaseOperator]:
+        """Internal helper to convert a link node to operators.
+
+        Creates a sequence of operators for a link node, handling latest-only checks,
+        skip conditions, and the main link node operation.
+
+        Args:
+            dag: Parent Airflow DAG
+            node: Node to convert
+            link_item: Link node details
+            is_workflow: Whether this is part of a workflow
+            workflow_skip_self: Whether workflow has skip_self enabled
+            **kwargs: Additional operator arguments
+
+        Returns:
+            List of created operators
+        """
+        operators = []
+        # if not link_item:
+        #     task_id = self.format_task_id(node)
+        #     operators.append(None)  # TODO: Add fallback operator
+        #     return operators
+
+        # Determine node execution properties based on workflow context
+        if not is_workflow:
+            skip_downstream = node.skip_downstream
+            latest_only = node.latest_only or link_item.link_latest_only
+            skip_self = node.skip_self or link_item.link_skip_self or link_item.link_skip_downstream
+        else:
+            skip_downstream = link_item.link_skip_downstream
+            latest_only = link_item.link_latest_only
+            skip_self = link_item.link_skip_self or workflow_skip_self
+
+        # Create link node with inherited properties
+        link_node = SchedulerNode(
+            operator=link_item.link_operator,
+            node_key=link_item.link_node_key,
+            name=link_item.link_node_name,
+            id=link_item.link_node_id,
+            scheduler_settings=link_item.link_scheduler_settings,
+            skip_self=skip_self,
+            skip_downstream=skip_downstream,
+            latest_only=latest_only,
+        )
+
+        # Merge Airflow arguments from parent and link nodes
+        parent_airflow_args = self.get_node_airflow_args(node)
+        link_airflow_args = self.get_node_airflow_args(link_node)
+        if parent_airflow_args:
+            link_airflow_args.update(parent_airflow_args)
+        kwargs.update(link_airflow_args)
+
+        # Add latest-only check for scheduled DAGs
+        if dag.schedule_interval != "@once" and link_node.latest_only:
+            latest_only_task_id = self.format_task_id(node, "latest_only")
+            latest_only_op = LatestOnlyOperator(task_id=latest_only_task_id, dag=dag)
+            operators.append(latest_only_op)
+
+        # Add skip operator if downstream tasks should be skipped
+        if link_node.skip_downstream:
+            skip_task_id = self.format_task_id(node, "skip_downstream")
+            skip_args = {"ignore_downstream_trigger_rules": False} if is_workflow else {}
+
+            skip_op = ShortCircuitOperator(task_id=skip_task_id, python_callable=lambda: False, dag=dag, **skip_args)
+            operators.append(skip_op)
+
+        # Create main link operator
+        main_task_id = self.format_task_id(node)
+        if Operator.is_link(link_node.operator):
+            # Prevent nested link operators
+            main_op = self.__create_link_operator(
+                dag=dag,
+                node=node,
+                link_node=link_node,
+                link_item=link_item,
+                task_id=main_task_id,
+                is_workflow=is_workflow,
+            )
+        else:
+            # Add workflow metadata to executor config
+            executor_config = copy.deepcopy(kwargs)
+            executor_config["executor_config"].update(
+                {
+                    "link_workflow_id": link_item.link_wf_id,
+                    "link_workflow_version": link_item.link_wf_version,
+                }
+            )
+
+            main_op = self.__create_link_operator(
+                dag=dag,
+                node=node,
+                link_node=link_node,
+                link_item=link_item,
+                task_id=main_task_id,
+                is_workflow=is_workflow,
+                **executor_config,
+            )
+        operators.append(main_op)
+
+        # Add empty node after skip_self operator to ensure proper trigger rule handling
+        # Only add empty node if skip_downstream is False, to avoid conflicts
+        if link_node.skip_self and not link_node.skip_downstream:
+            empty_task_id = self.format_task_id(node, "skip_self")
+            empty_operator = EmptyOperator(task_id=empty_task_id, trigger_rule=TriggerRule.NONE_FAILED, dag=dag)
+            operators.append(empty_operator)
+
+        # Set up dependencies between operators
+        for upstream_op, downstream_op in zip(operators[:-1], operators[1:]):
+            upstream_op.set_downstream(downstream_op)
+
+        return operators
+
+    def _on_finished_callback(self, callback_context):
+        dag_run: DagRun = callback_context["dag_run"]
+        tis: list[TaskInstance] = dag_run.get_task_instances()
+        task_info_map = {}
+        for ti in tis:
+            task_info_map[ti.task_id] = {
+                "state": ti.state,
+                "task_display_name": ti.task_display_name,
+            }
+
+        job_run_result = {
+            "job_id": getattr(callback_context["dag"], "job_id", None),  # from Recurve metadata
+            "run_id": callback_context["run_id"],
+            "task_info_map": task_info_map,
+            "state": dag_run.get_state(),
+            "data_interval_end": dag_run.data_interval_end.isoformat(),
+        }
+        self.client.on_job_run_finished(job_run_result)
+
+
+if __name__ == "__main__":
+    scheduler = AirflowScheduler()
+    globals().update(scheduler.execute())
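
For context: AirflowScheduler.execute() returns a dict[str, DAG], so a scheduler entry file only needs to merge that mapping into its module globals for Airflow's DAG file parser to discover the generated DAGs; the __main__ block above does exactly that when the module is run directly. A minimal sketch of such an entry file (the file name and sharding values are illustrative assumptions, not part of the package):

    # dags/recurve_autogen.py (hypothetical DAG-bag shim, not shipped in this wheel)
    from recurvedata.schedulers.airflow import AirflowScheduler

    # Build this shard's DAGs and expose each one as a module-level
    # variable so Airflow's DAG file parser picks them up.
    scheduler = AirflowScheduler(sharding_size=4, sharding_key=0)
    globals().update(scheduler.execute())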