recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,331 @@
1
+ import datetime
2
+ import logging
3
+ from typing import Any, Optional
4
+
5
+ from airflow.api.common.mark_tasks import set_dag_run_state_to_failed, set_dag_run_state_to_success
6
+ from airflow.api.common.trigger_dag import trigger_dag
7
+ from airflow.models import DAG, DagModel, DagRun, TaskInstance
8
+ from airflow.models.serialized_dag import SerializedDagModel
9
+ from airflow.utils.session import create_session, provide_session
10
+ from airflow.utils.state import TaskInstanceState
11
+ from sqlalchemy import Index, Table
12
+ from sqlalchemy.orm import Session
13
+ from sqlalchemy.schema import CreateIndex
14
+
15
+ from recurvedata.utils.date_time import to_local_datetime, utcnow
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class AirflowDbService:
    """Direct Airflow metadata-database operations used by the Recurve scheduler.

    All methods act straight on the Airflow DB models (``DagModel`` /
    ``DagRun`` / ``TaskInstance``) instead of going through the REST API.
    """

    @classmethod
    def update_dag(cls, dag: DAG):
        """Write *dag* to the DAG table and refresh its serialized representation."""
        logger.info(f"start sync dag {dag.dag_id} to serialized_dag")
        DAG.sync_to_db(dag)
        SerializedDagModel.write_dag(dag)
        logger.info(f"finish sync {dag.dag_id} to serialized_dag")

    @classmethod
    def activate_dag(cls, dag: DAG):
        """Un-pause *dag* so the scheduler starts picking it up.

        Raises:
            ValueError: if the DAG is not present in the DagModel table.
        """
        with create_session() as session:
            d = session.query(DagModel).filter(DagModel.dag_id == dag.dag_id).one_or_none()
            if not d:
                raise ValueError(f"dag not exists: {dag.dag_id}")

            if d.is_paused is False:
                logger.info(f"{dag.dag_id} is active, no need to activate")
                return

            logger.info(f"start activate_dag dag {dag.dag_id}")

            d.is_paused = False
            session.merge(d)
            session.commit()

            logger.info(f"finish activate_dag dag {dag.dag_id}")

    @classmethod
    def deactivate_dag(cls, dag: DAG):
        """Pause *dag* so the scheduler stops creating new runs.

        Raises:
            ValueError: if the DAG is not present in the DagModel table.
        """
        with create_session() as session:
            d = session.query(DagModel).filter(DagModel.dag_id == dag.dag_id).one_or_none()
            if not d:
                raise ValueError(f"dag not exists: {dag.dag_id}")

            if d.is_paused is True:
                logger.info(f"{dag.dag_id} is deactive, no need to deactivate")
                return

            logger.info(f"start deactivate_dag dag {dag.dag_id}")

            d.is_paused = True
            session.merge(d)
            session.commit()

            logger.info(f"finish deactivate_dag dag {dag.dag_id}")

    @classmethod
    def delete_dag(cls, dag_id: str, job_name: str):
        """Soft-delete a DAG: pause it and mark it inactive (DB rows are kept)."""
        with create_session() as session:
            d: DagModel = session.query(DagModel).filter(DagModel.dag_id == dag_id).one_or_none()
            if not d:
                logger.warning(f"dag not exists: {dag_id}")
                return

            logger.info(f"start delete_dag dag {job_name} {dag_id}")
            d.is_paused = True
            d.is_active = False
            session.merge(d)
            session.commit()

            logger.info(f"finish delete_dag dag {job_name} {dag_id}")
            # todo(chenjingmeng): delete dag

    @classmethod
    def trigger_job_run(
        cls,
        dag: DAG,
        execution_date: datetime.datetime,
        include_past: bool,
        include_future: bool,
        run_type: str,
        conf: dict[str, Any] | None = None,
    ):
        """Trigger a run for *execution_date*, optionally back/forward-filling.

        When *include_past* is set, runs are created for every schedule point
        back to the DAG start date; *include_future* fills forward up to the
        most recent schedule point before "now". Existing runs are skipped.
        """
        execution_date_ds = execution_date.isoformat()
        run_id = f"{run_type}__{execution_date_ds}"
        reference_date = to_local_datetime(execution_date_ds)
        current_date = utcnow()
        # May be None for DAGs without a time-based schedule.
        airflow_current_date = dag.previous_schedule(current_date)

        if include_past:
            airflow_start_date = dag.start_date or dag.default_args.get("start_date")
            if airflow_start_date:
                tmp_date = dag.previous_schedule(reference_date)
                # previous_schedule() returns None for unscheduled DAGs; stop then too.
                while tmp_date is not None and tmp_date >= airflow_start_date:
                    cls._trigger_run_if_not_exists(
                        dag.dag_id, run_id=f"{run_type}__{tmp_date.isoformat()}", execution_date=tmp_date, conf=conf
                    )
                    tmp_date = dag.previous_schedule(tmp_date)

        if include_future:
            tmp_date = dag.following_schedule(reference_date)
            # Guard both bounds: following_schedule()/previous_schedule() may return None.
            while tmp_date is not None and airflow_current_date is not None and tmp_date <= airflow_current_date:
                cls._trigger_run_if_not_exists(
                    dag.dag_id, run_id=f"{run_type}__{tmp_date.isoformat()}", execution_date=tmp_date, conf=conf
                )
                tmp_date = dag.following_schedule(tmp_date)

        cls._trigger_run_if_not_exists(dag.dag_id, run_id, execution_date=execution_date, conf=conf)

    @staticmethod
    def _trigger_run_if_not_exists(
        dag_id: str, run_id: str, execution_date: datetime.datetime, conf: dict[str, Any] | None = None
    ):
        """Create a DagRun via ``trigger_dag`` unless one with *run_id* already exists."""
        # Applies the project's patched trigger_dag behavior as an import side effect.
        import recurvedata.schedulers.airflow_trigger_dag_patch  # noqa

        with create_session() as session:
            existing_run = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).first()
            if existing_run:
                logger.info(f"Skipping existing run for {dag_id} at {run_id}")
                return
            logger.info(f"start trigger dag_run for {dag_id} at {run_id}")
            trigger_dag(dag_id, run_id=run_id, execution_date=execution_date, conf=conf, replace_microseconds=False)
            logger.info(f"finished trigger dag_run for {dag_id} at {run_id} execution_date: {execution_date}")

    @staticmethod
    @provide_session
    def _get_rerun_earliest_execution_date(dag: DAG, session: Session = None) -> Optional[datetime.datetime]:
        """Return the execution_date of the oldest DagRun of *dag*, or None if no runs exist."""
        earliest_dag_run = (
            session.query(DagRun).filter(DagRun.dag_id == dag.dag_id).order_by(DagRun.execution_date).first()
        )
        return earliest_dag_run and earliest_dag_run.execution_date

    @classmethod
    def rerun_job_run(
        cls,
        dag: DAG,
        run_id: Optional[str],
        min_execution_date: Optional[datetime.datetime],
        max_execution_date: Optional[datetime.datetime],
        failed_only: bool,
    ):
        """Clear (rerun) the matched DagRuns of *dag*.

        Runs are selected by *run_id* and/or the execution-date window; if
        nothing matches, nothing is cleared.
        """
        drs: list[DagRun] = DagRun.find(
            dag_id=dag.dag_id,
            run_id=run_id,
            execution_start_date=min_execution_date,
            execution_end_date=max_execution_date,
        )
        if not drs:
            logger.info(f"skip rerun, no dag_run found for {dag.dag_id} at {run_id}")
            return
        clear_start_date = min([dr.execution_date for dr in drs])
        clear_end_date = max([dr.execution_date for dr in drs])

        logger.info(
            f"prepare to clear dag_run for {dag.dag_id}, start_date: {clear_start_date}, end_date: {clear_end_date}, failed_only: {failed_only}"
        )

        dag.clear(
            start_date=clear_start_date,
            end_date=clear_end_date,
            only_failed=failed_only,
        )

    @classmethod
    def rerun_task_run(
        cls,
        dag: DAG,
        run_id: str,
        node_key: str,
        min_execution_date: Optional[datetime.datetime],
        max_execution_date: Optional[datetime.datetime],
        include_upstream: bool,
        include_downstream: bool,
        failed_only: bool,
    ):
        """Clear (rerun) the task instances whose task_id starts with *node_key*.

        Optionally expands the cleared set with each matched task's direct
        upstream and/or downstream tasks before clearing.
        """
        drs: list[DagRun] = DagRun.find(
            dag_id=dag.dag_id,
            run_id=run_id,
            execution_start_date=min_execution_date,
            execution_end_date=max_execution_date,
        )
        if not drs:
            logger.info(f"skip rerun, no dag_run found for {dag.dag_id} at {run_id}")
            return
        clear_start_date = min([dr.execution_date for dr in drs])
        clear_end_date = max([dr.execution_date for dr in drs])

        # A node may map to several task_ids (prefix match against the DAG's tasks).
        clear_task_ids: list[str] = []
        for task_id in dag.task_dict.keys():
            if task_id.startswith(node_key):
                clear_task_ids.append(task_id)

        expanded_task_ids = set(clear_task_ids)
        if include_upstream or include_downstream:
            if include_upstream:
                for task_id in clear_task_ids:
                    if task_id in dag.task_dict:
                        task = dag.task_dict[task_id]
                        upstream_task_ids = [t.task_id for t in task.upstream_list]
                        expanded_task_ids.update(upstream_task_ids)

            if include_downstream:
                for task_id in clear_task_ids:
                    if task_id in dag.task_dict:
                        task = dag.task_dict[task_id]
                        downstream_task_ids = [t.task_id for t in task.downstream_list]
                        expanded_task_ids.update(downstream_task_ids)

            clear_task_ids = list(expanded_task_ids)

        logger.info(
            f"prepare to clear task: {dag.dag_id}, {clear_task_ids} start_date: {clear_start_date}, end_date: {clear_end_date}, failed_only: {failed_only}"
        )

        clear_cnt = dag.clear(
            task_ids=clear_task_ids,
            start_date=clear_start_date,
            end_date=clear_end_date,
            only_failed=failed_only,
        )
        logger.info(f"finish clear task: {dag.dag_id}, {clear_task_ids}, total clear: {clear_cnt} task_instances")

    @classmethod
    def init_airflow_tables(cls):
        """Create updated_at indexes on dag_run / task_instance if missing.

        NOTE(review): index existence is checked via ``pg_indexes`` — this is
        PostgreSQL-specific, and the raw-string ``session.execute`` relies on
        SQLAlchemy 1.x behavior (2.0 requires ``text()``); confirm before
        upgrading either dependency.
        """
        from airflow.settings import engine
        from airflow.utils.db import reflect_tables

        def _is_index_exists(session: Session, table_name: str, index_name: str) -> bool:
            # table_name / index_name are internal constants, so !r interpolation is safe here.
            query = f"""
            SELECT EXISTS (
                SELECT 1
                FROM pg_indexes
                WHERE tablename = {table_name!r}
                AND indexname = {index_name!r}
            )
            """
            result = session.execute(query)
            return result.scalar()

        with create_session() as session:
            metadata = reflect_tables(tables=["dag_run", "task_instance"], session=session)
            dag_run = Table("dag_run", metadata, autoload_with=engine)
            task_instance = Table("task_instance", metadata, autoload_with=engine)

            dag_run_updated_at_idx = Index("ix_dag_run_updated_at", dag_run.c.updated_at)
            task_instance_updated_at_idx = Index("ix_task_instance_updated_at", task_instance.c.updated_at)

            with engine.connect():
                if not _is_index_exists(session, "dag_run", "ix_dag_run_updated_at"):
                    logger.info("start creating index on dag_run.updated_at")
                    session.execute(CreateIndex(dag_run_updated_at_idx))
                    logger.info("Created index on dag_run.updated_at")
                else:
                    logger.info("Skipped creating index on dag_run.updated_at")

                if not _is_index_exists(session, "task_instance", "ix_task_instance_updated_at"):
                    logger.info("start creating index on task_instance.updated_at")
                    session.execute(CreateIndex(task_instance_updated_at_idx))
                    logger.info("Created index on task_instance.updated_at")
                else:
                    logger.info("Skipped creating index on task_instance.updated_at")

    @classmethod
    def mark_dag_run_success(cls, dag: DAG, run_id: Optional[str] = None, whole_dag: bool = False):
        """Mark one run (or, with *whole_dag*, every run) of *dag* as success."""
        if not run_id:
            if not whole_dag:
                logger.info("mark_dag_run need a run_id, skip mark_dag_run")
                return
            run_ids = cls._get_dag_run_ids(dag)
            for run_id in run_ids:
                cls.mark_dag_run_success(dag, run_id)
            return
        logger.info(f"start mark dag run {dag.dag_id} {run_id} to success")
        set_dag_run_state_to_success(dag=dag, run_id=run_id, commit=True)

    @classmethod
    def mark_dag_run_failed(cls, dag: DAG, run_id: Optional[str] = None, whole_dag: bool = False):
        """Mark one run (or, with *whole_dag*, every run) of *dag* as failed.

        will mark un-running tasks to skipped;
        mark running tasks to failed;
        keep finished tasks the same.
        """
        if not run_id:
            if not whole_dag:
                logger.info("mark_dag_run need a run_id, skip mark_dag_run")
                return
            run_ids = cls._get_dag_run_ids(dag)
            for run_id in run_ids:
                cls.mark_dag_run_failed(dag, run_id)
            return
        # Bug fix: previous message said "to success" while setting the run to failed.
        logger.info(f"start mark dag run {dag.dag_id} {run_id} to failed")
        set_dag_run_state_to_failed(dag=dag, run_id=run_id, commit=True)

    @staticmethod
    @provide_session
    def _get_dag_run_ids(dag: DAG, session: Session = None) -> list[str]:
        """Return the run_id of every DagRun belonging to *dag*."""
        query = session.query(DagRun.run_id).filter(DagRun.dag_id == dag.dag_id)
        return [res[0] for res in query.all()]

    @classmethod
    @provide_session
    def delete_whole_dag_dr_ti(cls, dag: DAG, session: Session = None):
        """Hard-delete every DagRun and TaskInstance row belonging to *dag*."""
        logger.info(f"start delete whole dag_run and task_instance for {dag.dag_id}")
        for model in (TaskInstance, DagRun):
            session.query(model).filter(model.dag_id == dag.dag_id).delete(synchronize_session="fetch")
        logger.info(f"finish deleted whole dag_run and task_instance for {dag.dag_id}")

    @staticmethod
    @provide_session
    def _set_task_run_state(dag: DAG, run_id: str, node_key: str, state: TaskInstanceState, session: Session = None):
        """Force the task instance *node_key* of the given run into *state*."""
        logger.info(f"start set task_run {dag.dag_id} {run_id} {node_key} to {state}")
        dag.set_task_instance_state(
            task_id=node_key,
            run_id=run_id,
            state=state,
            session=session,
        )
        logger.info(f"finish set task_run {dag.dag_id} {run_id} {node_key} to {state}")

    @staticmethod
    def terminate_task_run(dag: DAG, run_id: str, node_key: str):
        """Terminate a running task instance by forcing its state to FAILED."""
        AirflowDbService._set_task_run_state(dag, run_id, node_key, TaskInstanceState.FAILED)
@@ -0,0 +1,61 @@
1
+ import logging
2
+
3
+ from airflow.exceptions import AirflowSkipException
4
+ from airflow.models import TaskInstance
5
+ from airflow.operators.bash import BashOperator
6
+ from airflow.utils.context import Context
7
+ from airflow.utils.task_instance_session import get_current_task_instance_session
8
+ from sqlalchemy.orm.attributes import flag_modified
9
+
10
+ from recurvedata.executors.utils import read_meta_file
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class RecurveBashOperator(BashOperator):
    """BashOperator that persists recurve meta-file content into the task
    instance's ``executor_config`` after the command runs (success or failure).
    """

    def execute(self, context: Context):
        # Keep the update inside the try so a failing update on the success
        # path is retried once by the except branch, matching the original flow.
        try:
            result = super().execute(context)
            self.update_meta_to_task_instance_executor_config(context)
            return result
        except Exception:
            # Persist meta even when the bash command fails, then re-raise.
            self.update_meta_to_task_instance_executor_config(context)
            raise

    @staticmethod
    def read_meta_file(context: Context) -> dict:
        """Load the meta file written for this (dag, task, data date) triple."""
        dag_id = context["dag"].dag_id
        task_id = context["ti"].task_id
        data_date = context["next_execution_date"] or context["execution_date"]
        return read_meta_file(dag_id, task_id, data_date)

    def update_meta_to_task_instance_executor_config(self, context: Context):
        """Merge the meta-file content into the TI's executor_config column."""
        meta = self.read_meta_file(context)
        if not meta:
            return
        logger.debug(f"update_meta_to_task_instance_executor_config: {str(meta)}")
        session = get_current_task_instance_session()
        ti = TaskInstance.get_task_instance(
            dag_id=context["dag"].dag_id,
            task_id=context["ti"].task_id,
            run_id=context["dag_run"].run_id,
            map_index=-1,
            session=session,
        )
        if not ti:
            return
        ti.executor_config.update(meta)
        # In-place dict mutation is invisible to SQLAlchemy; flag it dirty.
        flag_modified(ti, "executor_config")
47
+
48
+
49
class SkipSelfBashOperator(BashOperator):
    """Operator that never runs its bash command: it always skips itself."""

    ui_color = "#e8f7e4"

    def execute(self, context):
        # Raising AirflowSkipException marks this task instance as SKIPPED.
        raise AirflowSkipException("This task is skipped")
54
+
55
+
56
class LinkNodeBashOperator(RecurveBashOperator):
    # Same behavior as RecurveBashOperator; only the graph-view color differs.
    ui_color = "#8DEEEE"
58
+
59
+
60
class LinkErrorBashOperator(BashOperator):
    # Plain BashOperator variant; the color is declared but, per the original
    # note, not used anywhere visible here.
    ui_color = "red"  # not used
@@ -0,0 +1,9 @@
1
+ from airflow.plugins_manager import AirflowPlugin
2
+
3
+
4
class RecurveAirflowPlugin(AirflowPlugin):
    """Airflow plugin entry point for recurvedata."""

    name = "recurvedata"

    @classmethod
    def on_load(cls, *args, **kwargs):
        # Imported purely for its import-time side effects; noqa keeps the
        # otherwise-unused import. NOTE(review): assumed the module registers
        # celery debug hooks based on its name — confirm in that module.
        import recurvedata.schedulers.debug_celery  # noqa: F401
@@ -0,0 +1,117 @@
1
+ """
2
+ monkey patch airflow airflow/api/common/trigger_dag.py,
3
+ airflow native _trigger_dag will create data_interval_end = execution_date dag run,
4
+ which will cause plan run error (one data_interval_end may have multiple run_id).
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ from datetime import datetime
10
+
11
+ import airflow.api.common.trigger_dag
12
+ from airflow.exceptions import DagNotFound, DagRunAlreadyExists
13
+ from airflow.models import DagBag, DagRun
14
+ from airflow.models.dag import DAG
15
+ from airflow.timetables.base import DataInterval
16
+ from airflow.timetables.interval import CronDataIntervalTimetable
17
+ from airflow.utils import timezone
18
+ from airflow.utils.state import DagRunState
19
+ from airflow.utils.types import DagRunType
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def _recurve_get_next_data_interval(mannual_data_interval: DataInterval, dag: DAG):
    """Return the interval one cron step after *mannual_data_interval*.

    Non-cron timetables are returned unchanged. Relies on the private
    ``CronDataIntervalTimetable._get_next`` to advance the interval end.
    """
    if isinstance(dag.timetable, CronDataIntervalTimetable):
        interval_end = mannual_data_interval.end
        return DataInterval(start=interval_end, end=dag.timetable._get_next(interval_end))
    return mannual_data_interval
29
+
30
+
31
def _recurve_trigger_dag(
    dag_id: str,
    dag_bag: DagBag,
    run_id: str | None = None,
    conf: dict | str | None = None,
    execution_date: datetime | None = None,
    replace_microseconds: bool = True,
) -> list[DagRun | None]:
    """Triggers DAG run (recurve replacement for airflow's ``_trigger_dag``).

    Differences from the native implementation (see "recurve update" markers):
    when the supplied ``run_id`` infers a SCHEDULED run type, the manual data
    interval is shifted one cron step forward and ``external_trigger`` is set
    False, so the created run looks like a scheduler-produced one.

    :param dag_id: DAG ID
    :param dag_bag: DAG Bag model
    :param run_id: ID of the dag_run
    :param conf: configuration
    :param execution_date: date of execution
    :param replace_microseconds: whether microseconds should be zeroed
    :return: list of triggered dags
    """
    logger.info("start call _recurve_trigger_dag")
    dag = dag_bag.get_dag(dag_id)  # prefetch dag if it is stored serialized

    if dag is None or dag_id not in dag_bag.dags:
        raise DagNotFound(f"Dag id {dag_id} not found")

    # Default to "now" in UTC; reject naive datetimes outright.
    execution_date = execution_date or timezone.utcnow()

    if not timezone.is_localized(execution_date):
        raise ValueError("The execution_date should be localized")

    if replace_microseconds:
        execution_date = execution_date.replace(microsecond=0)

    # Refuse to trigger before the DAG's declared start_date, if any.
    if dag.default_args and "start_date" in dag.default_args:
        min_dag_start_date = dag.default_args["start_date"]
        if min_dag_start_date and execution_date < min_dag_start_date:
            raise ValueError(
                f"The execution_date [{execution_date.isoformat()}] should be >= start_date "
                f"[{min_dag_start_date.isoformat()}] from DAG's default_args"
            )
    logical_date = timezone.coerce_datetime(execution_date)

    data_interval = dag.timetable.infer_manual_data_interval(run_after=logical_date)

    # recurve update start #
    # A run_id shaped like a scheduled one means recurve is replaying a plan
    # run: advance the interval one step and drop the external_trigger flag.
    recurve_external_trigger = True
    inferred_run_type = DagRunType.from_run_id(run_id)
    if inferred_run_type == DagRunType.SCHEDULED:
        new_data_interval = _recurve_get_next_data_interval(data_interval, dag)
        logger.info(f"adjust data interval: {data_interval} -> {new_data_interval}")
        data_interval = new_data_interval
        recurve_external_trigger = False
    # recurve update end #

    run_id = run_id or dag.timetable.generate_run_id(
        run_type=DagRunType.MANUAL, logical_date=logical_date, data_interval=data_interval
    )
    dag_run = DagRun.find_duplicate(dag_id=dag_id, execution_date=execution_date, run_id=run_id)

    if dag_run:
        raise DagRunAlreadyExists(dag_run=dag_run, execution_date=execution_date, run_id=run_id)

    # conf may arrive either as a dict or as a JSON string.
    run_conf = None
    if conf:
        run_conf = conf if isinstance(conf, dict) else json.loads(conf)

    # recurve update start #
    # Create one run for the DAG plus one per subdag, all QUEUED.
    dag_runs = []
    dags_to_run = [dag, *dag.subdags]
    for _dag in dags_to_run:
        dag_run = _dag.create_dagrun(
            run_id=run_id,
            execution_date=execution_date,
            state=DagRunState.QUEUED,
            conf=run_conf,
            external_trigger=recurve_external_trigger,
            dag_hash=dag_bag.dags_hash.get(dag_id),
            data_interval=data_interval,
        )
        dag_runs.append(dag_run)
    # recurve update end #

    return dag_runs
114
+
115
+
116
# Apply the patch at import time: from here on, every caller that goes through
# airflow.api.common.trigger_dag._trigger_dag uses the recurve implementation.
logger.info("monkey patch airflow.api.common.trigger_dag._trigger_dag")
airflow.api.common.trigger_dag._trigger_dag = _recurve_trigger_dag
@@ -0,0 +1,99 @@
1
+ import datetime
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from functools import cached_property
5
+ from typing import ClassVar, Optional
6
+
7
+ import pendulum
8
+
9
+ from recurvedata.schedulers.client import SchedulerClient
10
+ from recurvedata.schedulers.schemas import JobListResponse
11
+ from recurvedata.utils.dataclass import init_dataclass_from_dict
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class DagSchema:
    """Scheduler-agnostic description of one recurve job, built from the
    SchedulerClient job payload and consumed by SchedulerBase.create_dag."""

    id: int  # recurve job_id
    name: str  # recurve job_name
    project_id: int
    project_name: str
    workflow_id: int
    workflow_name: str
    workflow_version: str
    graph: list[tuple[str, str]]  # [(upstream_node_key, downstream_node_key),]
    nodes: list  # node payloads; element shape not visible here — see caller
    schedule_type: str
    schedule_interval: str
    timezone: str

    owner_username: str
    # scheduler_args: dict
    start_date: Optional[datetime.datetime] = None
    end_date: Optional[datetime.datetime] = None
    scheduler_settings: Optional[dict] = None
    retries: Optional[int] = None
    retry_delay: Optional[int] = None  # presumably seconds/minutes — confirm with scheduler code

    # attr for modeling pipeline
    skip_data_tests: bool = False

    @property
    def job_id(self):
        # Alias: the recurve job id is stored in `id`.
        return self.id
45
+
46
+
47
+ class SchedulerBase(object):
48
+ DEFAULT_DAG_OWNER: ClassVar[str] = "recurve"
49
+
50
    def __init__(self, sharding_size: int = 1, sharding_key: int = 0):
        """Create a scheduler bound to one shard of the job space.

        :param sharding_size: total number of scheduler shards
        :param sharding_key: this instance's shard index
        """
        self.sharding_size = sharding_size
        self.sharding_key = sharding_key
        self.client: SchedulerClient = self.init_client()
54
+
55
    @cached_property
    def localtz(self):  # todo: move to CONF
        # Hard-coded local timezone; cached so pendulum resolves it only once.
        return pendulum.timezone("Asia/Shanghai")
58
+
59
    @classmethod
    def init_client(cls) -> SchedulerClient:
        """Build the API client; override in subclasses to customize."""
        return SchedulerClient()
62
+
63
+ def list_scheduler_dag(self):
64
+ """
65
+ 从 sdk 获取符合条件的所有 dag 信息
66
+ :return:
67
+ """
68
+
69
+ jobs: JobListResponse = self.client.list_jobs(sharding_size=self.sharding_size, sharding_key=self.sharding_key)
70
+
71
+ for job in jobs.jobs:
72
+ dag = init_dataclass_from_dict(DagSchema, job.model_dump())
73
+ yield dag
74
+
75
+ def create_dag(self, row: DagSchema):
76
+ """
77
+ 生成对应调度器(airflow/...) 的对象
78
+ :param args:
79
+ :param kwargs:
80
+ :return:
81
+ """
82
+ try:
83
+ return self.create_dag_impl(row)
84
+ except Exception as e:
85
+ logger.exception(f"failed to generate dag {row.id}, %s", e)
86
+ return # todo: add new client api to notify
87
+
88
    def create_dag_impl(self, row: DagSchema):
        """Subclass hook: actually build the DAG for *row*. Default is a no-op."""
        pass
90
+
91
+ def execute(self, *args, **kwargs):
92
+ """
93
+ 入口
94
+ :param args:
95
+ :param kwargs:
96
+ :return:
97
+ """
98
+ for row in self.list_scheduler_dag():
99
+ pass