recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,228 @@
1
+ import json
2
+ import logging
3
+ from typing import Any
4
+
5
+ import dateutil
6
+ import typer
7
+ from airflow.models import DAG
8
+
9
+ from recurvedata.schedulers.airflow_db_process import AirflowDbService
10
+ from recurvedata.schedulers.schemas import WorkflowNodeDebugDetail
11
+ from recurvedata.schedulers.service import get_job_dag
12
+ from recurvedata.schedulers.task_status import TaskStatusScanner
13
+ from recurvedata.schedulers.utils import clear_task_instance, format_dag_id, init_client
14
+ from recurvedata.utils import init_logging
15
+ from recurvedata.utils._typer import RecurveTyper
16
+ from recurvedata.utils.date_time import to_local_datetime
17
+
18
+ logger = logging.getLogger(__name__)
19
+ cli = RecurveTyper()
20
+
21
+
22
def _ensure_dag_exists(job_id: int, raise_error=True) -> DAG | None:
    """Fetch the DAG generated for *job_id*.

    When the DAG is missing (job deleted), either raise a ValueError or —
    with raise_error=False — log and return None.
    """
    dag = get_job_dag(job_id)
    if dag:
        return dag
    # The job was deleted on the platform side, so no DAG can be built for it.
    if raise_error:
        raise ValueError(f"dag not exists: {job_id}")
    logger.info(f"dag missing for job {job_id}")
    return None
30
+
31
+
32
@cli.callback()
def init():
    """Typer app callback: configure logging before any sub-command runs."""
    init_logging()
35
+
36
+
37
@cli.command()
def update_dag(job_id: int = typer.Option(..., "--job_id")):
    """Refresh the Airflow-side record of a job's DAG."""
    # raise_error=False: a deleted job logs and yields None here.
    # NOTE(review): assumes AirflowDbService.update_dag tolerates dag=None — confirm.
    dag = _ensure_dag_exists(job_id, raise_error=False)
    AirflowDbService.update_dag(dag)
41
+
42
+
43
@cli.command()
def activate_dag(job_id: int = typer.Option(..., "--job_id")):
    """Activate (unpause) a job's DAG in Airflow."""
    # NOTE(review): assumes AirflowDbService.activate_dag tolerates dag=None — confirm.
    dag = _ensure_dag_exists(job_id, raise_error=False)
    AirflowDbService.activate_dag(dag)
47
+
48
+
49
@cli.command()
def deactivate_dag(job_id: int = typer.Option(..., "--job_id")):
    """Deactivate (pause) a job's DAG in Airflow."""
    # NOTE(review): assumes AirflowDbService.deactivate_dag tolerates dag=None — confirm.
    dag = _ensure_dag_exists(job_id, raise_error=False)
    AirflowDbService.deactivate_dag(dag)
53
+
54
+
55
@cli.command()
def delete_dag(job_id: int = typer.Option(..., "--job_id"), job_name: str = typer.Option(..., "--job_name")):
    """Delete a job's DAG from Airflow by its formatted dag_id."""
    dag_id = format_dag_id(job_id)
    AirflowDbService.delete_dag(dag_id, job_name)
59
+
60
+
61
@cli.command()
def clear(
    job_id: int = typer.Option(..., "--job_id"),
    node_key: str = typer.Option(..., "--node_key"),
    execution_date: str = typer.Option(..., "--execution_date", callback=to_local_datetime),
    only_failed: bool = typer.Option(False, "--only_failed"),
    including_downstream: bool = typer.Option(False, "--including_downstream"),
):
    """Clear the task instance of one node (optionally downstream too) so it re-runs."""
    # NOTE(review): unlike other commands, this does not go through
    # _ensure_dag_exists; assumes clear_task_instance tolerates dag=None — confirm.
    dag = get_job_dag(job_id)
    clear_task_instance(dag, node_key, execution_date, only_failed, including_downstream)
71
+
72
+
73
@cli.command()
def start_workflow_node_debug(
    workflow_id: int = typer.Option(..., "--workflow_id"),
    node_key: str = typer.Option(..., "--node_key"),
    schedule_type: str = typer.Option(..., "--schedule_type"),
    schedule_interval: str = typer.Option(..., "--schedule_interval"),
    execution_date: str = typer.Option(..., "--execution_date"),
    timezone: str = typer.Option(..., "--timezone"),
):
    """Enqueue a celery task that runs a single workflow node in debug mode."""
    # Imported lazily so the CLI does not pull in celery for other commands.
    from celery.result import AsyncResult

    from recurvedata.schedulers.debug_celery import debug_node

    celery_kwargs = dict(
        workflow_id=workflow_id,
        node_key=node_key,
        schedule_type=schedule_type,
        schedule_interval=schedule_interval,
        execution_date=execution_date,
        timezone=timezone,
    )
    async_result: AsyncResult = debug_node.apply_async(kwargs=celery_kwargs)
    logger.info(f"sent debug_node {celery_kwargs}, celery_id: {async_result.task_id}")
    return {"celery_task_id": async_result.task_id}
99
+
100
+
101
@cli.command()
def abort_workflow_node_debug(
    workflow_id: int = typer.Option(..., "--workflow_id"),
    node_key: str = typer.Option(..., "--node_key"),
    celery_task_id: str = typer.Option(None, "--celery_task_id"),
):
    """Revoke a running node-debug celery task.

    When no --celery_task_id is given, look it up from the server by
    (workflow_id, node_key); skip silently if none is found.
    """
    import recurvedata.schedulers.debug_celery

    if not celery_task_id:
        client = init_client()
        detail: WorkflowNodeDebugDetail = client.get_workflow_node_debug_detail(
            workflow_id=workflow_id, node_key=node_key
        )
        celery_task_id = detail.celery_task_id

    if not celery_task_id:
        # Nothing to revoke — the debug session was never started or already cleaned up.
        logger.info("skip revoke_debug, no celery_task_id found")
        return

    logger.info(f"start revoke debug: {workflow_id} {node_key} {celery_task_id}")
    recurvedata.schedulers.debug_celery.revoke_task(celery_task_id)
    logger.info(f"finish revoke debug: {workflow_id} {node_key} {celery_task_id}")
122
+
123
+
124
@cli.command()
def sync_task_status(interval: int = typer.Option(5, "--interval")):
    """Run the task-status scanner loop, pushing Airflow states to the server."""
    # NOTE(review): interval is presumably seconds between scans — confirm
    # against TaskStatusScanner.run.
    scanner = TaskStatusScanner()
    scanner.run(interval)
128
+
129
+
130
@cli.command()
def trigger_job_run(
    job_id: int = typer.Option(..., "--job_id"),
    execution_date: str = typer.Option(..., "--execution_date", callback=to_local_datetime),
    include_past: bool = typer.Option(False, "--include_past"),
    include_future: bool = typer.Option(False, "--include_future"),
    run_type: str = typer.Option(None, "--run_type"),
    conf: str = typer.Option(None, "--conf"),
):
    """Trigger a DAG run for a job; --conf is a JSON object passed to the run."""
    dag = _ensure_dag_exists(job_id)
    # Decode --conf only when provided; otherwise pass the original value through.
    parsed_conf: dict[str, Any] = json.loads(conf) if conf else conf
    AirflowDbService.trigger_job_run(dag, execution_date, include_past, include_future, run_type, parsed_conf)
143
+
144
+
145
@cli.command()
def rerun_job_run(
    job_id: int = typer.Option(..., "--job_id"),
    run_id: str = typer.Option(None, "--run_id"),
    min_execution_date: str = typer.Option(None, "--min_execution_date"),
    max_execution_date: str = typer.Option(None, "--max_execution_date"),
    failed_only: bool = typer.Option(False, "--failed_only"),
):
    """Re-run a job's DAG runs, selected by --run_id or an execution-date window."""
    # Fix: `import dateutil` alone does not load the `dateutil.parser` submodule,
    # so `dateutil.parser.parse(...)` could raise AttributeError; import it explicitly.
    from dateutil import parser as dateutil_parser

    dag = _ensure_dag_exists(job_id)
    if min_execution_date:
        min_execution_date = dateutil_parser.parse(min_execution_date)
    if max_execution_date:
        max_execution_date = dateutil_parser.parse(max_execution_date)
    AirflowDbService.rerun_job_run(dag, run_id, min_execution_date, max_execution_date, failed_only)
159
+
160
+
161
@cli.command()
def rerun_task_run(
    job_id: int = typer.Option(..., "--job_id"),
    run_id: str = typer.Option(None, "--run_id"),
    node_key: str = typer.Option(..., "--node_key"),
    include_upstream: bool = typer.Option(False, "--include_upstream"),
    include_downstream: bool = typer.Option(False, "--include_downstream"),
    min_execution_date: str = typer.Option(None, "--min_execution_date"),
    max_execution_date: str = typer.Option(None, "--max_execution_date"),
    failed_only: bool = typer.Option(False, "--failed_only"),
):
    """Re-run one node's task instances (optionally with up/downstream nodes)."""
    # Fix: `import dateutil` alone does not load the `dateutil.parser` submodule,
    # so `dateutil.parser.parse(...)` could raise AttributeError; import it explicitly.
    from dateutil import parser as dateutil_parser

    dag = _ensure_dag_exists(job_id)
    if min_execution_date:
        min_execution_date = dateutil_parser.parse(min_execution_date)
    if max_execution_date:
        max_execution_date = dateutil_parser.parse(max_execution_date)
    AirflowDbService.rerun_task_run(
        dag=dag,
        run_id=run_id,
        node_key=node_key,
        min_execution_date=min_execution_date,
        max_execution_date=max_execution_date,
        include_upstream=include_upstream,
        include_downstream=include_downstream,
        failed_only=failed_only,
    )
187
+
188
+
189
@cli.command()
def init_airflow_tables():
    """Initialize the Airflow metadata tables."""
    AirflowDbService.init_airflow_tables()
192
+
193
+
194
@cli.command()
def stop_dev_run(job_id: int = typer.Option(..., "--job_id")):
    """Stop a dev run by marking the whole DAG run as failed."""
    logger.info(f"start stop dev run job_id: {job_id}")
    dag = _ensure_dag_exists(job_id)

    AirflowDbService.mark_dag_run_failed(dag, whole_dag=True)
200
+
201
+
202
@cli.command()
def start_dev_run(
    job_id: int = typer.Option(..., "--job_id"),
    execution_date: str = typer.Option(..., "--execution_date", callback=to_local_datetime),
):
    """(Re)start a dev run: refresh the DAG, wipe old runs, activate, then trigger."""
    # Fix: log message was copy-pasted from stop_dev_run ("start stop dev run").
    logger.info(f"start dev run job_id: {job_id}")
    dag = _ensure_dag_exists(job_id)

    AirflowDbService.update_dag(dag)
    # AirflowDbService.mark_dag_run_failed(dag, whole_dag=True)
    # Clear previous dag-runs / task-instances so the dev run starts from scratch.
    AirflowDbService.delete_whole_dag_dr_ti(dag)
    AirflowDbService.activate_dag(dag)
    AirflowDbService.trigger_job_run(dag, execution_date, False, False, "manual")
215
+
216
+
217
@cli.command()
def terminate_task_run(
    job_id: int = typer.Option(..., "--job_id"),
    run_id: str = typer.Option(..., "--run_id"),
    node_key: str = typer.Option(..., "--node_key"),
):
    """Terminate one node's running task instance within a specific DAG run."""
    dag = _ensure_dag_exists(job_id)
    AirflowDbService.terminate_task_run(dag, run_id, node_key)
225
+
226
+
227
if __name__ == "__main__":
    # Allow invoking this module directly as the scheduler CLI.
    cli()
@@ -0,0 +1,56 @@
1
+ from recurvedata.client import Client
2
+ from recurvedata.schedulers.schemas import JobListResponse, TaskStatusCursor, WorkflowNodeDebugDetail
3
+ from recurvedata.utils import get_env_id
4
+
5
+
6
class SchedulerClient(Client):
    """HTTP client for the scheduler-facing endpoints of the Recurve server.

    Every call scopes its request to the current environment via get_env_id().
    """

    def list_jobs(self, sharding_size: int = None, sharding_key: int = None) -> JobListResponse:
        """List schedulable jobs, optionally restricted to one shard."""
        if not sharding_size:
            # No sharding requested: act as the single shard covering everything.
            sharding_size, sharding_key = 1, 0

        query = {
            "env_id": get_env_id(),
            "sharding_key": sharding_key,
            "sharding_size": sharding_size,
        }

        return self.request("GET", path="/api/scheduler/jobs", response_model_class=JobListResponse, params=query)

    def get_task_status_cursor(self) -> TaskStatusCursor:
        """Fetch the cursor marking how far task-status syncing has progressed."""
        query = {"env_id": get_env_id()}
        return self.request(
            "GET", path="/api/scheduler/task-status-cursor", response_model_class=TaskStatusCursor, params=query
        )

    def sync_task_status(self, job_runs: list[dict] | None = None, task_runs: list[dict] | None = None):
        """Push batches of job-run and task-run states to the server."""
        query = {"env_id": get_env_id()}
        body = {
            "job_runs": job_runs,
            "task_runs": task_runs,
        }
        return self.request("POST", path="/api/scheduler/sync-task-status", params=query, json=body)

    def get_workflow_node_debug_detail(self, workflow_id: int, node_key: str) -> WorkflowNodeDebugDetail:
        """Fetch debug-session details (e.g. celery task id) for one workflow node."""
        query = {
            "env_id": get_env_id(),
            "workflow_id": workflow_id,
            "node_key": node_key,
        }
        return self.request(
            "GET",
            path="/api/scheduler/workflow-node-debug-detail",
            response_model_class=WorkflowNodeDebugDetail,
            params=query,
        )

    def on_job_run_finished(self, job_run_result: dict):
        """Report a finished job run (final state plus per-task info) to the server."""
        query = {"env_id": get_env_id()}
        # Forward only the expected keys; a missing key raises KeyError, as before.
        fields = ("job_id", "run_id", "task_info_map", "state", "data_interval_end")
        body = {name: job_run_result[name] for name in fields}
        return self.request("POST", path="/api/scheduler/on-job-run-finished", params=query, json=body)
@@ -0,0 +1,52 @@
1
+ import re
2
+ from enum import Enum
3
+
4
+
5
class OperatorEnum(str, Enum):
    """Operator types a workflow node may use (str-valued for easy serialization).

    NOTE(review): duplicates the `Operator` enum defined below in this module —
    consider consolidating.
    """

    SQLOperator = "SQLOperator"
    TransferOperator = "TransferOperator"
    PythonOperator = "PythonOperator"
    SparkOperator = "SparkOperator"
    NotifyOperator = "NotifyOperator"
    LinkOperator = "LinkOperator"
12
+
13
+
14
# Working directory used by the scheduler runtime.
WORK_DIR = "/opt/airflow"  # todo: use /opt/recurve
15
+
16
+
17
class Operator(str, Enum):  # todo
    """Operator types (str-valued); same members as OperatorEnum above."""

    SQLOperator = "SQLOperator"
    TransferOperator = "TransferOperator"
    PythonOperator = "PythonOperator"
    SparkOperator = "SparkOperator"
    NotifyOperator = "NotifyOperator"
    LinkOperator = "LinkOperator"
24
+
25
+
26
class ScheduleType(str, Enum):
    """How a job's runs are scheduled."""

    crontab = "crontab"
    customization = "customization"  # quick-setup schedule presets
    manual = "manual"  # triggered manually
30
+
31
+
32
# dag_id of the built-in system DAG that syncs task statuses to the server.
SYSTEM_SYNC_STATUS_DAG_ID = "system_sync_status"
33
+
34
+
35
def format_recurve_env_key(key: str) -> str:
    """Map a config key to its RECURVE__-prefixed environment-variable name."""
    return "RECURVE__{}".format(key.upper())
37
+
38
+
39
def get_dag_file_loc(job_id: int) -> str:
    """Return the autogen DAG file a job is sharded into (7 shard files)."""
    # todo: configuration
    shard = job_id % 7
    return f"/opt/airflow/dags/autogen_sharding_{shard}.py"
43
+
44
+
45
# Default retry policy applied to scheduled tasks.
DEFAULT_RETRY_NUMBER = 2
DEFAULT_RETRY_DELAY = 60 * 5  # 5 minutes
47
+
48
+
49
def is_dev_run_job(job_name: str) -> bool:
    """Return True when *job_name* looks like a dev-run job name.

    Expected shape: ``dev_run_<something>_<digits>_<digits>``.
    NOTE(review): re.match only anchors at the start, so trailing extra text
    still matches — confirm whether fullmatch was intended.
    """
    return re.match(r"dev_run_.*_\d+_\d+", job_name) is not None
@@ -0,0 +1,62 @@
1
+ import logging
2
+ import os
3
+
4
+ from airflow.providers.celery.executors.celery_executor import app as celery_app
5
+ from celery import Task
6
+
7
+ from recurvedata.executors.client import ExecutorClient
8
+ from recurvedata.operators.config import CONF
9
+ from recurvedata.utils.mp import run_subprocess
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@celery_app.task(bind=True)
def debug_node(
    self: Task,
    workflow_id: int,
    node_key: str,
    schedule_type: str,
    schedule_interval: str,
    execution_date: str,
    timezone: str,
):
    """Celery task: run a single workflow node in debug mode via a subprocess.

    Reports debug start/end to the executor service; a failing subprocess is
    logged and reported as unsuccessful, not re-raised.
    """
    task_id = self.request.id
    # Fix: was logging.info (root logger), inconsistent with the module logger
    # used everywhere else in this file.
    logger.info(
        f"start {task_id} {workflow_id} {node_key} {schedule_type} {schedule_interval} {execution_date} {timezone}"
    )

    executor_client = ExecutorClient()

    executor_client.debug_start(workflow_id, node_key, task_id)
    try:
        # Run the executor CLI in its own interpreter environment so the debug
        # run is isolated from the celery worker process.
        run_subprocess(
            [
                os.path.join(CONF.RECURVE_EXECUTOR_PYENV_BIN_PATH, "recurve_executor"),
                "debug",
                "--workflow_id",
                f"{workflow_id}",
                "--node_key",
                f"{node_key}",
                "--schedule_type",
                schedule_type,
                "--schedule_interval",
                schedule_interval,
                "--execution_date",
                execution_date,
                "--timezone",
                timezone,
                "--celery_task_id",
                task_id,
            ],
            env=os.environ.copy(),
        )
        is_success = True
    except Exception as e:
        logger.exception(f"{workflow_id} {node_key} {execution_date} debug failed, err: {e}")
        is_success = False
    executor_client.debug_end(workflow_id, node_key, task_id, is_success)
59
+
60
+
61
def revoke_task(task_id: str = None, terminate=True):
    """Revoke a celery task; terminate=True also stops a task already executing."""
    return celery_app.control.revoke(task_id, terminate=terminate)
@@ -0,0 +1,63 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from recurvedata.operators.models import DagBase, NodeBase
5
+
6
+
7
@dataclass
class SchedulerDag(DagBase):
    """Scheduler-side DAG object."""

    # Version identifier of the workflow this DAG was generated from;
    # None when no version is associated.
    workflow_version: str | None = None
10
+
11
+
12
@dataclass
class SchedulerNode(NodeBase):
    """
    Node object for the scheduler.
    """

    # Operator name that identifies how this node is executed.
    operator: str

    # Per-node scheduler overrides; None means use defaults.
    scheduler_settings: Optional[dict] = None
    # Skip flags / latest-only flag; None means "not specified".
    skip_self: Optional[bool] = None
    skip_downstream: Optional[bool] = None
    latest_only: Optional[bool] = None
24
+
25
+
26
@dataclass
class LinkNodeItem:
    """
    The node linked by LinkOperator.

    Describes a node in another workflow that is referenced ("linked")
    from the current workflow.
    """

    link_wf_id: int
    link_wf_version: str
    link_node_id: int
    link_node_name: str
    link_node_key: str
    link_latest_only: bool
    link_operator: str
    link_skip_downstream: bool
    link_skip_self: bool
    # Fix: these fields default to None, so their annotations must be
    # Optional (previously bare `dict = None` / `int = None`).
    link_scheduler_settings: Optional[dict] = None
    link_config: Optional[dict] = None  # used in CustomAirflowOperator
    node_id: Optional[int] = None
    plan_id: Optional[int] = None

    @property
    def config(self):
        # for CustomAirflowOperator
        return self.link_config
50
+
51
+
52
@dataclass
class LinkWorkflowItem:
    """
    LinkOperator - link workflow.

    The workflow referenced by a LinkOperator node, together with its
    internal graph and nodes.
    """

    node_id: int
    link_wf_id: int
    link_wf_name: str
    link_wf_version: str
    # [(upstream_node_key, downstream_node_key), ...]
    link_graph: list[tuple[str, str]] = field(default_factory=list)
    # Fix: defaults to None, so the annotation must be Optional
    # (previously bare `list[LinkNodeItem] = None`).
    link_nodes: Optional[list[LinkNodeItem]] = None
@@ -0,0 +1,97 @@
1
+ import datetime
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from recurvedata.consts import Operator, ScheduleType
6
+
7
+
8
class SchedulerLinkWorkflowNodeItem(BaseModel):
    """A node of a linked workflow, as returned by the scheduler job-list API."""

    link_wf_id: int
    link_wf_version: str
    link_node_id: int
    link_node_key: str
    link_node_name: str
    link_operator: Operator
    link_skip_self: bool
    link_skip_downstream: bool
    link_latest_only: bool
    # Required but nullable: must be provided explicitly, possibly as None.
    link_scheduler_settings: dict | None
    plan_id: int | None = None
20
+
21
+
22
class SchedulerLinkNodeItem(BaseModel):
    """A link node in the current workflow pointing at a node of another workflow."""

    # id of the local node holding the link.
    node_id: int
    link_wf_id: int
    link_wf_version: str
    link_node_id: int
    link_node_key: str
    link_node_name: str
    link_operator: Operator
    link_skip_self: bool
    link_skip_downstream: bool
    link_latest_only: bool
    # Required but nullable: must be provided explicitly, possibly as None.
    link_scheduler_settings: dict | None
34
+
35
+
36
class SchedulerNodeItem(BaseModel):
    """A single schedulable node of a job/workflow."""

    id: int
    node_key: str
    name: str
    operator: Operator
    # Required but nullable: per-node scheduler overrides.
    scheduler_settings: dict | None
    skip_self: bool
    skip_downstream: bool
    latest_only: bool
45
+
46
+
47
class JobItem(BaseModel):
    """A schedulable job (one workflow version) returned by the job-list API."""

    id: int
    name: str
    schedule_type: ScheduleType
    schedule_interval: str
    timezone: str | None
    workflow_version: str
    start_date: datetime.datetime | None
    end_date: datetime.datetime | None
    scheduler_settings: dict | None
    owner_username: str

    nodes: list[SchedulerNodeItem]

    graph: list
    # Fix: these fields default to None, so they must be declared nullable;
    # the previous `int = None` / `str = None` style rejects an explicit None.
    project_id: int | None = None
    project_name: str | None = None
    workflow_id: int | None = None
    workflow_name: str | None = None

    skip_data_tests: bool = False
    retries: int | None = None
    retry_delay: int | None = None
70
+
71
+
72
class SchedulerLinkWorkflowItem(BaseModel):
    """A linked workflow (LinkOperator target) with its nodes and graph."""

    # id of the local node holding the link.
    node_id: int
    link_wf_id: int
    link_wf_name: str
    link_wf_version: str
    link_nodes: list[SchedulerLinkWorkflowNodeItem]
    # Edges of the linked workflow's graph.
    link_graph: list
79
+
80
+
81
class JobListResponse(BaseModel):
    """Response payload of the scheduler job-list endpoint."""

    jobs: list[JobItem]
    link_nodes: list[SchedulerLinkNodeItem]
    link_workflows: list[SchedulerLinkWorkflowItem]
85
+
86
+
87
class TaskStatusCursor(BaseModel):
    """Cursor state for incrementally syncing job/task run statuses."""

    # High-water marks of the last synced job run / task run timestamps.
    job_run: datetime.datetime | None = None
    task_run: datetime.datetime | None = None
    # Page size per sync iteration.
    limit: int = 30
    # Sliding window size; presumably in hours or minutes — TODO confirm unit.
    sliding_time: int = 1
    # Runs seen but not yet in a terminal state, keyed by run id — verify against caller.
    unfinished: dict | None = None
93
+
94
+
95
class WorkflowNodeDebugDetail(BaseModel):
    """Status of a node debug session driven by a Celery task."""

    celery_task_id: str | None = None
    state: str | None = None
@@ -0,0 +1,20 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ from airflow.models import DAG
5
+
6
+ from recurvedata.schedulers.airflow import AirflowScheduler
7
+ from recurvedata.schedulers.consts import get_dag_file_loc
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def get_job_dag(job_id: int) -> Optional["DAG"]:
    """Build and return the Airflow DAG for a single job, or None if absent."""
    # NOTE(review): job_id is passed as sharding_size (with sharding_key=0),
    # apparently to make the scheduler produce just this job — confirm.
    scheduler = AirflowScheduler(sharding_size=job_id, sharding_key=0)
    dags_by_id = scheduler.execute()
    if not dags_by_id:
        return None
    # Take the first (and only expected) DAG from the generated mapping.
    first_dag_id = next(iter(dags_by_id))
    dag = dags_by_id[first_dag_id]
    dag.fileloc = get_dag_file_loc(job_id)
    return dag
@@ -0,0 +1,59 @@
1
+ import os
2
+ from datetime import datetime, timedelta
3
+
4
+ from airflow import DAG
5
+ from airflow.operators.bash import BashOperator
6
+
7
+ from recurvedata.schedulers.consts import SYSTEM_SYNC_STATUS_DAG_ID
8
+
9
+
10
def create_system_dags():
    """Return all built-in system DAGs."""
    system_dags = [create_sync_status_dag()]
    return system_dags
14
+
15
+
16
+ def _prepare_bash_env():
17
+ dct = {}
18
+ for key, val in os.environ.items():
19
+ if key.startswith("RECURVE__"):
20
+ dct[key] = val
21
+ elif key.startswith("AIRFLOW"):
22
+ dct[key] = val
23
+ elif key in (
24
+ "PATH",
25
+ "PYENV_ROOT",
26
+ ):
27
+ dct[key] = os.environ[key]
28
+ return dct
29
+
30
+
31
def create_sync_status_dag():
    """Build the system DAG that periodically syncs task status into the DB."""
    default_args = {
        "depends_on_past": False,
        "retries": 150,
        "retry_delay": timedelta(seconds=10),
        "priority_weight": 100,
        "retry_exponential_backoff": True,
        "max_retry_delay": timedelta(seconds=30),
    }
    sync_dag = DAG(
        SYSTEM_SYNC_STATUS_DAG_ID,
        default_args=default_args,
        description="A DAG to sync db status",
        schedule_interval="0 */6 * * *",  # Run every 6 hours
        start_date=datetime(2024, 8, 5),
        catchup=False,
        dagrun_timeout=timedelta(minutes=60 * 6),
        max_active_runs=1,  # todo: retry may delay the future dag_run
        is_paused_upon_creation=False,
    )

    # Single task: shell out to the scheduler CLI with a filtered environment.
    BashOperator(
        task_id="sync_status",
        bash_command="recurve_scheduler sync-task-status --interval=5",
        dag=sync_dag,
        env=_prepare_bash_env(),
    )
    return sync_dag