recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/schedulers/task_status.py ADDED
@@ -0,0 +1,279 @@
+ import datetime
+ import logging
+ import signal
+ import sys
+ import time
+
+ import pytz
+ from airflow.models import DagRun, TaskInstance
+ from airflow.models.taskinstancehistory import TaskInstanceHistory
+ from airflow.utils.session import create_session
+ from sqlalchemy import and_, func, tuple_
+ from sqlalchemy.orm import joinedload
+
+ from recurvedata.config import AgentConfig
+ from recurvedata.schedulers.client import SchedulerClient
+ from recurvedata.schedulers.consts import SYSTEM_SYNC_STATUS_DAG_ID
+
+ logger = logging.getLogger(__name__)
+
+
+ class TaskStatusScanner:
+     def __init__(self):
+         config = AgentConfig.load()
+         if config.request_timeout < 30:
+             config.request_timeout = 30
+         self.client = SchedulerClient(config)
+         self._running = False
+
+     def run(self, interval: int):
+         def signal_handler(_sig, _frame):
+             self._running = False
+
+         def handle_sigterm(_sig, _frame):
+             self._running = False
+             sys.exit(0)
+
+         signal.signal(signal.SIGINT, signal_handler)
+         signal.signal(signal.SIGTERM, handle_sigterm)
+
+         self._running = True
+         step = interval
+
+         while self._running:
+             if step >= interval:
+                 status_cursor = self.client.get_task_status_cursor()
+
+                 job_runs = {}
+                 task_runs = {}
+
+                 def _collect_job_runs(_job_runs):
+                     for jr in _job_runs:
+                         job_runs[(str(jr["job_id"]), jr["run_id"])] = jr
+
+                 def _collect_task_runs(_task_runs):
+                     for tr in _task_runs:
+                         task_runs[(tr["job_id"], tr["run_id"], tr["node_key"])] = tr
+
+                 _job_runs = self.scan_dag_runs(
+                     self._localize_time(status_cursor.job_run),
+                     status_cursor.limit,
+                     sliding_time=status_cursor.sliding_time,
+                 )
+                 _collect_job_runs(_job_runs)
+
+                 _task_runs, _job_runs = self.scan_task_instances(
+                     self._localize_time(status_cursor.task_run),
+                     status_cursor.limit,
+                     sliding_time=status_cursor.sliding_time,
+                 )
+                 _collect_job_runs(_job_runs)
+                 _collect_task_runs(_task_runs)
+
+                 _task_runs, _job_runs = self.scan_unfinished_task_instances(status_cursor.unfinished)
+                 _collect_job_runs(_job_runs)
+                 _collect_task_runs(_task_runs)
+
+                 logger.info(f"number of job runs: {len(job_runs)}")
+                 logger.info(f"number of task runs: {len(task_runs)}")
+
+                 # get actual job start time
+                 filters = list(job_runs.keys())
+
+                 if filters:
+                     with create_session() as session:
+                         query = (
+                             session.query(
+                                 func.min(TaskInstance.start_date).label("start_time"),
+                                 TaskInstance.dag_id,
+                                 TaskInstance.run_id,
+                                 TaskInstance.try_number,
+                             )
+                             .filter(tuple_(TaskInstance.dag_id, TaskInstance.run_id).in_(filters))
+                             .group_by(TaskInstance.dag_id, TaskInstance.run_id, TaskInstance.try_number)
+                         )
+
+                         # retried job runs
+                         history_filters = []
+                         for row in query.all():
+                             if row.try_number > 1:
+                                 history_filters.append((str(row.dag_id), row.run_id))
+                                 continue
+
+                             job_runs[(row.dag_id, row.run_id)]["start_time"] = (
+                                 row.start_time and row.start_time.isoformat()
+                             )
+
+                         if history_filters:
+                             query = (
+                                 session.query(
+                                     func.min(TaskInstanceHistory.start_date).label("start_time"),
+                                     TaskInstanceHistory.dag_id,
+                                     TaskInstanceHistory.run_id,
+                                 )
+                                 .filter(
+                                     tuple_(TaskInstanceHistory.dag_id, TaskInstanceHistory.run_id).in_(history_filters)
+                                 )
+                                 .group_by(TaskInstanceHistory.dag_id, TaskInstanceHistory.run_id)
+                             )
+
+                             for row in query.all():
+                                 job_runs[(row.dag_id, row.run_id)]["start_time"] = (
+                                     row.start_time and row.start_time.isoformat()
+                                 )
+
+                 self.client.sync_task_status(job_runs=list(job_runs.values()), task_runs=list(task_runs.values()))
+                 step = 0
+             time.sleep(1.0)
+             step += 1
+
+     def _localize_time(self, time: datetime.datetime | None):
+         if time is not None and time.tzinfo is None:
+             timezone = pytz.timezone("UTC")
+             time = timezone.localize(time)
+         return time
+
+     def _parse_job_id(self, dag_id: str):
+         return int(dag_id.split(".")[-1])
+
+     def _sliding_time_query(self, session, model, last_updated_time, sliding_time):
+         query = session.query(model).where(model.dag_id != SYSTEM_SYNC_STATUS_DAG_ID)
+         query = query.where(
+             and_(
+                 model.updated_at >= (last_updated_time - datetime.timedelta(seconds=sliding_time)),
+                 model.updated_at < last_updated_time,
+             )
+         )
+         return query.order_by(model.updated_at.asc())
+
+     def _limit_query(self, session, model, last_updated_time, limit):
+         query = session.query(model).where(model.dag_id != SYSTEM_SYNC_STATUS_DAG_ID)
+         if last_updated_time is not None:
+             query = query.where(model.updated_at >= last_updated_time)
+         return query.order_by(model.updated_at.asc()).limit(limit)
+
+     def _format_job_run(self, dr: DagRun, workflow_version: str):
+         return dict(
+             job_id=self._parse_job_id(dr.dag_id),
+             run_id=dr.run_id,
+             state=dr.state,
+             start_time=dr.start_date and dr.start_date.isoformat(),
+             end_time=dr.end_date and dr.end_date.isoformat(),
+             execution_date=dr.execution_date and dr.execution_date.isoformat(),
+             workflow_version=workflow_version,
+             airflow_updated_at=dr.updated_at and dr.updated_at.isoformat(),
+             run_type=None,
+             data_interval_end=dr.data_interval_end and dr.data_interval_end.isoformat(),
+         )
+
+     def scan_dag_runs(self, last_updated_time: datetime.datetime | None, limit: int, sliding_time: int = 0):
+         job_runs = []
+         workflow_version_map = {}
+
+         with create_session() as session:
+             dag_runs: list[DagRun] = []
+             if last_updated_time and sliding_time > 0:
+                 dag_runs.extend(self._sliding_time_query(session, DagRun, last_updated_time, sliding_time).all())
+             dag_runs.extend(self._limit_query(session, DagRun, last_updated_time, limit).all())
+
+             for dr in dag_runs:
+                 workflow_version = workflow_version_map.get((dr.dag_id, dr.run_id))
+                 if workflow_version is None:
+                     ti = (
+                         session.query(TaskInstance)
+                         .filter(TaskInstance.dag_id == dr.dag_id, TaskInstance.run_id == dr.run_id)
+                         .first()
+                     )
+                     workflow_version = ti and ti.executor_config.get("workflow_version")
+                     if workflow_version is not None:
+                         workflow_version_map[(dr.dag_id, dr.run_id)] = workflow_version
+                 job_runs.append(self._format_job_run(dr, workflow_version))
+         return job_runs
+
+     def _format_task_run(self, ti: TaskInstance):
+         return dict(
+             job_id=self._parse_job_id(ti.dag_id),
+             run_id=ti.run_id,
+             node_key=ti.task_id,
+             state=ti.state,
+             try_number=ti._try_number,
+             start_time=ti.start_date and ti.start_date.isoformat(),
+             end_time=ti.end_date and ti.end_date.isoformat(),
+             execution_date=ti.execution_date and ti.execution_date.isoformat(),
+             workflow_version=ti.executor_config.get("workflow_version"),
+             operator=ti.executor_config.get("operator"),
+             task=ti.executor_config.get("task"),
+             link_workflow_id=ti.executor_config.get("link_workflow_id"),
+             link_workflow_version=ti.executor_config.get("link_workflow_version"),
+             airflow_updated_at=ti.updated_at and ti.updated_at.isoformat(),
+         )
+
+     def scan_task_instances(self, last_updated_time: datetime.datetime | None, limit: int, sliding_time: int = 0):
+         dag_runs = {}
+         task_runs = []
+
+         with create_session() as session:
+             tis: list[TaskInstance] = []
+             if last_updated_time and sliding_time > 0:
+                 tis.extend(
+                     self._sliding_time_query(session, TaskInstance, last_updated_time, sliding_time)
+                     .options(joinedload(TaskInstance.dag_run))
+                     .all()
+                 )
+             tis.extend(
+                 self._limit_query(session, TaskInstance, last_updated_time, limit)
+                 .options(joinedload(TaskInstance.dag_run))
+                 .all()
+             )
+
+             for ti in tis:
+                 dag_runs[(ti.dag_run.dag_id, ti.dag_run.run_id)] = (
+                     ti.dag_run,
+                     ti.executor_config.get("workflow_version"),
+                 )
+                 task_runs.append(self._format_task_run(ti))
+         return task_runs, [self._format_job_run(dr, workflow_version) for dr, workflow_version in dag_runs.values()]
+
+     def scan_unfinished_task_instances(self, data: dict | None):
+         if not data:
+             return [], []
+
+         dag_ids = set()
+         task_ids = set()
+         run_ids = set()
+         for dag_id, item in data.items():
+             dag_ids.add(dag_id)
+             for task_id, _run_ids in item.items():
+                 task_ids.add(task_id)
+                 for run_id in _run_ids:
+                     run_ids.add(run_id)
+
+         dag_runs = {}
+         task_runs = []
+
+         with create_session() as session:
+             criterion = []
+             if dag_ids:
+                 criterion.append(TaskInstance.dag_id.in_(dag_ids))
+             if task_ids:
+                 criterion.append(TaskInstance.task_id.in_(task_ids))
+             if run_ids:
+                 criterion.append(TaskInstance.run_id.in_(run_ids))
+
+             tis: list[TaskInstance] = (
+                 session.query(TaskInstance).where(*criterion).options(joinedload(TaskInstance.dag_run)).all()
+             )
+
+             for ti in tis:
+                 if (
+                     ti.dag_id not in data
+                     or ti.task_id not in data[ti.dag_id]
+                     or ti.run_id not in data[ti.dag_id][ti.task_id]
+                 ):
+                     continue
+                 dag_runs[(ti.dag_run.dag_id, ti.dag_run.run_id)] = (
+                     ti.dag_run,
+                     ti.executor_config.get("workflow_version"),
+                 )
+                 task_runs.append(self._format_task_run(ti))
+         return task_runs, [self._format_job_run(dr, workflow_version) for dr, workflow_version in dag_runs.values()]
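
For orientation, a minimal sketch of how TaskStatusScanner might be driven; the module path follows the file list above, while the 60-second interval and the __main__ entry point are illustrative assumptions, not anything shipped in the wheel.

# Illustrative sketch only: start the scanner in a long-running worker process.
# The 60-second interval is an assumed value, not a package default.
from recurvedata.schedulers.task_status import TaskStatusScanner

if __name__ == "__main__":
    scanner = TaskStatusScanner()  # loads AgentConfig and builds a SchedulerClient
    scanner.run(interval=60)       # scans Airflow DagRun/TaskInstance state roughly once a minute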
recurvedata/schedulers/utils.py ADDED
@@ -0,0 +1,73 @@
+ import datetime
+ import json
+ import logging
+ from typing import Generator
+
+ from airflow.models import DAG, BaseOperator
+ from airflow.utils.session import create_session
+ from slugify import slugify
+
+ from recurvedata.schedulers.client import SchedulerClient
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_tasks(dag: DAG, recurve_node_key: str = None) -> Generator[BaseOperator, None, None]:
+     if recurve_node_key:
+         for task in dag.task_dict.values():
+             if task.doc_json and json.loads(task.doc_json).get("recurve_node_key") == recurve_node_key:
+                 yield task
+
+
+ def clear_task_instance(
+     dag: DAG,
+     recurve_node_key: str,
+     execution_date: datetime.datetime,
+     only_failed: bool = False,
+     including_downstream: bool = False,
+ ):
+     clear_task_ids: list[str] = []
+     for task in get_tasks(dag, recurve_node_key):
+         clear_task_ids.append(task.task_id)
+     airflow_execution_date = dag.previous_schedule(execution_date)  # todo: timezone
+     with create_session() as session:
+         dag = dag.partial_subset(task_ids_or_regex=clear_task_ids, include_downstream=including_downstream)
+         clear_task_ids = [tid for tid in dag.task_dict]
+         logger.info(f"prepare to clear dag: {dag.dag_id}, {clear_task_ids} execution_date: {airflow_execution_date}")
+
+         clear_cnt = dag.clear(
+             task_ids=clear_task_ids,
+             start_date=airflow_execution_date,
+             end_date=airflow_execution_date,
+             only_failed=only_failed,
+             session=session,
+         )
+         logger.info(
+             f"finish clear dag: {dag.dag_id}, {clear_task_ids} execution_date: {airflow_execution_date}, total clear: {clear_cnt} task_instances"
+         )
+
+
+ def slugify_text(s: str) -> str:
+     """A simple wrapper around python-slugify, using a custom regex_pattern to keep `.` and `_` as-is.
+
+     >>> slugify_text('我是谁')
+     'wo-shi-shui'
+     >>> slugify_text('load_fact_user_stats')
+     'load_fact_user_stats'
+     >>> slugify_text('tidb prepare category tables')
+     'tidb-prepare-category-tables'
+     >>> slugify_text('estimate daily deal 2017.10.20')
+     'estimate-daily-deal-2017.10.20'
+     """
+     return slugify(s, regex_pattern=r"[^-a-zA-Z0-9\._]+")
+
+
+ def format_dag_id(job_id: int) -> str:
+     """
+     Please do not adjust this function.
+     """
+     return str(job_id)
+
+
+ def init_client() -> SchedulerClient:
+     return SchedulerClient()
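
A hedged usage sketch of clear_task_instance: the DAG id, node key, and date below are invented, and loading the DAG through Airflow's DagBag is an assumption about how a caller would obtain the DAG object.

# Illustrative sketch only: clear the task instances behind one Recurve node.
# The job id 12345 and the node key are made-up example values.
import datetime

from airflow.models import DagBag

from recurvedata.schedulers.utils import clear_task_instance, format_dag_id

dag = DagBag().get_dag(format_dag_id(12345))  # format_dag_id(job_id) is simply str(job_id)
clear_task_instance(
    dag,
    recurve_node_key="load_fact_user_stats",
    execution_date=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
    only_failed=True,
    including_downstream=True,
)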
recurvedata/schema/__init__.py ADDED
File without changes
recurvedata/schema/field.py ADDED
@@ -0,0 +1,88 @@
+ import dataclasses
+ import datetime
+ from typing import Any, Callable, Optional
+
+ import dateutil.parser
+
+ from recurvedata.schema.types import DataType
+ from recurvedata.utils import json
+ from recurvedata.utils.registry import GenericRegistry
+
+ _registry = GenericRegistry[DataType, Callable[[str], Any]]()
+
+
+ @_registry.add(DataType.INT8, DataType.INT16, DataType.INT32, DataType.INT64)
+ def _(value: str) -> int:
+     if value == "":
+         return 0
+     return int(value)
+
+
+ @_registry.add(DataType.FLOAT32, DataType.FLOAT64)
+ def _(value: str) -> float:
+     if value == "":
+         return 0.0
+     return float(value)
+
+
+ @_registry.add(DataType.BOOLEAN)
+ def _(value: str) -> bool:
+     if value.lower() in ("", "0", "false"):
+         return False
+     return True
+
+
+ @_registry.add(DataType.DATETIME)
+ def _(value: str) -> Optional[datetime.datetime]:
+     if value == "":
+         return None
+     return dateutil.parser.parse(value)
+
+
+ @_registry.add(DataType.DATE)
+ def _(value: str) -> Optional[datetime.date]:
+     if value == "":
+         return None
+     return dateutil.parser.parse(value).date()
+
+
+ @_registry.add(DataType.JSON)
+ def _(value: str) -> Any:
+     if value in ("",):
+         # Normally an empty string should not appear here; it most likely comes from an empty cell in a CSV file, so treat it as None
+         return None
+     return json.loads(value)
+
+
+ @dataclasses.dataclass
+ class Field:
+     name: str
+     type: DataType
+     size: int = None
+     comment: str = None
+     extra: dict = None
+
+     def __post_init__(self):
+         self._cast_func: Callable[[str], Any] = _registry.get(self.type, lambda x: x)
+
+     def cast(self, value: Optional[str]) -> Any:
+         if value is None:
+             return None
+         if value == "NULL":
+             return None
+         return self._cast_func(value)
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "name": self.name,
+             "type": self.type,
+             "size": self.size,
+             "comment": self.comment,
+             "extra": self.extra,
+         }
+
+     def __str__(self):
+         return f'<Field ("{self.name}", "{self.type}")>'
+
+     def __repr__(self):
+         return f'<Field ("{self.name}", "{self.type}")>'
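
A minimal sketch of the Field.cast behaviour defined above; the field names and values are arbitrary examples.

# Illustrative sketch only: casting CSV-style string values through Field.
from recurvedata.schema.field import Field
from recurvedata.schema.types import DataType

age = Field(name="age", type=DataType.INT32)
signup = Field(name="signup_date", type=DataType.DATE)

age.cast("42")             # -> 42
age.cast("")               # -> 0, empty strings fall back to 0 for integer types
age.cast("NULL")           # -> None, the literal "NULL" is always treated as missing
signup.cast("2024-01-31")  # -> datetime.date(2024, 1, 31)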
recurvedata/schema/schema.py ADDED
@@ -0,0 +1,55 @@
+ import dataclasses
+ from typing import Any
+
+ from recurvedata.schema.field import Field
+ from recurvedata.schema.types import DataType
+ from recurvedata.utils import json
+
+
+ @dataclasses.dataclass
+ class Schema:
+     fields: list[Field] = dataclasses.field(default_factory=list)
+
+     def add_field(self, field: Field):
+         if field.name in self.field_names:
+             raise ValueError(f"Field name {field.name} already exists")
+         self.fields.append(field)
+
+     def add_field_by_attrs(
+         self,
+         name: str,
+         type: DataType,
+         size: int = None,
+         comment: str = None,
+         extra: dict = None,
+     ):
+         self.add_field(Field(name, type, size, comment, extra))
+
+     def remove_field(self, name: str):
+         self.fields = [x for x in self.fields if x.name != name]
+
+     def keep_fields(self, names: list[str]):
+         self.fields = [x for x in self.fields if x.name in names]
+
+     @property
+     def field_names(self) -> list[str]:
+         return [x.name for x in self.fields]
+
+     def __iter__(self):
+         return iter(self.fields)
+
+     def to_list(self) -> list[dict[str, Any]]:
+         return [x.to_dict() for x in self.fields]
+
+     def to_json(self, **kwargs) -> str:
+         return json.dumps(self.to_list(), **kwargs)
+
+     def dump(self, filename: str):
+         with open(filename, "w") as f:
+             f.write(self.to_json(indent=2))
+
+     @classmethod
+     def load(cls, filename: str) -> "Schema":
+         with open(filename) as f:
+             data = json.loads(f.read())
+         return cls([Field(**item) for item in data])
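
A short sketch of building a Schema and round-tripping it through JSON, assuming only the classes shown in this and the previous hunk; the field names and file path are arbitrary examples.

# Illustrative sketch only: assemble a Schema, serialize it, and load it back.
from recurvedata.schema.field import Field
from recurvedata.schema.schema import Schema
from recurvedata.schema.types import DataType

schema = Schema()
schema.add_field(Field("id", DataType.INT64))
schema.add_field_by_attrs("name", DataType.STRING, comment="display name")

schema.dump("/tmp/user_schema.json")             # writes the field list as indented JSON
restored = Schema.load("/tmp/user_schema.json")  # rebuilds Field objects from the file
assert restored.field_names == ["id", "name"]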
recurvedata/schema/types.py ADDED
@@ -0,0 +1,17 @@
+ import enum
+
+
+ class DataType(str, enum.Enum):
+     INT8 = "INT8"  # 1-byte (8-bit) signed integers
+     INT16 = "INT16"  # 2-byte (16-bit) signed integers
+     INT32 = "INT32"  # 4-byte (32-bit) signed integers
+     INT64 = "INT64"  # 8-byte (64-bit) signed integers
+     FLOAT32 = "FLOAT32"  # 4-byte (32-bit) single-precision floating point
+     FLOAT64 = "FLOAT64"  # 8-byte (64-bit) double-precision floating point
+     BOOLEAN = "BOOLEAN"
+
+     DATETIME = "DATETIME"
+     DATE = "DATE"
+
+     STRING = "STRING"
+     JSON = "JSON"
recurvedata/schema.py ADDED
File without changes
recurvedata/server/__init__.py ADDED
File without changes
recurvedata/server/app.py ADDED
@@ -0,0 +1,7 @@
+ from recurvedata.executors.utils import patch_pandas_mysql_connector_cext_missing
+
+ patch_pandas_mysql_connector_cext_missing()
+
+ from recurvedata.server.main import create_app  # noqa: E402
+
+ app = create_app()
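
A hedged note on how this module could be served locally; running it under uvicorn with this host and port is an assumption for illustration, not a documented entry point of the package.

# Illustrative sketch only: serve the ASGI app built by create_app(); host and port are assumed values.
import uvicorn

from recurvedata.server.app import app

uvicorn.run(app, host="127.0.0.1", port=8000)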
recurvedata/server/connector/__init__.py ADDED
File without changes
recurvedata/server/connector/api.py ADDED
@@ -0,0 +1,79 @@
+ from fastapi import APIRouter
+ from loguru import logger
+
+ from recurvedata.core.tracing import Tracing
+ from recurvedata.executors.schemas import (
+     ColumnListPayload,
+     ConnectionRuntimePayload,
+     ResponseModel,
+     TableListPayload,
+     TestConnectionPayload,
+ )
+ from recurvedata.executors.service.connector import ConnectionService
+ from recurvedata.executors.utils import run_with_result_handling_v2
+ from recurvedata.server.connector.schemas import (
+     ListColumnsResponse,
+     ListDatabasesResponse,
+     ListFullDatabasesResponse,
+     ListTablesResponse,
+     TestConnectionResponse,
+ )
+
+ tracer = Tracing()
+ router = APIRouter()
+
+
+ @router.post("/test-connection")
+ @tracer.create_span(sampling_rate=0.1)
+ async def test_connection(*, payload: TestConnectionPayload) -> TestConnectionResponse:
+     logger.info(f"test_connection: {payload.connection_type}")
+
+     res: ResponseModel = await run_with_result_handling_v2(
+         ConnectionService.test_connection, payload.timeout, payload.connection_type, payload.config
+     )
+     logger.info("finish test_connection")
+     return TestConnectionResponse.model_validate(res.model_dump())
+
+
+ @router.post("/list-databases")
+ @tracer.create_span(sampling_rate=0.1)
+ async def list_databases(*, payload: ConnectionRuntimePayload) -> ListDatabasesResponse:
+     logger.info(f"list_databases: {payload.connection_type}")
+     res: ResponseModel = await run_with_result_handling_v2(
+         ConnectionService.list_databases, None, payload.connection_type, payload.config
+     )
+     logger.info("finish list_databases")
+     return ListDatabasesResponse.model_validate(res.model_dump())
+
+
+ @router.post("/list-tables")
+ @tracer.create_span(sampling_rate=0.1)
+ async def list_tables(*, payload: TableListPayload) -> ListTablesResponse:
+     logger.info(f"list_tables: {payload.connection_type} {payload.database}")
+     res: ResponseModel = await run_with_result_handling_v2(
+         ConnectionService.list_tables, None, payload.connection_type, payload.config, payload.database
+     )
+     logger.info("finish list_tables")
+     return ListTablesResponse.model_validate(res.model_dump())
+
+
+ @router.post("/list-columns")
+ @tracer.create_span(sampling_rate=0.1)
+ async def list_columns(*, payload: ColumnListPayload) -> ListColumnsResponse:
+     logger.info(f"list_columns: {payload.connection_type} {payload.database} {payload.table}")
+     res: ResponseModel = await run_with_result_handling_v2(
+         ConnectionService.list_columns, None, payload.connection_type, payload.config, payload.database, payload.table
+     )
+     logger.info("finish list_columns")
+     return ListColumnsResponse.model_validate(res.model_dump())
+
+
+ @router.post("/list-full-databases")
+ @tracer.create_span(sampling_rate=0.1)
+ async def list_full_databases(*, payload: ConnectionRuntimePayload) -> ListFullDatabasesResponse:
+     logger.info(f"list_full_databases: {payload.connection_type}")
+     res: ResponseModel = await run_with_result_handling_v2(
+         ConnectionService.list_full_databases, None, payload.connection_type, payload.config
+     )
+     logger.info("finish list_full_databases")
+     return ListFullDatabasesResponse.model_validate(res.model_dump())
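
A hedged sketch of exercising the /test-connection route with FastAPI's TestClient; the payload fields mirror how the handlers above use them, but the concrete config keys, the "mysql" connection type, and mounting the router at the application root are assumptions.

# Illustrative sketch only: call the test-connection endpoint in-process.
# Config keys and the "mysql" connection type are assumed example values.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from recurvedata.server.connector.api import router

app = FastAPI()
app.include_router(router)  # the real app may mount this router under a prefix

client = TestClient(app)
resp = client.post(
    "/test-connection",
    json={
        "connection_type": "mysql",
        "timeout": 30,
        "config": {"host": "127.0.0.1", "port": 3306, "user": "root", "password": "secret"},
    },
)
print(resp.status_code, resp.json())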
recurvedata/server/connector/schemas.py ADDED
@@ -0,0 +1,28 @@
+ from recurvedata.executors.schemas import (
+     ColumnItem,
+     FullDatabaseItem,
+     ListDatabases,
+     Pagination,
+     ResponseModel,
+     TableItem,
+ )
+
+
+ class TestConnectionResponse(ResponseModel):
+     pass
+
+
+ class ListDatabasesResponse(ResponseModel):
+     data: ListDatabases | None
+
+
+ class ListTablesResponse(ResponseModel):
+     data: Pagination[TableItem] | None
+
+
+ class ListColumnsResponse(ResponseModel):
+     data: Pagination[ColumnItem] | None
+
+
+ class ListFullDatabasesResponse(ResponseModel):
+     data: Pagination[FullDatabaseItem] | None
recurvedata/server/data_service/__init__.py ADDED
File without changes