recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,172 @@
1
+ import logging
2
+ import os
3
+ from tempfile import NamedTemporaryFile
4
+
5
+ from recurvedata.core.translation import _l
6
+ from recurvedata.operators.operator import BaseOperator
7
+ from recurvedata.operators.task import BaseTask
8
+ from recurvedata.utils.mp import robust_run_subprocess
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ AIRFLOW_PYTHON_PATH = "python" # system python path
13
+
14
+
15
class SensorTask(BaseTask):
    """
    Sensor task that blocks until an upstream task in another job succeeds.

    It is implemented on top of Airflow's ExternalTaskSensor: the task renders a
    standalone Python script (see ``generate_airflow_code``) and runs it with the
    system ``python`` interpreter in a subprocess.

    node_key semantics:
    - modeling pipeline: the node_key of the selected model;
    - advanced pipeline, normal operator (e.g. SQLOperator): the node_key of that
      operator;
    - advanced pipeline, Modeling Pipeline (LinkModelPipelineOperator):
      "{node_key of the LinkModelPipelineOperator}.{node_key of the model}".
    """

    # todo: same schedule interval dependency

    @classmethod
    def config_schema(cls):
        """Return the JSON schema for the sensor's configuration form."""
        return {
            "type": "object",
            "properties": {
                "project_id": {
                    "type": "string",
                    "title": _l("Project Name"),
                    "description": _l("Project Name containing the external task"),
                    "ui:field": "SensorOperatorProjectSelectField",
                },
                "job_id": {
                    "type": "string",
                    "title": _l("Job Name"),
                    "description": _l("Job Name of the external task"),
                    "ui:field": "SensorOperatorJobSelectField",
                },
                "node_key": {
                    "type": "string",
                    "title": _l("Node Name"),
                    "description": _l("Node Name of the external task"),
                    "ui:field": "SensorOperatorNodeSelectField",
                },
                "wait_time": {
                    "type": "integer",
                    "title": _l("Wait Time"),
                    "description": _l("Wait time in seconds between checks"),
                    "ui:options": {
                        "min": 0,
                        "step": 1,
                    },
                },
                "timeout": {
                    "type": "integer",
                    "title": _l("Timeout"),
                    "description": _l("Timeout"),
                    # NOTE(review): schema default is 60, but the code paths below
                    # fall back to 3600 when "timeout" is absent — confirm which
                    # value is intended.
                    "default": 60,
                    "ui:options": {
                        "min": 0,
                        "step": 1,
                    },
                },
            },
            "required": ["project_id", "job_id", "node_key"],
        }

    def generate_airflow_operator_code(self):
        """Render the script fragment that builds and (conditionally) runs the
        ExternalTaskSensor.

        The returned text is an f-string: config values are baked in with ``!r``
        at generation time. It expects ``dag``, ``context`` and
        ``get_dag_from_db`` to be defined by the surrounding script produced by
        ``generate_airflow_code``.
        """
        config = self.rendered_config
        external_job_id = config["job_id"]
        external_node_key = config["node_key"]
        # wait_time is applied as a fixed offset (in seconds) subtracted from
        # this run's data_interval_end to locate the upstream run.
        execution_delta = config.get("wait_time", 0)
        timeout = config.get("timeout", 3600)
        dag_name = self.dag.name

        return f"""
import sys
from recurvedata.operators.sensor_operator.airflow_utils import format_external_dag_id, format_external_task_id, data_interval_end_to_data_interval_start
from airflow.sensors.external_task import ExternalTaskSensor
import datetime
import logging
from recurvedata.schedulers.consts import is_dev_run_job

logger = logging.getLogger()
external_dag_id = format_external_dag_id({external_job_id!r})
external_task_id = format_external_task_id({external_node_key!r})

external_dag = get_dag_from_db(external_dag_id)
if not external_dag:
    raise ValueError("External DAG not found")
external_task = external_dag.get_task(external_task_id)
if not external_task:
    raise ValueError("External Task not found")

data_interval_end = context["data_interval_end"]
external_data_interval_end = data_interval_end - datetime.timedelta(seconds={execution_delta})
external_data_interval_start = data_interval_end_to_data_interval_start(external_dag, external_data_interval_end)

logger.debug("external_data_interval_start " + str(external_data_interval_start))

tmp_task_id="tmp_task_id_for_external_task_sensor"
operator = ExternalTaskSensor(
    dag=dag,
    task_id=tmp_task_id,
    external_dag_id=external_dag_id,
    external_task_id=external_task_id,
    execution_date_fn = lambda *args, **kwargs: external_data_interval_start,
    execution_timeout = datetime.timedelta(seconds={timeout}),
)
if is_dev_run_job({dag_name!r}):
    logger.info(f"dag_name: {dag_name!r}")
    logger.info(f"skip: SensorOperator is not working in dev mode")
    sys.exit(0)

"""

    def generate_airflow_code(self) -> str:
        """Render the complete standalone script executed by
        ``__run_airflow_operator``.

        The outer template is filled with ``str.format`` (not an f-string), so
        braces inside the already-rendered ``operator_code`` are not
        re-processed. The script rebuilds the Airflow DAG/task/context from the
        AIRFLOW_CTX_* environment variables and then runs the sensor under a
        wall-clock timeout.
        """
        config = self.rendered_config
        timeout = config.get("timeout", 3600)
        operator_code = self.generate_airflow_operator_code()
        return """
import os
import time
from recurvedata.operators.sensor_operator.airflow_utils import prepare_airflow_env, get_dag_from_db, \\
    build_execute_context
from recurvedata.utils.timeout import timeout

prepare_airflow_env()

dag_id = os.environ.get("AIRFLOW_CTX_DAG_ID")
task_id = os.environ.get("AIRFLOW_CTX_TASK_ID")
run_id = os.environ.get("AIRFLOW_CTX_DAG_RUN_ID")

dag = get_dag_from_db(dag_id)
task = dag.get_task(task_id)
context = build_execute_context(dag, task, run_id)

{operator_code}

with timeout({timeout}):
    operator.execute(context)
""".format(
            operator_code=operator_code,
            timeout=timeout,
        )

    def __run_airflow_operator(self, filename: str):
        """Execute the generated script with the system python interpreter.

        Raises:
            RuntimeError: if the subprocess exits with a non-zero return code;
                the combined output is included in the message.
        """
        script_path = os.path.abspath(filename)
        env = os.environ.copy()
        output, ret_code = robust_run_subprocess([AIRFLOW_PYTHON_PATH, script_path], _logger=logger, env=env)
        if ret_code:
            raise RuntimeError(f"Airflow Error:\n{output}")

    def execute_impl(self, *args, **kwargs):
        """Write the generated sensor script to a temp file and run it.

        The NamedTemporaryFile context manager removes the script when the
        subprocess finishes (success or failure).
        """
        code = self.generate_airflow_code()
        prefix = f"reorc_sensor_operator_{self.dag.id}_{self.node.id}_"
        with NamedTemporaryFile(mode="w+t", prefix=prefix, suffix=".py") as tmp_file:
            tmp_file.write(code)
            tmp_file.flush()
            self.__run_airflow_operator(tmp_file.name)
169
+
170
+
171
class SensorOperator(BaseOperator):
    """Operator wrapper that exposes SensorTask as a pipeline node."""

    task_cls = SensorTask
@@ -0,0 +1 @@
1
+ from recurvedata.operators.spark_operator.operator import SparkOperator
@@ -0,0 +1,200 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from functools import lru_cache
5
+ from importlib import resources
6
+ from subprocess import PIPE, STDOUT, Popen
7
+ from tempfile import NamedTemporaryFile
8
+ from textwrap import dedent
9
+
10
+ from recurvedata.core.translation import _l
11
+ from recurvedata.operators.operator import BaseOperator
12
+ from recurvedata.operators.task import BaseTask
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ @lru_cache()
18
+ def get_sample_code():
19
+ return resources.files("recurvedata.operators.spark_operator").joinpath("spark_sample.py").read_text()
20
+
21
+
22
class SparkTask(BaseTask):
    """Task that submits user-supplied PySpark code through a configured submitter.

    The code is written to a temporary ``.py`` file and handed to the submit
    command assembled from the connection's ``submitter`` plus the merged
    execution configuration; subprocess output is streamed to the task log.
    """

    @classmethod
    def config_schema(cls):
        """Return the JSON schema for the Spark task's configuration form."""
        return {
            "type": "object",
            "properties": {
                "spark_source": {
                    "type": "string",
                    "title": _l("Spark Environment"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "spark",
                        ],
                    },
                    "description": _l("Select the Spark environment and version to use for this task"),
                },
                "env": {
                    "type": "string",
                    "title": _l("Environment Variables"),
                    "default": "{}",
                    "description": _l(
                        'Additional environment variables in JSON format (e.g. {"HADOOP_CONF_DIR": "/etc/hadoop/conf"})'
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                "execution_config": {
                    "type": "string",
                    "title": _l("Spark Configuration"),
                    "default": dedent(
                        """\
                        {
                            "master": "yarn",
                            "executor-memory": "4g",
                            "num-executors": "10",
                            "executor-cores": "2",
                            "queue": "default",
                            "conf": {
                                "spark.dynamicAllocation.enabled": "False"
                            }
                        }
                        """
                    ),
                    "description": _l(
                        "Spark execution parameters and configurations. See "
                        "<a target='_blank' href='https://spark.apache.org/docs/latest/configuration.html'>"
                        "Spark Docs</a> for available options"
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                "code": {
                    "type": "string",
                    "title": _l("Spark Code"),
                    "default": get_sample_code(),
                    "description": _l(
                        "PySpark code to execute. The default template shows how to create a SparkSession "
                        "(Spark 2.3+). Supports Jinja templating for dynamic code generation."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["spark_source", "env", "execution_config", "code"],
        }

    def __create_env(self, source_env, extra_env):
        """Build the subprocess environment.

        ``os.environ`` is overlaid with the connection's env, then the user's
        extra env (highest priority).
        """
        env = os.environ.copy()
        # Robustness fix: either mapping may be None (e.g. the connection has no
        # "env" key, or the user config parsed to null); dict.update(None) would
        # raise TypeError.
        env.update(source_env or {})
        env.update(extra_env or {})
        return env

    def __create_bash_command(self, script_path, submitter, execution_config, source_conf: dict):
        """Assemble the submit command line.

        ``execution_config`` entries become ``--<key> <value>`` flags, except the
        nested "conf" dict which expands to ``--conf k=v`` pairs. Entries from
        the connection-level ``source_conf`` are appended as ``--conf`` flags
        unless the task config already set the same conf key.
        """
        execution_conf_list = []
        conf_names = []
        for k, v in execution_config.items():
            if k == "conf":
                for k2, v2 in v.items():
                    execution_conf_list.append(f"--conf {k2}={v2}")
                    conf_names.append(k2)
            else:
                execution_conf_list.append(f"--{k} {v}")
        for k, v in source_conf.items():
            if k in conf_names:
                continue
            execution_conf_list.append(f"--conf {k}={v}")
        execution_conf_str = " ".join(execution_conf_list)
        bash_command = submitter + " " + execution_conf_str + " " + script_path
        return bash_command

    def __execute_command(self, bash_command, env):
        """Run the submit command via bash, streaming merged stdout/stderr to the log.

        Raises:
            Exception: if the subprocess exits with a non-zero return code.
        """
        logger.info("Running command: %s", bash_command)
        sub_process = Popen(["bash", "-c", bash_command], stdout=PIPE, stderr=STDOUT, env=env)
        logger.info("Output:")
        for raw_line in iter(sub_process.stdout.readline, b""):
            line = raw_line.decode("utf8").rstrip()
            logger.info(line)
        sub_process.wait()
        logger.info("Node exited with return code %s", sub_process.returncode)
        if sub_process.returncode:
            raise Exception("Spark node failed")

    @classmethod
    def __filter_empty_value_in_dict(cls, dct: dict):
        """Drop None / "" / {} values; falsy inputs are returned unchanged."""
        if not dct:
            return dct
        return {k: v for (k, v) in dct.items() if (v is not None and v != "" and v != {})}

    @classmethod
    def _merge_dict(cls, priority_dct: dict, other_dct: dict):
        """Recursively merge two dicts; values from ``priority_dct`` win.

        Keys present in both inputs whose values are both dicts are merged key
        by key; otherwise the priority value is kept. If either input is falsy,
        the other is returned as-is.
        """
        if not (other_dct and priority_dct):
            return priority_dct or other_dct
        result_dct = {}
        for key in set(list(priority_dct.keys()) + list(other_dct.keys())):
            if key not in priority_dct:
                result_dct[key] = other_dct[key]
                continue
            if key not in other_dct:
                result_dct[key] = priority_dct[key]
                continue
            # Bug fix: the original tested isinstance(other_dct, dict) — always
            # true here — so a dict priority value paired with a non-dict other
            # value recursed into _merge_dict and crashed on .keys(). Check the
            # per-key value instead.
            if isinstance(priority_dct[key], dict) and isinstance(other_dct[key], dict):
                result_dct[key] = cls._merge_dict(priority_dct[key], other_dct[key])
                continue
            result_dct[key] = priority_dct[key]
        return result_dct

    def __execute_spark_code(self, config):
        """Resolve the Spark connection, merge configs, and submit the user code.

        Renamed from the misspelled ``__excute_spark_code``; the name is
        class-private (name-mangled) and only referenced below, so the rename is
        invisible to callers.
        """
        spark_source = self.must_get_connection_by_name(config.spark_source)
        submitter = spark_source.extra.get("submitter")
        source_conf = spark_source.extra.get("conf", {})
        source_env = self.__filter_empty_value_in_dict(
            spark_source.extra.get("env")
        )  # Some empty values may be saved when saving on the page
        # Task-level execution_config takes priority over the connection's.
        execution_config = self._merge_dict(
            self.__filter_empty_value_in_dict(json.loads(config.execution_config)),
            self.__filter_empty_value_in_dict(spark_source.extra.get("execution_config")),
        )
        extra_env = json.loads(config.env)  # User input, don't filter empty values
        code = config.code

        prefix = f"recurve_pyspark_{self.dag.dag_id}_{self.node.id}_"
        with NamedTemporaryFile(mode="w+t", prefix=prefix, suffix=".py") as tmp_file:
            tmp_file.write(code)
            tmp_file.flush()
            logger.info(code)
            script_path = os.path.abspath(tmp_file.name)
            bash_command = self.__create_bash_command(script_path, submitter, execution_config, source_conf)
            env = self.__create_env(source_env, extra_env)
            self.__execute_command(bash_command, env)

    def execute_impl(self, *args, **kwargs):
        """Entry point: submit the rendered Spark code. Returns None."""
        config = self.rendered_config
        self.__execute_spark_code(config)
        return None
189
+
190
+
191
class SparkOperator(BaseOperator):
    """Operator wrapper that exposes SparkTask as a pipeline node."""

    task_cls = SparkTask

    @classmethod
    def validate(cls, configuration):
        """Validate the node configuration against the base schema.

        NOTE(review): a stricter check that execution_config["master"] equals
        "yarn" was previously sketched here and left disabled.
        """
        return super().validate(configuration)
@@ -0,0 +1,47 @@
1
# Sample PySpark job shipped as the default "Spark Code" template for the Spark
# operator (read via get_sample_code()). Guarded by try/except ImportError so
# this module stays importable on machines without pyspark installed.
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    # Initialize a Spark session
    spark = SparkSession.builder.appName("PySpark SQL Example").getOrCreate()

    # Define the schema
    schema = StructType(
        [
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("city", StringType(), True),
        ]
    )

    # Create a DataFrame manually
    data = [
        ("Alice", 34, "New York"),
        ("Bob", 45, "Los Angeles"),
        ("Cathy", 29, "Chicago"),
        ("David", 31, "New York"),
        ("Emma", 42, "San Francisco"),
    ]

    df = spark.createDataFrame(data, schema)

    # Show the DataFrame
    df.show()

    # Register the DataFrame as a temporary view
    df.createOrReplaceTempView("people")

    # Perform a SQL query
    result_df = spark.sql("SELECT * FROM people WHERE age > 30")

    # Show the result of the SQL query
    result_df.show()

    # Write the result to another CSV file
    result_df.write.csv("output.csv", header=True)

    # Stop the Spark session
    spark.stop()

except ImportError:
    pass
@@ -0,0 +1 @@
1
+ from recurvedata.operators.sql_operator.operator import SQLOperator
@@ -0,0 +1,90 @@
1
+ from typing import Any
2
+
3
+ import jsonschema
4
+
5
+ from recurvedata.connectors.service import list_sql_operator_types
6
+ from recurvedata.core.translation import _l
7
+ from recurvedata.operators.operator import BaseOperator
8
+ from recurvedata.operators.task import BaseTask
9
+ from recurvedata.operators.utils import lineage
10
+
11
+
12
class SQLTask(BaseTask):
    """Task that runs user-supplied SQL against a DBAPI-style connection."""

    # These config fields are used verbatim, without Jinja rendering.
    no_template_fields = ("autocommit", "data_source_name")

    @classmethod
    def config_schema(cls):
        """Return the JSON schema for the SQL task's configuration form."""
        source_field = {
            "type": "string",
            "title": _l("Data Source"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {"supportTypes": list_sql_operator_types()},
        }
        sql_field = {
            "type": "string",
            "title": _l("SQL Query"),
            "description": _l(
                "Execute single or multiple SQL statements. "
                "Supports Jinja templating for variables and dynamic queries."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "sql",
                "sqlLang": "sql",
            },
        }
        return {
            "type": "object",
            "properties": {
                "data_source_name": source_field,
                # NOTE: an optional "database" field used to live here and is
                # currently disabled; parse_lineage still reads config.database.
                "sql": sql_field,
            },
            "required": ["data_source_name", "sql"],
        }

    @classmethod
    def validate(cls, configuration: dict[str, Any]) -> dict[str, Any]:
        """Validate the config and ensure the selected source is a DBAPI one.

        Raises:
            jsonschema.ValidationError: if the data source is not DBAPI-backed.
        """
        validated = super().validate(configuration)

        connection = cls.must_get_connection_by_name(configuration["data_source_name"])
        if not connection.is_dbapi:
            raise jsonschema.ValidationError(message="only DBAPI is supported", path=("data_source_name",))
        return validated

    def execute_impl(self, *args, **kwargs):
        """Render the SQL, prepend a tracking comment, and execute it."""
        config = self.rendered_config
        connector = self.get_connection_by_name(config.data_source_name).connector

        sql_text = config.sql
        if connector.is_hive():
            # Set spark.app.name to help locate the specific Recurve task in the YARN UI
            sql_text = f"SET spark.app.name=recurve.{self.dag.name}.{self.node.name};\n{sql_text}"

        annotated = connector.add_leading_comment(sql_text, self.get_query_comment_conf())
        connector.execute(annotated, autocommit=config.get("autocommit", False))
        return None

    def parse_lineage(self):
        """Extract lineage from the configured SQL when the source type supports it."""
        config = self.rendered_config
        source = self.get_connection_by_name(config.data_source_name)
        if not lineage.supported_recurve_ds_type(source.ds_type):
            return
        # NOTE(review): config.database refers to the disabled "database" schema
        # field — confirm rendered_config still resolves the attribute.
        return lineage.parse_lineage(config.sql, config.database, source.name, source.ds_type)
87
+
88
+
89
class SQLOperator(BaseOperator):
    """Operator wrapper that exposes SQLTask as a pipeline node."""

    task_cls = SQLTask