recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,211 @@
1
+ import datetime
2
+ import json
3
+ import logging
4
+ import urllib.parse
5
+ from functools import cached_property
6
+ from typing import TYPE_CHECKING, Any, Optional, Union
7
+
8
+ import jsonschema
9
+
10
+ from recurvedata.consts import ETLExecutionStatus
11
+ from recurvedata.core.templating import Renderer
12
+ from recurvedata.operators.base import Configurable
13
+ from recurvedata.operators.context import context
14
+ from recurvedata.operators.models import DagBase, NodeBase
15
+ from recurvedata.utils.attrdict import AttrDict
16
+
17
+ if TYPE_CHECKING:
18
+ from recurvedata.executors.client import ExecutorClient
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class LineageTaskMixin(object):
    """Mixin that adds lineage processing to a task.

    ``process_lineage`` is the entry point: it calls ``parse_lineage`` and hands
    the result to ``save_lineage``. Subclasses are expected to override
    ``parse_lineage`` (and ``save_lineage`` once a persistence backend exists).
    """

    # todo: move to utils
    def process_lineage(self):
        """Parse and save lineage; any exception is logged and swallowed."""
        try:
            lineage = self.parse_lineage()
            self.save_lineage(lineage)
        except Exception as e:
            # lineage_fail_notify(self)
            # Log with traceback and return normally so the caller never sees
            # a lineage failure.
            logger.exception("failed to process lineage, error: %s", e)

    def parse_lineage(self):
        """Return the lineage of this task. The default returns ``None``."""
        pass

    def save_lineage(self, lineage):
        """Persist ``lineage``; a falsy ``lineage`` is a no-op.

        Raises:
            NotImplementedError: if ``lineage`` is non-empty, because no
                persistence backend is wired up in this class yet.
        """
        if not lineage:
            return

        # The previous body called ``self.save_lineage(self, lineage)``, which
        # passed one positional argument too many and therefore always failed
        # with a TypeError. Fail explicitly instead until persistence exists.
        # todo: worker sdk
        raise NotImplementedError("saving lineage is not implemented yet")
43
+
44
+
45
class BaseTask(Configurable, LineageTaskMixin):
    """Base class for a task built from a DAG node's configuration.

    Stores the dag, node, execution date and variables, renders the node
    configuration through ``Renderer`` and wraps ``execute_impl`` with
    start/finish bookkeeping reported through ``context``.
    """

    no_template_fields = ()  # fields that are not rendered with jinja
    ds_name_fields = ()

    def __init__(self, dag: DagBase, node: NodeBase, execution_date: datetime.datetime, variables: dict = None):
        """Bind the task to ``dag``/``node`` and wrap the node configuration."""
        self.dag: DagBase = dag
        self.node: NodeBase = node
        self.execution_date: datetime.datetime = execution_date
        self.variables: dict = variables or {}

        self.config = AttrDict(self.node.configuration)
        self.task_instance_id: int = 0

    @classmethod
    def validate(cls, configuration: dict) -> dict:
        """Validate ``configuration`` and check every named connection exists.

        Raises:
            jsonschema.ValidationError: if a field listed in
                ``ds_name_fields`` names a connection that ``context`` cannot
                find.
        """
        validated = super().validate(configuration)

        # Every configured data source must resolve to a known connection.
        for field_name in cls.ds_name_fields:
            connection_name = configuration[field_name]
            if context.get_connection_by_name(connection_name=connection_name):
                continue
            raise jsonschema.ValidationError(
                message=f"Unknown data source {repr(connection_name)}", path=(field_name,)
            )
        return validated

    @classmethod
    def get_ds_name_field_values(cls, rendered_config: dict) -> list[str]:
        """Collect the distinct values of ``ds_name_fields`` in ``rendered_config``.

        A field is looked up as a top-level key first. Otherwise, if it
        contains dots, it is walked as a nested path and kept only when the
        whole path exists and ends at a string.
        """
        names = set()
        for field in cls.ds_name_fields:
            if field in rendered_config:
                names.add(rendered_config[field])
                continue
            if "." not in field:
                continue

            cursor = rendered_config
            for part in field.split("."):
                if part not in cursor:
                    break
                cursor = cursor[part]
            else:
                # Reached only when every path segment was found.
                if isinstance(cursor, str):
                    names.add(cursor)
        return list(names)

    @cached_property
    def rendered_config(self) -> AttrDict:
        """Rendered configuration, computed once on first access."""
        return self.render_config()

    def render_config(self) -> AttrDict:
        """Render templated config values and return them as an ``AttrDict``.

        ``None`` values, keys in ``no_template_fields`` and values that are not
        ``str``/``dict``/``list``/``tuple`` are copied through unrendered.
        """
        renderer = Renderer()
        template_ctx = self.get_template_context()
        raw_fields = self.__class__.no_template_fields

        rendered = {}
        for key, value in self.config.items():
            renderable = (
                value is not None
                and key not in raw_fields
                and isinstance(value, (str, dict, list, tuple))
            )
            rendered[key] = renderer.render_template(value, template_ctx) if renderable else value
        return AttrDict(rendered)

    def get_template_context(self) -> dict[str, Any]:
        """Build the template context, with ``variables`` applied on top."""
        template_ctx = Renderer.init_context(self.execution_date, self.dag.schedule_interval)
        template_ctx.update(self.variables)
        return template_ctx

    def execute(self, *args, **kwargs):
        """Run the task lifecycle around ``execute_impl``.

        An exception from ``execute_impl`` is captured, passed to
        ``on_execute_impl_error`` and ``on_task_finish``, and re-raised at the
        end.
        """
        # TODO: create new task instance, send request to server or message queue?

        self.on_task_start()
        self.before_execute_hook()

        meta = error = error_stack = None

        logger.info("task configuration: %s", json.dumps(self.rendered_config, indent=2, ensure_ascii=False))
        try:
            meta = self.execute_impl(*args, **kwargs)
        except Exception as exc:
            error = exc
            error_stack = repr(exc)
            self.on_execute_impl_error(exc)

        self.after_execute_hook()
        self.on_task_finish(meta, error, error_stack)  # todo: try except?

        if error is not None:
            raise error

    def on_task_start(self):
        """Register the task instance via ``context`` and remember its id."""
        self.task_instance_id = context.init_task_instance_on_task_start(self)

    def on_task_finish(self, meta: Any, error: Exception, error_stack: str):
        """Report the final status, meta and error of the task via ``context``.

        The status is FAILED when ``error_stack`` is truthy, SUCCESS otherwise.
        A truthy ``meta`` is converted with ``meta.to_json()``; if that fails,
        ``meta`` is reported as ``None``.
        """
        try:
            if meta:
                meta = meta.to_json()
        except Exception as e:
            logger.debug(f"failed to get json from meta {meta}, error: {e}")
            meta = None

        task_status = ETLExecutionStatus.FAILED if error_stack else ETLExecutionStatus.SUCCESS
        context.update_task_instance_on_task_finish(self, self.task_instance_id, task_status, meta, error, error_stack)

    def before_execute_hook(self):
        """Hook called before ``execute_impl``; does nothing by default."""
        pass

    def after_execute_hook(self):
        """Hook called after ``execute_impl``, whether or not it failed; does nothing by default."""
        pass

    def on_execute_impl_error(self, exc: Exception):
        """callback function to be called if `execute_impl` throws exceptions"""
        pass

    def execute_impl(self, *args, **kwargs):
        """Do the actual work of the task; must be provided by subclasses."""
        raise NotImplementedError

    def get_query_comment_conf(self) -> str:
        """Return ``"Source: ..., Owner: ..., Node: ..."`` describing this task."""
        query_config = {
            "Source": "Recurve",
            "Owner": self.dag.owner,
            "Node": self.node_url,
        }
        return ", ".join([f"{label}: {value}" for label, value in query_config.items()])

    def set_execution_date(self, execution_date):
        """Replace ``execution_date`` after the config has been rendered once."""
        if execution_date == self.execution_date:
            return
        # rendered_config depends on self.execution_date, so it has to be
        # rendered (and cached) with the old execution_date before
        # self.execution_date is replaced.
        _ = self.rendered_config
        self.execution_date = execution_date

    @property
    def node_url(self) -> str:
        """URL of this node, built from the client's base URL."""
        # https://dev-test.recurve.test.recurvedata.com/datawork/workflow?p_id=257942399102349312&wf_id=258282502478635008&open_drawer=true&node_key=D2f0I
        host = context.client.base_url  # todo: correct it
        params = {
            "p_id": self.dag.project_id,
            "job_id": self.dag.id,
            "node_key": self.node.node_key,
            "open_drawer": "true",
        }
        query_string = urllib.parse.urlencode(params)
        return f"{host}/datawork/workspace/job?{query_string}"

    # add proxy methods to avoid importing context everywhere

    @staticmethod
    def get_connection_by_name(name: str):
        """Proxy for ``context.get_connection_by_name``."""
        return context.get_connection_by_name(name)

    @staticmethod
    def must_get_connection_by_name(name: str):
        """Proxy for ``context.must_get_connection_by_name``."""
        return context.must_get_connection_by_name(name)

    @staticmethod
    def get_connection_names_by_type(connection_type: Union[str, list[str]]) -> list[str]:
        """Proxy for ``context.get_connection_names_by_type``."""
        return context.get_connection_names_by_type(connection_type)

    @property
    def stage(self) -> Optional[str]:
        """Stage of the task; the base implementation returns ``None``."""
        return None

    @property
    def client(self) -> "ExecutorClient":
        """The client held by ``context``."""
        return context.client
@@ -0,0 +1,40 @@
1
+ from recurvedata.operators.transfer_operator.dump_aliyun_sls import AliyunSLSDumpTask
2
+ from recurvedata.operators.transfer_operator.dump_task_dbapi import DBAPIDumpTask
3
+ from recurvedata.operators.transfer_operator.dump_task_es import ElasticSearchDumpTask
4
+ from recurvedata.operators.transfer_operator.dump_task_feishu_sheet import FeishuSheetDumpTask
5
+ from recurvedata.operators.transfer_operator.dump_task_ftp import FTPDumpTask
6
+ from recurvedata.operators.transfer_operator.dump_task_google_sheet import GoogleSheetDumpTask
7
+
8
+ # from recurvedata.operators.transfer_operator.dump_task_cass import CassandraDumpTask
9
+ from recurvedata.operators.transfer_operator.dump_task_mongodb import MongoDBDumpTask
10
+ from recurvedata.operators.transfer_operator.dump_task_oss import AliyunOSSDumpTask
11
+ from recurvedata.operators.transfer_operator.dump_task_python import PythonDumpTask
12
+ from recurvedata.operators.transfer_operator.dump_task_s3 import S3DumpTask
13
+ from recurvedata.operators.transfer_operator.dump_task_sftp import SFTPDumpTask
14
+ from recurvedata.operators.transfer_operator.load_task_aliyun_oss import AliyunOSSLoadTask
15
+ from recurvedata.operators.transfer_operator.load_task_azure_blob import AzureBlobStorageLoadTask
16
+ from recurvedata.operators.transfer_operator.load_task_clickhouse import ClickHouseLoadTask
17
+
18
+ # from recurvedata.operators.transfer_operator.load_task_filebrowser import FileBrowserLoadTask
19
+ # from recurvedata.operators.transfer_operator.load_task_hive import HiveLoadTask
20
+ # from recurvedata.operators.transfer_operator.load_task_owncloud import OwnCloudLoadTask
21
+ # from recurvedata.operators.transfer_operator.load_task_recurve_data_prep import DataPrepLoadTask
22
+ # from recurvedata.operators.transfer_operator.load_task_yicrowds import YiCrowdsLoadTask
23
+ # from recurvedata.operators.transfer_operator.load_task_azure_synapse import AzureSynapseLoadTask
24
+ # from recurvedata.operators.transfer_operator.load_task_email import EmailLoadTask
25
+ from recurvedata.operators.transfer_operator.load_task_doris import DorisLoadTask
26
+ from recurvedata.operators.transfer_operator.load_task_es import ElasticSearchLoadTask
27
+ from recurvedata.operators.transfer_operator.load_task_ftp import FTPLoadTask
28
+ from recurvedata.operators.transfer_operator.load_task_google_bigquery import GoogleBigqueryLoadTask
29
+ from recurvedata.operators.transfer_operator.load_task_google_cloud_storage import GoogleCloudStorageLoadTask
30
+ from recurvedata.operators.transfer_operator.load_task_google_sheet import GoogleSheetLoadTask
31
+ from recurvedata.operators.transfer_operator.load_task_microsoft_fabric import MicrosoftFabricLoadTask
32
+ from recurvedata.operators.transfer_operator.load_task_mssql import MsSQLLoadTask
33
+ from recurvedata.operators.transfer_operator.load_task_mysql import MySQLLoadTask
34
+ from recurvedata.operators.transfer_operator.load_task_postgresql import PostgresqlLoadTask
35
+ from recurvedata.operators.transfer_operator.load_task_qcloud_cos import TencentCOSLoadTask
36
+ from recurvedata.operators.transfer_operator.load_task_redshift import RedshiftLoadTask
37
+ from recurvedata.operators.transfer_operator.load_task_s3 import S3LoadTask
38
+ from recurvedata.operators.transfer_operator.load_task_sftp import SFTPLoadTask
39
+ from recurvedata.operators.transfer_operator.load_task_starrocks import StarRocksLoadTask
40
+ from recurvedata.operators.transfer_operator.operator import TransferOperator
@@ -0,0 +1,10 @@
1
# String identifiers for load modes.
# NOTE(review): presumably the values accepted by load tasks' mode option — confirm against usage.
LOAD_RENAME_OVERWRITE = "RENAME_OVERWRITE"
LOAD_OVERWRITE = "OVERWRITE"
LOAD_MERGE = "MERGE"
LOAD_APPEND = "APPEND"


# Python source text defining an identity ``transform(filename)`` function.
# NOTE(review): presumably the default value of a file-name transform code field — confirm against usage.
FILE_TRANSFORM_FUNC_DEFAULT_VALUE = """\
def transform(filename: str) -> str:
    return filename
"""
@@ -0,0 +1,82 @@
1
+ import copy
2
+
3
+ from recurvedata.core.translation import _l
4
+ from recurvedata.operators.transfer_operator import utils
5
+ from recurvedata.operators.transfer_operator.task import DumpTask
6
+ from recurvedata.pigeon.dumper.aliyun_sls import AliyunSLSDumper
7
+ from recurvedata.utils import extract_dict
8
+
9
+
10
class AliyunSLSDumpTask(DumpTask):
    """Dump task that reads logs from Aliyun SLS through ``AliyunSLSDumper``."""

    ds_name_fields = ("data_source_name",)

    def execute_impl(self, *args, **kwargs):
        """Build the dumper options from the rendered config and connection, then run the dumper."""
        connection = self.must_get_connection_by_name(self.config["data_source_name"])
        handler_factory = self.create_handler_factory()

        option_keys = ["project", "logstore", "query", "start_time", "end_time", "fields"]
        options = extract_dict(self.rendered_config, keys=option_keys)

        # Credentials and endpoint come from the connection, not the task config.
        access_key_id = connection.data.get("access_key_id")
        access_key_secret = connection.data.get("access_key_secret")
        endpoint = connection.data.get("endpoint")
        options.update(
            {
                "endpoint": endpoint,
                "access_key_id": access_key_id,
                "access_key_secret": access_key_secret,
                "handler_factories": [handler_factory],
            }
        )
        return AliyunSLSDumper(**options).execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON-schema dict describing this task's configuration form."""
        properties = {}
        properties["data_source_name"] = {
            "type": "string",
            "title": _l("Aliyun Access Key"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {
                "supportTypes": ["aliyun_access_key"],
            },
        }
        properties["project"] = {"type": "string", "title": _l("Project Name")}
        properties["logstore"] = {"type": "string", "title": _l("Logstore Name")}
        properties["query"] = {
            "type": "string",
            "title": _l("Query"),
            "description": _l("Query to retrieve logs from Aliyun SLS."),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "sql",
            },
        }
        properties["start_time"] = {
            "type": "string",
            "description": _l(
                "Start time of the data to retrieve, supports Jinja templating for dynamic. Format: %Y-%m-%d %H:%M:%S"
            ),
            "title": _l("Start Time"),
            "default": "{{ data_interval_start }}",
        }
        properties["end_time"] = {
            "type": "string",
            "description": _l(
                "End time of the data to retrieve, supports Jinja templating for dynamic. Format: %Y-%m-%d %H:%M:%S"
            ),
            "title": _l("End Time"),
            "default": "{{ data_interval_end }}",
        }
        properties["fields"] = {
            "type": "string",
            "title": _l("Fields"),
            "description": _l("Comma-separated list of fields to retrieve. Leave empty to get all fields."),
        }
        # Deep copy so the shared TRANSFORM definition is not aliased into this schema.
        properties["transform"] = copy.deepcopy(utils.TRANSFORM)

        required = ["data_source_name", "project", "logstore", "start_time", "end_time"]
        return {
            "type": "object",
            "properties": properties,
            "required": required,
        }
@@ -0,0 +1,292 @@
1
+ import inspect
2
+ import json
3
+ import logging
4
+ import os
5
+
6
+ import jsonschema
7
+
8
+ from recurvedata.pigeon.handler.csv_handler import CSVFileHandler
9
+ from recurvedata.pigeon.utils import ensure_str_list, fs
10
+
11
+ try:
12
+ import numpy as np
13
+ import pandas as pd
14
+ except ImportError:
15
+ pass
16
+
17
+ from recurvedata.core.translation import _l
18
+ from recurvedata.operators.transfer_operator.task import DumpTask
19
+ from recurvedata.operators.utils import infer_schema_from_dataframe, parse_to_date
20
+ from recurvedata.utils.attrdict import AttrDict
21
+
22
# Module-level logger used by the sheet dump task and its helpers.
logger = logging.getLogger(__name__)

# Source code offered as the default value of the "transform_func" option:
# an identity transform that returns the DataFrame unchanged.
_transform_default_value = (
    "import pandas as pd\n"
    "\n"
    "\n"
    "def transform(df: pd.DataFrame) -> pd.DataFrame:\n"
    "    return df\n"
)
30
+
31
+
32
class SheetDumpTaskBase(DumpTask):
    """Shared pipeline for dump tasks that read tabular data into a pandas DataFrame.

    Subclasses provide :meth:`read_origin_df` and may extend the schema through
    ``custom_config_schema_properties`` / ``custom_config_schema_required``.
    :meth:`execute_impl` then applies the built-in transforms, the validations
    and the optional user-supplied transform before writing the rows out.
    """

    # NOTE(review): presumably keeps this base class out of the task registry;
    # confirm against DumpTask.
    _AUTO_REGISTER = False

    # Schema properties offered by every subclass unless it overrides the key.
    common_config_schema_properties = {
        "extra_read_kwargs": {
            "type": "string",
            "title": _l("Additional Read Parameters"),
            "description": _l(
                "Additional parameters to pass to pandas read_csv or read_excel functions in JSON format"
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "type_mapping": {
            "type": "string",
            "title": _l("Column Type Mapping"),
            "description": _l(
                'Specify data types for columns using format {"column_name": "data_type"}. '
                "This mapping is passed to DataFrame.astype() - see "
                '<a target="_blank" href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html">'
                "pandas documentation</a> for supported types."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "date_columns": {
            "type": "string",
            "title": _l("Date Format Columns"),
            "description": _l("Comma-separated list of column names to parse as dates"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "fillna_to_null": {
            "type": "boolean",
            "title": _l("Convert NaN to NULL"),
            "default": True,
        },
        "order_by": {
            "type": "string",
            "title": _l("Sort Order"),
            "description": _l(
                "Comma-separated list of columns to sort rows by. Original order is preserved if not specified."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "column_name_mapping": {
            "type": "string",
            "title": _l("Rename Columns"),
            "description": _l('Map old column names to new names using format {"old_name": "new_name"}'),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "json",
            },
        },
        "result_columns": {
            "type": "string",
            "title": _l("Output Columns"),
            "description": _l(
                "Comma-separated list of columns to include in output and their order. All columns included if not specified."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "primary_keys": {
            "type": "string",
            "title": _l("Unique Key Columns"),
            "description": _l(
                "Comma-separated list of columns that should contain unique values. "
                "Task will fail if duplicates are found. Leave empty to skip uniqueness check."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "not_nullable_columns": {
            "type": "string",
            "title": _l("Required Columns"),
            "description": _l(
                "Comma-separated list of columns that must not contain NULL values. "
                "Task will fail if NULL values are found in these columns."
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "transform_func": {
            "type": "string",
            "title": _l("Custom Transform"),
            "description": _l(
                "Optional Python function to transform the DataFrame. Must accept and return a pandas DataFrame. "
                "This transformation is applied after all other processing steps."
            ),
            "default": _transform_default_value,
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "code",
                "lang": "python",
            },
        },
    }

    # Subclass-specific properties; they are listed first and win over the
    # common ones on a key clash.
    custom_config_schema_properties = {}

    # Names of the properties a subclass marks as required.
    custom_config_schema_required = []

    @classmethod
    def config_schema(cls):
        """Build the configuration schema for this task class.

        Custom properties come first; a common property is only added when the
        subclass has not already defined a property under the same key.

        Returns:
            A new dict with ``type``, ``properties`` and ``required`` entries.
        """
        schema = {
            "type": "object",
            "properties": dict(cls.custom_config_schema_properties),
            # Copy so that callers mutating the returned schema cannot alter
            # the class-level list shared by every call.
            "required": list(cls.custom_config_schema_required),
        }
        for key, spec in cls.common_config_schema_properties.items():
            schema["properties"].setdefault(key, spec)
        return schema

    def execute_impl(self, *args, **kwargs):
        """Read the source DataFrame, process it and write it out.

        Processing order: built-in transforms, validations, then the custom
        transform function.
        """
        conf = self.rendered_config
        df = self.read_origin_df()

        df = self.apply_builtin_transform(conf, df)
        df = self.apply_validations(conf, df)
        df = self.apply_custom_transform_func(conf, df)

        self.df_to_csv(df)

    def read_origin_df(self) -> "pd.DataFrame":
        """Load the raw data as a DataFrame; must be implemented by subclasses."""
        raise NotImplementedError

    def df_to_csv(self, df):
        """Write ``df`` row by row through the handler and save its inferred schema.

        If the handler wrote to a file other than ``self.filename`` and that
        file exists, it is renamed to ``self.filename``. The schema inferred
        from ``df`` is dumped to ``fs.schema_filename(self.filename)``.
        """
        logger.info(f"result DataFrame shape {df.shape}, dtypes:\n{df.dtypes}")
        logger.info(df.head())

        handler: CSVFileHandler = self.create_handler_factory().create_handler()
        for row in df.itertuples(index=False):
            handler.handle(row)
        handler.close()
        if handler.filename != self.filename and os.path.exists(handler.filename):
            os.rename(handler.filename, self.filename)
        logger.info(f"exported {len(df)} rows into {self.filename}")

        schema = infer_schema_from_dataframe(df)
        schema_filename = fs.schema_filename(self.filename)
        schema.dump(schema_filename)
        logger.info(f"saving schema to {schema_filename}")

    @staticmethod
    def apply_builtin_transform(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Apply the configured built-in transforms to ``df``.

        Each step runs only when its option is set, in this order: dtype
        conversion, date parsing, NaN-to-None replacement, sorting, column
        renaming and output column selection.

        Args:
            conf: Rendered task configuration.
            df: DataFrame to transform.

        Returns:
            The transformed DataFrame.
        """
        logger.info("apply_builtin_transform...")
        if conf.type_mapping:
            logger.info(f" * convert dtypes with {conf.type_mapping}")
            df = df.astype(json.loads(conf.type_mapping))

        if conf.date_columns:
            cols = ensure_str_list(conf.date_columns)
            logger.info(f" * parse {cols} to date")
            for col in cols:
                df[col] = df[col].map(parse_to_date)

        if conf.fillna_to_null:
            logger.info(" * fillna with None")
            df = df.fillna(np.nan).replace([np.nan], [None])

        if conf.order_by:
            cols = ensure_str_list(conf.order_by)
            # Fixed: the message was a plain string with an unclosed brace,
            # so the sort columns were never interpolated.
            logger.info(f" * sort by {cols}")
            df = df.sort_values(by=cols)

        if conf.column_name_mapping:
            logger.info(f" * apply column name mapping {conf.column_name_mapping}")
            df = df.rename(json.loads(conf.column_name_mapping), axis=1)

        if conf.result_columns:
            cols = ensure_str_list(conf.result_columns)
            logger.info(f" * change result columns with {cols}")
            df = df[cols]

        return df

    @staticmethod
    def apply_validations(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Check uniqueness and non-null constraints configured for ``df``.

        Args:
            conf: Rendered task configuration.
            df: DataFrame to validate.

        Returns:
            ``df`` unchanged when every check passes.

        Raises:
            ValueError: If rows are duplicated on ``primary_keys`` or any of
                ``not_nullable_columns`` contains null values.
        """
        logger.info("apply_validations...")
        if conf.primary_keys:
            logger.info(" * checking duplication...")
            duplicate = df[df.duplicated(subset=ensure_str_list(conf.primary_keys))]
            if not duplicate.empty:
                logger.error(f"duplicate rows: {duplicate}")
                raise ValueError("duplication detected")

        if conf.not_nullable_columns:
            cols = ensure_str_list(conf.not_nullable_columns)
            logger.info(f" * checking null to columns {cols}...")
            null_cols = [col for col in cols if df[col].isnull().values.any()]
            if null_cols:
                logger.error(f"{null_cols} contains null values")
                raise ValueError(f"{null_cols} contains null values")

        return df

    @staticmethod
    def apply_custom_transform_func(conf: AttrDict, df: "pd.DataFrame") -> "pd.DataFrame":
        """Run the user-supplied ``transform`` function on ``df``, if any.

        Args:
            conf: Rendered task configuration; ``transform_func`` holds the source.
            df: DataFrame to transform.

        Returns:
            ``df`` unchanged when no transform is configured or the source does
            not define ``transform``; otherwise the function's result.

        Raises:
            ValueError: If the function returns something other than a DataFrame.
        """
        if not conf.transform_func:
            return df
        func = validate_transform(conf.transform_func)
        if not func:
            return df

        logger.info("apply transform function...")
        df = func(df)
        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"transform function must return an Pandas DataFrame object, got {type(df)} instead")
        return df

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` and, when present, its transform source.

        Runs the parent validation first, then passes a non-empty
        ``transform_func`` to ``validate_transform``, which executes the code.

        Returns:
            The configuration returned by the parent ``validate``.
        """
        conf = super().validate(configuration)

        # ``or ""`` also covers a key that is present with value None, which
        # the ``.get`` default alone would not handle.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if transform_func_code:
            validate_transform(transform_func_code)
        return conf
274
+
275
+
276
def validate_transform(raw_code):
    """Execute ``raw_code`` and return the ``transform`` callable it defines.

    NOTE: the source is compiled and run with ``exec`` in a fresh namespace,
    so any module-level statements in it are executed as a side effect.

    Args:
        raw_code: Python source expected to define ``transform(df)``.

    Returns:
        The ``transform`` object, or ``None`` when the name is missing or falsy.

    Raises:
        jsonschema.ValidationError: If ``transform`` is not callable or its
            parameters are not exactly ``(df,)``.
    """
    compiled = compile(raw_code, "", "exec")
    namespace = {}
    exec(compiled, namespace)

    candidate = namespace.get("transform")
    if not candidate:
        return None

    if not callable(candidate):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    # The function must take a single parameter named exactly "df".
    param_names = tuple(inspect.signature(candidate).parameters)
    if param_names != ("df",):
        raise jsonschema.ValidationError(
            message="transform must accept and only accept df as parameter", path=("transform_func",)
        )
    return candidate