recurvedata_lib-0.1.487-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0

recurvedata/operators/transfer_operator/operator.py
@@ -0,0 +1,231 @@
+ import datetime
+ import logging
+ import typing
+
+ try:
+     from recurvedata.pigeon.utils.fs import new_stagefile_factory
+ except ImportError:
+     pass
+
+ from recurvedata.operators.config import CONF
+ from recurvedata.operators.models import DagBase, NodeBase
+ from recurvedata.operators.operator import BaseOperator
+ from recurvedata.operators.transfer_operator.task import get_dump_classes, get_load_classes, get_task_class
+ from recurvedata.operators.ui import format_config_schema
+ from recurvedata.utils import md5hash
+
+ if typing.TYPE_CHECKING:
+     from recurvedata.operators.transfer_operator.task import DumpTask, LoadTask
+
+ logger = logging.getLogger(__name__)
+
+
+ class TransferOperator(BaseOperator):
+     """
+     Operator that handles data transfer operations between dump and load stages.
+     Manages the execution of dump and load tasks with appropriate configurations.
+     """
+
+     stages = ("dump", "load")
+
+     def __init__(self, dag: DagBase, node: NodeBase, execution_date: datetime.datetime, variables: dict = None) -> None:
+         self.dump_task: "DumpTask" = None
+         self.load_task: "LoadTask" = None
+         self.filename: str = self._determine_filename(dag, node, execution_date)
+         # self.execution_date = as_local_datetime(execution_date)
+
+         super().__init__(dag, node, execution_date, variables)
+
+     def init_task(self):
+         params = {
+             "dag": self.dag,
+             "node": self.node,
+             "execution_date": self.execution_date,
+             "filename": self.filename,
+             "variables": self.variables,
+         }
+
+         load_config = self.node.configuration["load"]
+         load_cls = self.get_task_class(load_config["name"])
+         logger.debug(f"create load task with {params}")
+
+         self.load_task: LoadTask = load_cls(config=load_config["config"], **params)
+
+         # TODO: ideally drop this option; dump and load should share one unified CSV format and let the loader handle it itself.
+         handler_options = {
+             "encoding": None,
+             "write_header": self._determine_write_header(),
+         }
+         if self.load_task.default_dumper_handler_options:
+             handler_options.update(self.load_task.default_dumper_handler_options)
+
+         dump_config = self.node.configuration["dump"]
+         dump_cls = self.get_task_class(dump_config["name"])
+         logger.debug(f"create dump task with {params}")
+         self.dump_task: DumpTask = dump_cls(config=dump_config["config"], handler_options=handler_options, **params)
+         self.load_task.dump_task_type = dump_cls.__name__
+
+     def set_execution_date(self, execution_date):
+         self.dump_task.set_execution_date(execution_date)
+         self.load_task.set_execution_date(execution_date)
+
+     def _determine_write_header(self):
+         return self.load_task.should_write_header
+
+     @staticmethod
+     def _determine_filename(dag: DagBase, node: NodeBase, execution_date: datetime.datetime) -> str:
+         """
+         Generate a unique filename for the transfer operation.
+
+         Args:
+             dag: The DAG instance
+             node: The node instance
+             execution_date: The execution datetime
+
+         Returns:
+             str: Generated filename
+         """
+         dag_id = dag.id
+         node_id = node.node_key
+         is_link_node = getattr(node, "is_link_op", False)
+         if not is_link_node:
+             hash_txt = md5hash(f"{dag_id}|{node_id}|{execution_date}")
+             prefix = f"{dag_id}_{node_id}_"
+         else:
+             origin_node = node.origin_node
+             hash_txt = md5hash(f"{dag_id}|{origin_node.node_key}|{node_id}|{execution_date}")
+             prefix = f"{dag_id}_{origin_node.node_key}_{node_id}_"
+             logger.info(f"link op _determine_filename: {prefix} {hash_txt}")
+
+         hash_len = max(8, len(hash_txt) - len(prefix))
+         return new_stagefile_factory(CONF.DATA_ROOT)(prefix + hash_txt[:hash_len])
+
+     def dump(self):
+         return self.dump_task.execute()
+
+     def load(self):
+         return self.load_task.execute()
+
+     def execute(self):
+         self.dump()
+         self.load()
+
+     @classmethod
+     def validate(cls, configuration: dict):
+         config = {
+             "dump": cls._validate_task_config(configuration["dump"]),
+             "load": cls._validate_task_config(configuration["load"]),
+         }
+         return config
+
+     @classmethod
+     def _validate_task_config(cls, config: dict):
+         task_cls = cls.get_task_class(config["name"])
+         cfg = task_cls.validate(config["config"])
+         return {"name": config["name"], "config": cfg}
+
+     @classmethod
+     def to_dict(cls) -> dict:
+         return {
+             "name": cls.name(),
+             "config_schema": {
+                 "dump": [x.to_dict() for x in get_dump_classes()],
+                 "load": [x.to_dict() for x in get_load_classes()],
+             },
+         }
+
+     @classmethod
+     def config_schema(cls):
+         return {
+             "dump": [{"name": x.name(), "config_schema": x.config_schema()} for x in cls.get_dump_classes()],
+             "load": [{"name": x.name(), "config_schema": x.config_schema()} for x in cls.get_load_classes()],
+         }
+
+     @classmethod
+     def ui_config_schema(cls):
+         return {
+             "dump": {
+                 "name": "Dump",
+                 "config_schema": [
+                     {"name": x.name(), "config_schema": format_config_schema(x.config_schema(), "dump")}
+                     for x in cls.get_dump_classes()
+                 ],
+             },
+             "load": {
+                 "name": "Load",
+                 "config_schema": [
+                     {"name": x.name(), "config_schema": format_config_schema(x.config_schema(), "load")}
+                     for x in cls.get_load_classes()
+                 ],
+             },
+         }
+
+     @classmethod
+     def ui_validate(cls, configuration: dict) -> dict:
+         res = {
+             "dump": cls._add_schema_name_to_json_schema_error("dump", cls._validate_task_config, configuration["dump"]),
+             "load": cls._add_schema_name_to_json_schema_error("load", cls._validate_task_config, configuration["load"]),
+         }
+         return res
+
+     @classmethod
+     def ui_config_to_config(cls, configuration: dict) -> dict:
+         return {
+             "dump": configuration["dump"],
+             "load": configuration["load"],
+         }
+
+     @classmethod
+     def get_ds_name_field_values(cls, rendered_config: dict) -> list[str]:
+         config = cls.ui_config_to_config(rendered_config)
+         res = []
+         dump_cls = cls.get_task_class(config["dump"]["name"])
+         if dump_cls:
+             res.extend(dump_cls.get_ds_name_field_values(config["dump"]["config"]))
+         load_cls = cls.get_task_class(config["load"]["name"])
+         if load_cls:
+             res.extend(load_cls.get_ds_name_field_values(config["load"]["config"]))
+         return res
+
+     @classmethod
+     def get_task_class(cls, name: str):
+         return get_task_class(name)
+
+     @classmethod
+     def get_dump_classes(cls, check_enabled=True):
+         res_lst = get_dump_classes()
+         if check_enabled:
+             res_lst = [dump_cls for dump_cls in res_lst if dump_cls.enabled]
+         return res_lst
+
+     @classmethod
+     def get_load_classes(cls, check_enabled=True):
+         res_lst = get_load_classes()
+         if check_enabled:
+             res_lst = [load_cls for load_cls in res_lst if load_cls.enabled]
+         return res_lst
+
+     @classmethod
+     def get_setup_install_require(cls) -> dict:
+         require_dct = {}
+         op_name = cls.name()
+         op_web_requires = cls.web_install_require[:]
+         op_worker_requires = cls.worker_install_require[:]
+         for dump_cls in cls.get_dump_classes():
+             if dump_cls.web_install_require:
+                 require_dct[f"web.{op_name}.dump.{dump_cls.name()}"] = dump_cls.web_install_require
+                 op_web_requires.extend(dump_cls.web_install_require)
+             if dump_cls.worker_install_require:
+                 require_dct[f"worker.{op_name}.dump.{dump_cls.name()}"] = dump_cls.worker_install_require
+                 op_worker_requires.extend(dump_cls.worker_install_require)
+
+         for load_cls in cls.get_load_classes():
+             if load_cls.web_install_require:
+                 require_dct[f"web.{op_name}.load.{load_cls.name()}"] = load_cls.web_install_require
+                 op_web_requires.extend(load_cls.web_install_require)
+             if load_cls.worker_install_require:
+                 require_dct[f"worker.{op_name}.load.{load_cls.name()}"] = load_cls.worker_install_require
+                 op_worker_requires.extend(load_cls.worker_install_require)
+         require_dct["web"] = sorted(list(set(op_web_requires)))
+         require_dct["worker"] = sorted(list(set(op_worker_requires)))
+         return require_dct
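
The hunk above wires one dump task and one load task together from the node configuration. As a rough sketch (not taken from this release; the task names and config fields below are hypothetical), the configuration consumed by TransferOperator.validate() and init_task() has this shape:

# Hypothetical node configuration: each half names a registered task class plus its own config dict.
node_configuration = {
    "dump": {
        "name": "DumpDBAPITask",   # illustrative registry name, not verified against this release
        "config": {"data_source_name": "orders_mysql", "query": "SELECT * FROM orders"},
    },
    "load": {
        "name": "LoadMySQLTask",   # illustrative registry name, not verified against this release
        "config": {"data_source_name": "warehouse_mysql", "mode": "OVERWRITE"},
    },
}

# validate() checks each half against the task class it names:
#   TransferOperator.validate(node_configuration)
# At run time the operator builds both tasks and runs dump then load:
#   op = TransferOperator(dag, node, execution_date, variables={})
#   op.init_task()
#   op.execute()
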
recurvedata/operators/transfer_operator/task.py
@@ -0,0 +1,223 @@
+ import json
+ import logging
+ import os
+ import shutil
+ import tempfile
+ import traceback
+
+ import jsonschema
+
+ from recurvedata.core.transformer import Transformer
+ from recurvedata.operators.task import BaseTask
+ from recurvedata.utils.attrdict import AttrDict
+ from recurvedata.utils.date_time import round_time_resolution
+ from recurvedata.utils.helpers import first
+ from recurvedata.utils.registry import Registry
+
+ try:
+     from recurvedata.pigeon.handler.csv_handler import create_csv_file_handler_factory
+     from recurvedata.pigeon.utils import fs, trim_suffix
+ except ImportError:
+     pass
+
+ from recurvedata.operators.transfer_operator import utils
+
+ logger = logging.getLogger(__name__)
+ _registry = Registry(key_callback=lambda x: x.name())
+ _load_task_registry = Registry(key_callback=lambda x: x.ds_types)
+
+
+ class Task(BaseTask):
+     worker_install_require = []
+     web_install_require = []
+
+     def __init__(self, dag, node, execution_date, variables, config, filename):
+         super().__init__(dag, node, execution_date, variables)
+
+         self.config = AttrDict(config)
+         self.filename = filename
+
+     @classmethod
+     def type(cls):
+         return None
+
+     @staticmethod
+     def first_or_default(dss, default=""):
+         return first(dss, default)
+
+
+ class DumpTask(Task):
+     _AUTO_REGISTER = True
+     _MAX_ERROR_RATE = 0
+     no_template_fields = (
+         "data_source_name",
+         "filter_engine",
+     )
+
+     @classmethod
+     def type(cls):
+         return "dump"
+
+     @property
+     def stage(self) -> str:
+         return "dump"
+
+     def __init_subclass__(cls, **kwargs):
+         if cls._AUTO_REGISTER:
+             _registry.add(cls)
+
+     def __init__(self, handler_options=None, *args, **kwargs):
+         self.handler_options = handler_options or {}
+         super().__init__(*args, **kwargs)
+
+     @classmethod
+     def validate(cls, configuration):
+         config = super().validate(configuration)
+
+         transformer_code = configuration.get("transform", "").strip()
+         if not transformer_code:
+             return config
+         try:
+             utils.validate_transform(transformer_code)
+         except (ValueError, TypeError) as e:
+             raise jsonschema.ValidationError(message=str(e), path=("transform",))
+         except Exception:
+             tb = traceback.format_exc(limit=0)
+             msg = "\n".join(tb.splitlines()[1:])
+             raise jsonschema.ValidationError(message=msg, path=("transform",))
+
+         if "custom_handler_options" in config:
+             try:
+                 value = json.loads(config["custom_handler_options"])
+             except Exception:
+                 raise jsonschema.ValidationError(
+                     message="custom_handler_options should be valid JSON", path=("custom_handler_options",)
+                 )
+             if not isinstance(value, dict):
+                 raise jsonschema.ValidationError(
+                     message="custom_handler_options should be a dict", path=("custom_handler_options",)
+                 )
+         return config
+
+     def create_handler_factory(self):
+         self.remove_intermediate_files()
+         transformer = self.create_transformer()
+         kwargs = self.handler_options.copy()
+         encoding = self.rendered_config.get("middle_file_encoding")
+         kwargs.update(
+             {
+                 "filename": self.filename,
+                 "encoding": encoding,
+                 "transformer": transformer,
+                 "max_error_rate": self._MAX_ERROR_RATE,
+             }
+         )
+         # FIXME: ugly way to get more handler options from the Transformer definition
+         kwargs.update(getattr(transformer, "handler_options", {}))
+
+         # allow the user to override the default handler options
+         if self.rendered_config.custom_handler_options:
+             custom_handler_options = json.loads(self.rendered_config.custom_handler_options)
+             kwargs.update(custom_handler_options)
+         hf = create_csv_file_handler_factory(**kwargs)
+         return hf
+
+     def create_transformer(self) -> Transformer:
+         transformer_code = self.rendered_config.get("transform", "").strip()
+         if transformer_code:
+             transformer = utils.validate_transform(transformer_code)
+         else:
+             transformer = None
+         return transformer
+
+     def has_custom_transformer(self):
+         return self.rendered_config.get("transform")
+
+     def get_schedule_time_range(self):
+         end_date = self.execution_date
+         start_date = self.dag.previous_schedule(self.execution_date)
+         if self.config.get("time_auto_round", False):
+             start_date = round_time_resolution(start_date, self.dag.schedule_interval)
+             end_date = round_time_resolution(end_date, self.dag.schedule_interval)
+         return start_date, end_date
+
+     def remove_intermediate_files(self):
+         pattern = f"{self.filename}.*"
+         logger.info(f"remove intermediate files {pattern}")
+         fs.remove_files_by_pattern(pattern)
+
+     def on_execute_impl_error(self, exc: Exception):
+         logger.exception(f"caught error: {exc}")
+         self.remove_intermediate_files()
+
+
+ class LoadTask(Task):
+     ds_types = ()
+     should_write_header = False
+     default_dumper_handler_options = {}
+     dump_task_type = None
+
+     def __init_subclass__(cls, **kwargs):
+         _registry.add(cls)
+         _load_task_registry.add(cls)
+
+     @classmethod
+     def type(cls):
+         return "load"
+
+     @property
+     def stage(self) -> str:
+         return "load"
+
+     @staticmethod
+     def compress_file(filename, target_filename=None, compress_mode="None"):
+         """Compress the file before loading; only Gzip, Zip and Bzip2 are supported."""
+         if compress_mode == "None":
+             return filename, None
+         if compress_mode not in ("Gzip", "Zip", "Bzip2"):
+             raise ValueError(f"{compress_mode} is not supported")
+
+         logger.info(f"Compressing file using {compress_mode}")
+         compress_method, ext = {
+             "Gzip": (fs.gzip_compress, ".gz"),
+             "Zip": (fs.zip_compress, ".zip"),
+             "Bzip2": (fs.bzip2_compress, ".bz2"),
+         }[compress_mode]
+
+         # If a filename inside the archive is given, temporarily rename the file to that name, compress it, then rename it back.
+         if target_filename:
+             inner_filename = trim_suffix(os.path.basename(target_filename), ext)
+             tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(filename))
+             file_to_compress = os.path.join(tmp_dir, inner_filename)
+             os.rename(filename, file_to_compress)
+         else:
+             target_filename = f"{filename}{ext}"
+             file_to_compress = filename
+
+         try:
+             compressed_file = compress_method(file_to_compress, target_filename=target_filename, using_cmd=True)
+         except BaseException as e:
+             raise e
+         finally:
+             # Undo the temporary rename, which also rolls back if an exception occurred
+             if file_to_compress != filename:
+                 os.rename(file_to_compress, filename)
+                 shutil.rmtree(os.path.dirname(file_to_compress))
+         return compressed_file, ext
+
+
+ def get_task_class(name):
+     return _registry[name]
+
+
+ def get_dump_classes():
+     return sorted([x for x in _registry.values() if x.type() == "dump"], key=lambda x: x.name())
+
+
+ def get_load_classes():
+     return sorted([x for x in _registry.values() if x.type() == "load"], key=lambda x: x.name())
+
+
+ def get_load_by_ds_type(ds_type):
+     klass = _load_task_registry.get(ds_type)
+     return klass.name()
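
A minimal sketch of how the registries above are meant to be used (not part of the package): defining a DumpTask subclass triggers __init_subclass__, which registers the class under its name() so get_task_class() and get_dump_classes() can discover it.

from recurvedata.operators.transfer_operator.task import DumpTask, get_task_class


class DumpNothingTask(DumpTask):
    """Illustrative dump task that produces no rows (hypothetical, for demonstration only)."""

    def execute(self):  # assumed execution hook; the real base class may expect an execute_impl override instead
        return None


# The subclass is now discoverable through the module-level registry:
assert get_task_class(DumpNothingTask.name()) is DumpNothingTask
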
recurvedata/operators/transfer_operator/utils.py
@@ -0,0 +1,134 @@
+ import importlib.util
+ import sys
+
+ from recurvedata.core.transformer import Transformer
+ from recurvedata.core.translation import _l
+ from recurvedata.operators.transfer_operator import const
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+ _TRANSFORM_SKELETON = """\
+ from recurvedata.core.transformer import Transformer
+
+
+ class MyTransformer(Transformer):
+     def transform_impl(self, row, *args, **kwargs):
+         # The row is an OrderedDict. Write your custom transformation logic here.
+         return row
+
+
+ # Instantiate the transformer, the name must be `transformer`
+ transformer = MyTransformer()
+ """
+
+
+ TRANSFORM = {
+     "type": "string",
+     "title": _l("Custom Transformation"),
+     "description": _l(
+         "Python code to transform data during transfer. Must implement a Transformer class with "
+         "transform_impl method that processes each row. See example code below."
+     ),
+     "default": _TRANSFORM_SKELETON,
+     "ui:field": "CodeEditorWithReferencesField",
+     "ui:options": {
+         "type": "code",
+         "lang": "python",
+     },
+ }
+
+ LOAD_COMMON = {
+     "mode": {
+         "type": "string",
+         "title": _l("Load Mode"),
+         "description": _l("How to handle existing data in the target table"),
+         "enum": list(allowed_modes),
+         "enumNames": list(allowed_modes),
+         "default": const.LOAD_OVERWRITE,
+     },
+     "primary_keys": {
+         "type": "string",
+         "title": _l("Primary Keys"),
+         "description": _l(
+             "Comma-separated list of columns used for deduplication in MERGE mode. "
+             "Should be primary or unique key columns."
+         ),
+         "ui:field": "CodeEditorWithReferencesField",
+         "ui:options": {
+             "type": "plain",
+         },
+         "ui:hidden": '{{parentFormData.mode !== "MERGE"}}',
+     },
+     "dedup": {
+         "type": "boolean",
+         "title": _l("Enable Deduplication"),
+         "default": False,
+         "description": _l("Remove duplicate rows from the data before loading"),
+         "ui:widget": "BaseCheckbox",
+         "ui:options": {
+             "label": _l("Enable Deduplication"),
+         },
+     },
+     "dedup_uniq_keys": {
+         "type": "string",
+         "title": _l("Deduplication Keys"),
+         "description": _l("Comma-separated list of columns that uniquely identify each row"),
+         "ui:field": "CodeEditorWithReferencesField",
+         "ui:options": {
+             "type": "plain",
+         },
+         "ui:hidden": "{{!parentFormData.dedup}}",
+     },
+     "dedup_orderby": {
+         "type": "string",
+         "title": _l("Sort Order"),
+         "description": _l("Comma-separated list of columns to sort by before deduplication"),
+         "ui:field": "CodeEditorWithReferencesField",
+         "ui:options": {
+             "type": "plain",
+         },
+         "ui:hidden": "{{!parentFormData.dedup}}",
+     },
+     # "pre_queries": {
+     #     "type": "string",
+     #     "title": "Queries Ran Before Loading",
+     #     "description": 'SQL to run before the new data is loaded; separate multiple statements with `;`. Variables are supported, see the <a target="_blank" href="https://bit.ly/2JMutjn">docs</a>.',
+     #     "ui:field": "CodeEditorWithReferencesField",
+     #     "ui:options": {
+     #         "type": "code",
+     #         "lang": "sql",
+     #         "sqlLang": "sql",
+     #     },
+     # },
+     # "post_queries": {
+     #     "type": "string",
+     #     "title": "Queries Ran After Loading",
+     #     "description": 'SQL to run after the new data is loaded; separate multiple statements with `;`. Variables are supported, see the <a target="_blank" href="https://bit.ly/2JMutjn">docs</a>.',
+     #     "ui:field": "CodeEditorWithReferencesField",
+     #     "ui:options": {
+     #         "type": "code",
+     #         "lang": "sql",
+     #         "sqlLang": "sql",
+     #     },
+     # },
+ }
+
+ __spec = importlib.util.spec_from_loader("recurve_hack", None)
+ __recurve_hack = importlib.util.module_from_spec(__spec)
+ sys.modules["recurve_hack"] = __recurve_hack
+
+
+ def validate_transform(raw_code):
+     from recurvedata.pigeon.transformer import Transformer as PigeonTransformer
+
+     code = compile(raw_code, "", "exec")
+     exec(code, __recurve_hack.__dict__)
+     transformer = __recurve_hack.__dict__.get("transformer")
+     if not transformer:
+         raise ValueError("transformer is required")
+     if (
+         not isinstance(transformer, (Transformer, PigeonTransformer))
+         and transformer.__class__.__name__ != "MyTransformer"
+     ):
+         raise TypeError(f"transformer should be an instance of pigeon.transformer.Transformer, got {type(transformer)}")
+     return transformer
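
To show what validate_transform() above accepts, here is a sketch (the column names in the row are invented): a script that defines a module-level `transformer` object, typically derived from the _TRANSFORM_SKELETON shown earlier.

from recurvedata.operators.transfer_operator.utils import validate_transform

user_code = '''
from recurvedata.core.transformer import Transformer


class MyTransformer(Transformer):
    def transform_impl(self, row, *args, **kwargs):
        # `row` is an OrderedDict; "amount" / "amount_cents" are illustrative column names.
        row["amount_cents"] = int(float(row["amount"]) * 100)
        return row


transformer = MyTransformer()
'''

# Returns the transformer instance, or raises ValueError / TypeError for invalid scripts.
transformer = validate_transform(user_code)
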
recurvedata/operators/ui.py
@@ -0,0 +1,80 @@
+ def format_config_schema(config_schema: dict, schema_name: str):
+     """
+     Reformat config_schema into the structure the frontend expects.
+     """
+     if "properties" not in config_schema:
+         return config_schema
+     for field_name, field_dct in config_schema["properties"].items():
+         if field_dct["type"] == "object":
+             format_config_schema(field_dct, field_name)
+         else:
+             format_field_schema(field_dct, schema_name)
+     return config_schema
+
+
+ def format_field_schema(field_dct: dict, schema_name: str):
+     """
+     Example field_dct:
+     {
+         'type': 'string',
+         'title': 'Data Source',
+         "ui:field": "ProjectConnectionSelectorField",
+         "ui:options": {
+             "supportTypes": ["mysql", "postgres",],
+         },
+     },
+     """
+     _add_option_id(field_dct)
+     _format_input_with_variable(field_dct, schema_name)
+     _format_aliases_select_field(field_dct)
+
+
+ def _add_option_id(field_dct: dict):
+     ui_field = field_dct.get("ui:field")
+     if ui_field == "CodeEditorWithReferencesField":
+         return
+     if "ui:options" not in field_dct:
+         field_dct["ui:options"] = {}
+     if "id" in field_dct["ui:options"]:
+         return
+     field_dct["ui:options"]["id"] = ""
+
+
+ def _format_input_with_variable(field_dct: dict, schema_name: str):
+     ui_field = field_dct.get("ui:field")
+     if ui_field != "CodeEditorWithReferencesField":
+         return
+     ui_options: dict = field_dct.get("ui:options")
+     if not ui_options:
+         return
+     ui_type = ui_options.get("type")
+     if ui_type != "code":
+         return
+     # Options for the full-screen (expanded) editor
+     if "toParent" in ui_options:
+         return
+     ui_options["toParent"] = ".expanded_code_position"
+     ui_options["parentName"] = schema_name
+     ui_options["needExpandBtn"] = True
+
+
+ def _format_aliases_select_field(field_dct: dict):
+     """
+     For ProjectConnectionSelectorField, the frontend needs connection ui_type values, so map the backend types in supportTypes accordingly.
+     """
+     from recurvedata.connectors import get_connection_ui_type
+
+     ui_field = field_dct.get("ui:field")
+     if ui_field != "ProjectConnectionSelectorField":
+         return
+     ui_options: dict = field_dct.get("ui:options")
+     if not ui_options:
+         return
+
+     support_types = ui_options.get("supportTypes")
+     if not support_types:
+         return
+
+     ui_options["supportTypes"] = [
+         ui_type for ui_type in [get_connection_ui_type(backend_type) for backend_type in support_types] if ui_type
+     ]