recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,285 @@
1
+ import csv
2
+ import inspect
3
+ import logging
4
+ import os
5
+ import shutil
6
+ from copy import deepcopy
7
+
8
+ import jsonschema
9
+
10
+ from recurvedata.core.translation import _l
11
+ from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
12
+ from recurvedata.operators.transfer_operator.task import DumpTask
13
+ from recurvedata.operators.utils import file_factory as ff
14
+ from recurvedata.utils import unescape_backslash
15
+ from recurvedata.utils.files import merge_files
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class AliyunOSSDumpTask(DumpTask):
    """Dump task that downloads objects from Aliyun OSS and merges them into one local file.

    The task lists the keys under the configured ``object_path``, downloads each
    of them into a scratch directory, optionally decompresses / converts every
    file to CSV, merges the results into ``self.filename`` and finally runs the
    optional user-supplied ``transform`` function on the merged file.

    ``self.filename``, ``self.rendered_config`` and
    ``self.must_get_connection_by_name`` are provided by ``DumpTask`` (defined
    outside this file).
    """

    # Config keys that hold a data source name.
    # NOTE(review): presumably consumed by the DumpTask base class -- confirm there.
    ds_name_fields = ("data_source_name",)

    def execute_impl(self, *args, **kwargs):
        """Download, convert and merge the configured OSS objects into ``self.filename``.

        A scratch directory named ``<self.filename>_dir`` is (re)created for the
        downloads and is always removed again, even when a step fails.

        Returns:
            None
        """
        tmp_dirname = f"{self.filename}_dir"
        if os.path.exists(tmp_dirname):
            shutil.rmtree(tmp_dirname)
        os.makedirs(tmp_dirname)

        try:
            conf = self.rendered_config.copy()
            ds = self.must_get_connection_by_name(conf["data_source_name"])

            object_path = conf["object_path"]
            # A trailing "/" means "list this folder"; otherwise treat the path as a key prefix.
            delimiter = "/" if object_path.endswith("/") else ""
            keys = self.get_keys(ds, prefix=object_path, delimiter=delimiter)
            logger.info("[start] downloading keys:%s from oss", keys)

            local_files = []
            for key in keys:
                ds.connector.download(key, folder=tmp_dirname)
                # NOTE(review): assumes the connector stores the object under its basename;
                # keys from different prefixes sharing a basename would map to the same path.
                local_files.append(os.path.join(tmp_dirname, os.path.basename(key)))
            logger.info("[finish] downloading files to %s", local_files)

            filename = self.process_file(conf=conf, files=local_files)
            filename = self.transform_file(conf, filename)

            if filename != self.filename:
                logger.info("renaming %s to %s", filename, self.filename)
                # shutil.move falls back to copy+delete when the transform wrote its
                # result on another filesystem, where a plain os.rename would fail.
                shutil.move(filename, self.filename)
        finally:
            # Never leave the scratch directory behind, whatever happened above.
            shutil.rmtree(tmp_dirname)
        return None

    @staticmethod
    def get_keys(ds, prefix, delimiter=""):
        """Return the object keys the data source's connector lists for ``prefix``."""
        return ds.connector.get_keys(prefix=prefix, delimiter=delimiter)

    @staticmethod
    def _infer_compression(filename: str, default_compression: str) -> str:
        """Infer the compression method from the file extension.

        ``.gz`` maps to ``"Gzip"`` and ``.zip`` to ``"Zip"`` (case-insensitive);
        any other extension yields ``default_compression``.
        """
        ext = os.path.splitext(filename)[1].lower()
        compression_map = {".gz": "Gzip", ".zip": "Zip"}
        return compression_map.get(ext, default_compression)

    @staticmethod
    def _infer_file_format(filename: str, default_format: str) -> str:
        """Infer the input format from the file extension.

        ``.xlsx`` / ``.xls`` map to ``"Excel"`` (case-insensitive); any other
        extension yields ``default_format``. Only the last extension is
        inspected, so a name such as ``report.xlsx.gz`` falls back to the default.
        """
        ext = os.path.splitext(filename)[1].lower()
        format_map = {
            ".xlsx": "Excel",
            ".xls": "Excel",
        }
        return format_map.get(ext, default_format)

    def process_file(self, conf, files):
        """Normalise every downloaded file in place and merge them into ``self.filename``.

        Args:
            conf: rendered task configuration; ``decompress`` and ``file_format``
                are read for every file, ``encoding`` and the ``csv_*`` keys only
                for the formats that need them.
            files: local paths of the downloaded files.

        Returns:
            ``self.filename``. The merge is skipped when ``files`` is empty.
        """
        filename = self.filename
        skip_head_lines = conf.get("skip_head_lines", 0)

        for f in files:
            # The file extension wins over the configured value.
            compression = self._infer_compression(f, conf["decompress"])
            file_format = self._infer_file_format(f, conf["file_format"])

            if compression == "Gzip":
                logger.info("decompressing %s using gzip", f)
                ff.gzip_decompress(f, inplace=True)
            elif compression == "Zip":
                logger.info("decompressing %s using zip", f)
                ff.zip_decompress(f, inplace=True)

            if file_format == "Excel":
                logger.info("converting Excel to CSV...")
                ff.convert_excel_to_csv(f, skiprows=skip_head_lines, inplace=True)
            elif file_format == "JSONLines":
                logger.info("converting JSON lines to CSV...")
                ff.convert_jsonlines_to_csv(f, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True)
            elif file_format == "CSV":
                logger.info("converting CSV dialect and encoding if necessary...")
                dialect_options = self._get_custom_csv_options(conf)
                # The source dialect only pins delimiter / line terminator / quote char;
                # quoting, doublequote and escapechar apply to the destination only.
                src_dialect_options = {
                    name: value
                    for name, value in dialect_options.items()
                    if name not in ("quoting", "doublequote", "escapechar")
                }
                ff.convert_csv_dialect(
                    f,
                    src_dialect_options=src_dialect_options,
                    dst_dialect_options=dialect_options,
                    skiprows=skip_head_lines,
                    src_encoding=conf["encoding"],
                    inplace=True,
                )

        if files:
            merge_files(files=files, filename=filename)
        return filename

    def transform_file(self, conf, filename):
        """Run the optional user-defined ``transform`` function on ``filename``.

        Args:
            conf: rendered task configuration; ``transform_func`` holds the source code.
            filename: path of the merged file.

        Returns:
            ``filename`` unchanged when no ``transform`` function is configured,
            otherwise the path returned by the user's function.

        Raises:
            ValueError: if the user's function does not return an absolute path string.
        """
        # ``or ""`` also covers a key that is present but set to None.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if not transform_func_code:
            return filename

        func = self._validate_transform(transform_func_code)
        if not func:
            return filename

        logger.info("calling transform function with %s", (filename,))
        result_file = func(filename)
        if not (isinstance(result_file, str) and os.path.isabs(result_file)):
            # Exceptions do not %-format their arguments, so build the message explicitly.
            raise ValueError(f"transform must return an absolute filepath, got {result_file} instead")
        logger.info("got %s", result_file)
        return result_file

    @staticmethod
    def _get_custom_csv_options(conf):
        """Build the ``csv`` dialect options described by the ``csv_*`` config keys.

        Raises:
            KeyError: if a ``csv_*`` key is missing or ``csv_quoting`` is not one
                of the four supported names.
        """
        rv = {
            "delimiter": unescape_backslash(conf["csv_delimiter"]),
            "lineterminator": unescape_backslash(conf["csv_lineterminator"]),
            "quotechar": '"',
            "doublequote": False,
            "escapechar": "'",
        }
        quoting = conf["csv_quoting"]
        rv["quoting"] = {
            "QUOTE_ALL": csv.QUOTE_ALL,
            "QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
            "QUOTE_NONE": csv.QUOTE_NONE,
            "QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
        }[quoting]
        return rv

    @classmethod
    def config_schema(cls):
        """Return the JSON schema (with ``ui:*`` form hints) describing this task's configuration."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("Aliyun OSS Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "oss",
                        ],
                    },
                },
                "object_path": {
                    "type": "string",
                    "title": _l("OSS Object Path"),
                    "description": _l("Object path or prefix pattern to download. Supports Jinja templating syntax."),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "decompress": {
                    "type": "string",
                    "title": _l("Decompression Method"),
                    "description": _l("Decompress downloaded file using specified method"),
                    "enum": ["None", "Gzip", "Zip"],
                    "enumNames": ["None", "Gzip", "Zip"],
                    "default": "None",
                },
                "file_format": {
                    "type": "string",
                    "title": _l("Input Format"),
                    "description": _l("Format of the source file to be converted to CSV"),
                    "enum": ["CSV", "Excel", "JSONLines"],
                    "enumNames": ["CSV", "Excel", "JSONLines"],
                    "default": "CSV",
                },
                "skip_head_lines": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Skip Header Rows"),
                    "description": _l("Number of rows to skip from the beginning of the file"),
                    "default": 0,
                    "minimum": 0,
                },
                "encoding": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("File Encoding"),
                    "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                    "default": "utf-8",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_delimiter": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Delimiter"),
                    "description": _l("Character used to separate fields in the CSV file"),
                    "default": ",",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_lineterminator": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Line Ending"),
                    "description": _l("Character sequence used to terminate lines"),
                    "enum": [r"\n", r"\r\n"],
                    "enumNames": [r"\n", r"\r\n"],
                    "default": r"\r\n",
                },
                "csv_quoting": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Quoting"),
                    "description": _l("Strategy for quoting fields in the CSV file"),
                    "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "default": "QUOTE_MINIMAL",
                },
                "transform_func": {
                    "type": "string",
                    "title": _l("Custom Transformation"),
                    "description": _l(
                        "Python function to transform the downloaded file. Must accept a filepath argument and return "
                        "the path to the transformed file. Runs after built-in transformations."
                    ),
                    "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["data_source_name", "object_path"],
        }

    @staticmethod
    def _validate_transform(raw_code):
        """Compile and execute ``raw_code`` and return the ``transform`` function it defines.

        Returns:
            The ``transform`` object, or ``None`` when the code defines no truthy
            ``transform`` name.

        Raises:
            jsonschema.ValidationError: if ``transform`` is not callable or its
                parameter list is not exactly ``(filename)``.
            SyntaxError: propagated from ``compile`` for malformed source.
        """
        # SECURITY: this executes user-supplied source in-process. It is only safe
        # as long as task configurations are written by trusted users.
        code = compile(raw_code, "", "exec")
        ns = {}
        exec(code, ns)
        func = ns.get("transform")
        if not func:
            return None

        if not callable(func):
            raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

        sig = inspect.signature(func)
        if tuple(sig.parameters.keys()) != ("filename",):
            raise jsonschema.ValidationError(
                message="transform must accept and only accept filename as parameter", path=("transform_func",)
            )
        return func

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` via the base class, then check the custom transform code.

        Returns:
            The configuration as returned by ``super().validate``.
        """
        conf = super().validate(configuration)

        # ``or ""`` also covers a key that is present but set to None.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if transform_func_code:
            cls._validate_transform(transform_func_code)
        return conf
@@ -0,0 +1,212 @@
1
+ import copy
2
+ import inspect
3
+ import logging
4
+ import re
5
+
6
+ import jsonschema
7
+
8
+ from recurvedata.config import RECURVE_EXECUTOR_PYENV_NAME
9
+ from recurvedata.core.translation import _l
10
+ from recurvedata.operators.python_operator.operator import PythonRequirementsMixin
11
+ from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
12
+ from recurvedata.operators.transfer_operator.task import DumpTask
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ _SOURCE_SKELETON = _l(
17
+ '''
18
+ def execute(filename: str, *args, **kwargs):
19
+ """
20
+ The execute function must be implemented as the entry point for ReOrc.
21
+
22
+ Args:
23
+ filename: Required. Output data to this file in CSV format. This file will be used as input for the Loader.
24
+
25
+ Data Source Parameters:
26
+ For database configurations, use ReOrc's Data Sources instead of hardcoding credentials in the code.
27
+ When defining the execute function, use special parameter names to specify required data sources.
28
+ ReOrc will pass corresponding pigeon connector objects that can be used for database interactions.
29
+
30
+ Parameter naming convention:
31
+ - Must have 'datasource_' prefix, e.g. datasource_xxx
32
+ - Example: datasource_mysql='my_mysql_default'
33
+ At runtime, ReOrc will pass a pigeon.connector.MysqlConnector object
34
+
35
+ Example usage:
36
+ def execute(filename, datasource_mysql='my_mysql_default'):
37
+ df = datasource_mysql.get_pandas_df('SELECT * FROM my_database.my_table')
38
+ df.to_csv(filename, header=False)
39
+ """
40
+ pass
41
+ '''
42
+ )
43
+
44
+
45
# Names accepted inside a `{{ name }}` default value of an `execute` parameter.
# `PythonCodeRunner.inspect` raises a validation error for any template
# variable that is not listed here.
# FIXME: record all supported template variables, find a way to keep consistent with `get_template_context` method
_TEMPLATE_VARIABLES = {
    "dt",
    "yesterday",
    "yesterday_dt",
    "tomorrow",
    "tomorrow_dt",
    "logical_date",
    "data_interval_start",
    "data_interval_end",
    "data_interval_start_dt",
    "data_interval_end_dt",
}
58
+
59
+
60
class PythonCodeRunner(object):
    """Compile a piece of Python source and call the ``execute`` function it defines.

    Intended call order: ``inspect()`` -> ``bind_parameters()`` -> ``execute()``.

    NOTE(security): ``inspect()`` runs the source with ``exec``; only feed it
    code that is allowed to run in the current process.
    """

    # test page: https://regex101.com/r/p8YCQc/1
    _JINJA2_VAR_PATTERN = re.compile(r"^{{\s*([^\d\W]\w*)\s*}}$")

    # ``*args`` / ``**kwargs`` style parameters: they have no default and cannot
    # be supplied by keyword, so they are never bound or forwarded.
    _VARIADIC_KINDS = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    def __init__(self, source):
        """Store ``source`` (Python code as text); nothing is compiled yet."""
        self.source = source

        self.__namespace = {}
        self.__parameters = {}
        self.__datasource_params = {}
        self.__jinja2_variables_params = {}
        self.__ready_for_execution = False
        self.__compiled = False

    @property
    def entrypoint(self):
        """The ``execute`` object from the executed source (``None`` if undefined).

        Raises:
            ValueError: if ``inspect()`` has not run the source yet.
        """
        if not self.__compiled:
            raise ValueError("entrypoint is not ready, inspect first")
        return self.__namespace.get("execute")

    def inspect(self):
        """Run the source and classify the parameters of its ``execute`` function.

        Each named parameter falls into one of three groups:

        * ``datasource_*`` parameters: the default must name a data source
          that ``DumpTask.get_connection_by_name`` can resolve;
        * parameters whose default is a ``{{ variable }}`` string: the
          variable must be listed in ``_TEMPLATE_VARIABLES``;
        * everything else: the default (possibly "empty") is kept as is.

        Variadic parameters (``*args`` / ``**kwargs``, whatever their names)
        are skipped.

        Raises:
            jsonschema.ValidationError: if ``execute`` is missing or is not a
                function, a data source parameter has no default or names an
                unknown data source, or a template variable is unsupported.
        """
        logger.info("compiling source code\n%s", self.source)
        code = compile(self.source, "", "exec")
        exec(code, self.__namespace)
        self.__compiled = True
        entrypoint = self.entrypoint

        if not (entrypoint and inspect.isfunction(entrypoint)):
            raise jsonschema.ValidationError(message="execute function is required", path=("source",))

        sig = inspect.signature(entrypoint)
        for name, param in sig.parameters.items():
            # Decide by parameter kind, not by the conventional names
            # "args"/"kwargs": the user may call them anything.
            if param.kind in self._VARIADIC_KINDS:
                logger.info("skipping variadic parameter %s", name)
                continue

            value = param.default
            logger.info("found parameter %s=%s", name, value)

            # special naming for data source parameters: `datasource_xxx`
            if self.is_datasource_param(name):
                if self._is_empty(value):
                    raise jsonschema.ValidationError(message=f"{name} must be known data source name", path=("source",))
                ds = DumpTask.get_connection_by_name(value)
                if not ds:
                    # Report the data source name that failed to resolve, not the parameter name.
                    raise jsonschema.ValidationError(message=f"Unknown data source {repr(value)}", path=("source",))
                self.__datasource_params[name] = value

            # jinja2 template `{{ dt }}`, no Jinja2 rendering, directly replace
            elif self.is_jinja2_variable(value):
                variable = self._JINJA2_VAR_PATTERN.search(value).group(1)
                # unsupported variables
                if variable not in _TEMPLATE_VARIABLES:
                    raise jsonschema.ValidationError(
                        message=f"Unsupport template variable {repr(value)}", path=("source",)
                    )
                self.__jinja2_variables_params[name] = variable

            else:
                # keep default value, data source and template variable parameters are injected at runtime by calling `bind_parameters`
                self.__parameters[name] = value

        self.__parameters.update(self.__datasource_params)
        self.__parameters.update(self.__jinja2_variables_params)

    def is_datasource_param(self, name: str) -> bool:
        """Return True when ``name`` starts with the ``datasource_`` prefix."""
        return name.startswith("datasource_")

    def is_jinja2_variable(self, name: str) -> bool:
        """Return True when ``name`` is a string made of exactly one ``{{ identifier }}``."""
        return isinstance(name, str) and self._JINJA2_VAR_PATTERN.match(name) is not None

    @staticmethod
    def _is_empty(obj) -> bool:
        """Return True when ``obj`` is the "no default" marker used by ``inspect``."""
        return obj is inspect.Signature.empty

    def bind_parameters(self, filename, template_context, **kwargs):
        """Resolve the final keyword arguments for the entry point.

        Args:
            filename: value bound to the ``filename`` parameter.
            template_context: mapping from template variable name to value,
                used for parameters declared with a ``{{ variable }}`` default.
            **kwargs: extra values; only those matching a parameter found by
                ``inspect()`` are used, the rest are ignored.

        Raises:
            TypeError: if a named parameter has neither a default nor a bound value.
        """
        params = copy.deepcopy(kwargs)
        params["filename"] = filename

        logger.info("binding data source connectors %s", self.__datasource_params)
        for param_name, ds_name in self.__datasource_params.items():
            params[param_name] = DumpTask.get_connection_by_name(ds_name).connector

        logger.info("binding jinja2 variables %s", self.__jinja2_variables_params)
        for param_name, variable in self.__jinja2_variables_params.items():
            params[param_name] = template_context[variable]

        # bind other parameters, or override default parameters
        for k, v in params.items():
            if k in self.__parameters:
                self.__parameters[k] = v

        # check if there are any parameters not passed
        # (variadic parameters were never recorded, so every entry must have a value)
        for name, value in self.__parameters.items():
            if self._is_empty(value):
                raise TypeError(f"parameter {repr(name)} is not bound")

        logger.info("bounded parameters %s", self.__parameters)
        self.__ready_for_execution = True

    def execute(self):
        """Call the entry point with the bound parameters as keyword arguments.

        Raises:
            RuntimeError: if ``bind_parameters()`` has not completed yet.
        """
        if not self.__ready_for_execution:
            raise RuntimeError("must call inspect and bind_parameters before calling execute")
        logger.info("calling entrypoint %s with parameters %s...", self.entrypoint, self.__parameters)
        self.entrypoint(**self.__parameters)
        logger.info("done.")
161
+
162
+
163
class PythonDumpTask(DumpTask, HiveTextfileConverterMixin, PythonRequirementsMixin):
    """Dump task whose data is produced by user-written Python source.

    The ``source`` configuration value is handed to ``PythonCodeRunner``,
    which calls its ``execute`` function with this task's ``filename``.
    """

    no_template_fields = ("source",)

    def execute_impl(self, *args, **kwargs):
        """Install requirements, run the configured source, then convert the output if needed."""
        rendered = self.rendered_config.copy()
        code_runner = PythonCodeRunner(rendered["source"])

        # Requirements are installed before the source is compiled and executed.
        conn_configs = self.client.get_py_conn_configs()
        if conn_configs and isinstance(conn_configs, dict):
            requirement_lines = conn_configs.get("requirements", [])
            requirements_text = "\n".join(requirement_lines)
            self._install_requirements(requirements_text, RECURVE_EXECUTOR_PYENV_NAME)

        code_runner.inspect()
        template_context = self.get_template_context()
        code_runner.bind_parameters(filename=self.filename, template_context=template_context)
        code_runner.execute()

        self.convert_csv_to_hive_text_if_needed()
        return None

    @classmethod
    def config_schema(cls):
        """Return the JSON schema of this task's configuration (one required ``source`` field)."""
        source_property = {
            "type": "string",
            "title": _l("Python Source Code"),
            "description": _l(
                "Python code that extracts data and writes to a CSV file. Must implement an execute() function that takes a filename parameter. Note: The Load step must specify a Create Table DDL when using PythonDump."
            ),
            "default": _SOURCE_SKELETON,
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {"type": "code", "lang": "python"},
        }
        return {
            "type": "object",
            "properties": {"source": source_property},
            "required": ["source"],
        }

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` and check that its source passes ``PythonCodeRunner.inspect``.

        Note that ``inspect`` executes the source, so validation runs the
        submitted code once.
        """
        validated = super().validate(configuration)
        PythonCodeRunner(validated["source"]).inspect()
        return validated