recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,234 @@
1
+ import csv
2
+ import inspect
3
+ import logging
4
+ import os
5
+
6
+ import jsonschema
7
+
8
+ from recurvedata.core.translation import _l
9
+ from recurvedata.operators.transfer_operator.const import FILE_TRANSFORM_FUNC_DEFAULT_VALUE
10
+ from recurvedata.operators.transfer_operator.mixin import HiveTextfileConverterMixin
11
+ from recurvedata.operators.transfer_operator.task import DumpTask
12
+ from recurvedata.operators.utils import file_factory as ff
13
+ from recurvedata.utils import unescape_backslash
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FTPDumpTask(DumpTask, HiveTextfileConverterMixin):
    """Dump task that fetches a file over FTP and normalizes it before loading.

    The flow implemented by :meth:`execute_impl` is: run an ``FtpDumper`` from
    the configured ``filepath`` to ``self.filename``, apply the built-in
    conversions (:meth:`process_file`), apply the optional user-supplied
    ``transform`` function (:meth:`transform_file`), make sure the final file
    sits at ``self.filename``, then call
    ``convert_csv_to_hive_text_if_needed()``.
    """

    # Config keys whose values are connection (data source) names.
    ds_name_fields = ("data_source_name",)
    worker_install_require = ["pigeon"]

    def execute_impl(self, *args, **kwargs):
        """Download the file, post-process it and return the dumper's result.

        Returns:
            Whatever ``FtpDumper.execute()`` returned for the download step.
        """
        # Imported lazily so the module can be imported without pigeon's FTP dumper.
        from recurvedata.pigeon.dumper.ftp import FtpDumper

        conf = self.rendered_config

        ds = self.must_get_connection_by_name(conf["data_source_name"])
        dumper = FtpDumper(ds.connector, src=conf["filepath"], dst=self.filename)
        meta = dumper.execute()

        filename = self.process_file(conf)
        filename = self.transform_file(conf, filename)

        # Later steps read self.filename, so move the result back there if a
        # conversion or the custom transform produced a different path.
        if filename != self.filename:
            logger.info("renaming %s to %s", filename, self.filename)
            os.rename(filename, self.filename)

        # TODO: the pigeon loader should support different file formats
        self.convert_csv_to_hive_text_if_needed()
        return meta

    def process_file(self, conf):
        """Apply the built-in decompression and format conversions.

        Args:
            conf: Rendered task configuration. Reads ``decompress``,
                ``file_format``, ``encoding`` and (optionally)
                ``skip_head_lines``, plus the CSV dialect keys when the
                format is ``CSV``.

        Returns:
            The path returned by the last conversion helper that ran, or
            ``self.filename`` when none of them applied.
        """
        filename = self.filename
        if conf["decompress"] == "Gzip":
            logger.info("decompressing %s using gzip", self.filename)
            filename = ff.gzip_decompress(self.filename, inplace=True)

        skip_head_lines = conf.get("skip_head_lines", 0)
        if conf["file_format"] == "Excel":
            logger.info("converting Excel to CSV...")
            filename = ff.convert_excel_to_csv(filename, skiprows=skip_head_lines, inplace=True)
        if conf["file_format"] == "JSONLines":
            logger.info("converting JSON lines to CSV...")
            filename = ff.convert_jsonlines_to_csv(
                filename, skiprows=skip_head_lines, src_encoding=conf["encoding"], inplace=True
            )
        if conf["file_format"] == "CSV":
            logger.info("converting CSV dialect and encoding if necessary...")
            dialect_options = self._get_custom_csv_options(conf)
            filename = ff.convert_csv_dialect(
                filename,
                src_dialect_options=dialect_options,
                skiprows=skip_head_lines,
                src_encoding=conf["encoding"],
                inplace=True,
            )
        return filename

    def transform_file(self, conf, filename):
        """Run the user-supplied ``transform`` function on ``filename``, if any.

        Args:
            conf: Rendered task configuration; ``transform_func`` holds the
                Python source of the custom transformation.
            filename: Path of the file produced by the built-in conversions.

        Returns:
            ``filename`` unchanged when no ``transform`` function is
            configured, otherwise the absolute path returned by it.

        Raises:
            ValueError: If ``transform`` returns anything other than an
                absolute path string.
        """
        # ``or ""`` guards against the key being present with a None value.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if not transform_func_code:
            return filename

        func = validate_transform(transform_func_code)
        if not func:
            return filename

        logger.info("calling transform function with %s", (filename,))
        result_file = func(filename)
        if not (isinstance(result_file, str) and os.path.isabs(result_file)):
            # Exceptions do not %-format their arguments the way logging does,
            # so the message has to be interpolated explicitly.
            raise ValueError("transform must return an absolute filepath, got %s instead" % (result_file,))
        logger.info("got %s", result_file)
        return result_file

    def _get_custom_csv_options(self, conf):
        """Build the source CSV dialect options from the task configuration.

        Returns:
            A dict with ``delimiter``, ``lineterminator`` and ``quoting``;
            ``quoting`` is the ``csv.QUOTE_*`` constant named by
            ``conf["csv_quoting"]``.

        Raises:
            KeyError: If ``csv_quoting`` is not one of the four known names.
        """
        rv = {
            "delimiter": unescape_backslash(conf["csv_delimiter"]),
            "lineterminator": unescape_backslash(conf["csv_lineterminator"]),
        }
        quoting = conf["csv_quoting"]
        rv["quoting"] = {
            "QUOTE_ALL": csv.QUOTE_ALL,
            "QUOTE_MINIMAL": csv.QUOTE_MINIMAL,
            "QUOTE_NONE": csv.QUOTE_NONE,
            "QUOTE_NONNUMERIC": csv.QUOTE_NONNUMERIC,
        }[quoting]
        return rv

    @classmethod
    def config_schema(cls):
        """Return the JSON schema (with ``ui:*`` hints) describing this task's config."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("FTP Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "ftp",
                        ],
                    },
                },
                "filepath": {
                    "type": "string",
                    "title": _l("Source File Path"),
                    "description": _l(
                        "Absolute path to the file on FTP server (e.g. /path/to/file.csv). Supports Jinja templating."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "decompress": {
                    "type": "string",
                    "title": _l("Decompression Method"),
                    "description": _l("Decompress downloaded file using specified method"),
                    "enum": ["None", "Gzip"],
                    "enumNames": ["None", "Gzip"],
                    "default": "None",
                },
                "file_format": {
                    "type": "string",
                    "title": _l("Input Format"),
                    "description": _l("Format of the source file to be converted to CSV"),
                    "enum": ["CSV", "Excel", "JSONLines"],
                    "enumNames": ["CSV", "Excel", "JSONLines"],
                    "default": "CSV",
                },
                "skip_head_lines": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Skip Header Rows"),
                    "description": _l("Number of rows to skip from the beginning of the file"),
                    "default": 0,
                    "minimum": 0,
                },
                "encoding": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("File Encoding"),
                    "description": _l("Character encoding of the CSV file (e.g. utf-8, gbk)"),
                    "default": "utf-8",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_delimiter": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Delimiter"),
                    "description": _l("Character used to separate fields in the CSV file"),
                    "default": ",",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "csv_lineterminator": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Line Ending"),
                    "description": _l("Character sequence used to terminate lines"),
                    "enum": [r"\n", r"\r\n"],
                    "enumNames": [r"\n", r"\r\n"],
                    "default": r"\r\n",
                },
                "csv_quoting": {
                    "ui:hidden": '{{parentFormData.file_format !== "CSV"}}',
                    "type": "string",
                    "title": _l("Field Quoting"),
                    "description": _l("Strategy for quoting fields in the CSV file"),
                    "enum": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "enumNames": ["QUOTE_ALL", "QUOTE_MINIMAL", "QUOTE_NONE", "QUOTE_NONNUMERIC"],
                    "default": "QUOTE_MINIMAL",
                },
                "transform_func": {
                    "type": "string",
                    "title": _l("Custom Transformation"),
                    "description": _l(
                        "Python function to transform the downloaded file. Must accept a filepath argument and return "
                        "the path to the transformed file. Runs after built-in transformations."
                    ),
                    "default": FILE_TRANSFORM_FUNC_DEFAULT_VALUE,
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "python",
                    },
                },
            },
            "required": ["data_source_name", "filepath"],
        }

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` and, when present, its custom transform code.

        Returns:
            The configuration as returned by the parent class's ``validate``.
        """
        conf = super().validate(configuration)

        # ``or ""`` guards against the key being present with a None value.
        transform_func_code = (conf.get("transform_func") or "").strip()
        if transform_func_code:
            validate_transform(transform_func_code)
        return conf
216
+
217
+
218
def validate_transform(raw_code):
    """Execute user-supplied source and return its ``transform`` function.

    NOTE: ``raw_code`` is compiled and run with ``exec`` in a fresh namespace,
    so any top-level statements in it are executed as part of validation.

    Args:
        raw_code: Python source expected to define ``transform(filename)``.

    Returns:
        The ``transform`` object defined by the code, or ``None`` when the
        code defines no (truthy) ``transform`` name.

    Raises:
        jsonschema.ValidationError: If ``transform`` is not callable, or its
            parameter list is anything other than exactly ``(filename)``.
    """
    namespace = {}
    exec(compile(raw_code, "", "exec"), namespace)

    candidate = namespace.get("transform")
    if not candidate:
        return None

    if not callable(candidate):
        raise jsonschema.ValidationError(message="transform should be callable", path=("transform_func",))

    # Iterating the parameters mapping yields the parameter names in order.
    param_names = tuple(inspect.signature(candidate).parameters)
    if param_names == ("filename",):
        return candidate

    raise jsonschema.ValidationError(
        message="transform must accept and only accept filename as parameter", path=("transform_func",)
    )
@@ -0,0 +1,66 @@
1
+ import logging
2
+
3
+ try:
4
+ import pandas as pd
5
+ except ImportError:
6
+ pass
7
+
8
+ from recurvedata.core.translation import _l
9
+ from recurvedata.operators.transfer_operator.dump_sheet_task_base import SheetDumpTaskBase
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class GoogleSheetDumpTask(SheetDumpTaskBase):
    """Sheet dump task whose source DataFrame is read from a Google spreadsheet.

    Only the source-specific pieces live here: the extra config schema
    properties and :meth:`read_origin_df`.
    """

    _AUTO_REGISTER = True
    # Config keys whose values are connection names.
    ds_name_fields = ("google_service_account",)
    worker_install_require = ["gspread"]

    # Schema properties specific to this task.
    custom_config_schema_properties = {
        "google_service_account": {
            "type": "string",
            "title": _l("Service Account"),
            "description": _l("Google service account with permissions to access the spreadsheet"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {"supportTypes": ["google_service_account"]},
        },
        "file_url": {
            "type": "string",
            "title": _l("Spreadsheet URL"),
            "description": _l("URL of the Google spreadsheet (defaults to first sheet if no sheet ID specified)"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {"type": "plain"},
        },
        "cell_range": {
            "type": "string",
            "title": _l("Data Range"),
            "description": _l("Cell range in A1 notation (e.g. A1:B10). Reads entire sheet if empty"),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {"type": "plain"},
        },
    }
    custom_config_schema_required = ["google_service_account", "file_url"]

    def read_origin_df(self) -> "pd.DataFrame":
        """Read the configured sheet (optionally a cell range) into a DataFrame.

        Uses the ``recurve_connector`` of the connection named by
        ``google_service_account`` to parse ``file_url``, open the sheet and
        read it; the shape, dtypes and head of the result are logged.
        """
        conf = self.rendered_config

        connection = self.must_get_connection_by_name(conf.google_service_account)
        account = connection.recurve_connector

        # Only the sheet id (gid) from the parsed URL is used below.
        spread_sheet_id, sheet_id = account.parse_sheet_url(conf.file_url)
        logger.info(f"reading {conf.file_url}, gid {sheet_id}")

        worksheet = account.get_sheet(conf.file_url, sheet_id)
        df = account.read_sheet_to_df(worksheet, cell_range=conf.cell_range)

        logger.info(f"original DataFrame shape {df.shape}, dtypes:\n{df.dtypes}")
        logger.info(df.head())
        return df
@@ -0,0 +1,168 @@
1
+ import copy
2
+ import json
3
+ import logging
4
+
5
+ try:
6
+ from bson import json_util
7
+
8
+ from recurvedata.pigeon.dumper.mongodb import MongoDBDumper
9
+ except ImportError:
10
+ pass
11
+
12
+ from recurvedata.core.translation import _l
13
+ from recurvedata.operators.transfer_operator import utils
14
+ from recurvedata.operators.transfer_operator.task import DumpTask
15
+ from recurvedata.utils import date_time, extract_dict
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class MongoDBDumpTask(DumpTask):
    """Dump task that exports documents from a MongoDB collection.

    The query ``filter`` and ``projection`` are supplied as JSON strings in
    the task config. When ``incremental_by_time`` is enabled and the DAG is
    not a one-off run, a ``$gte``/``$lt`` condition on ``time_column`` for the
    current schedule window is merged into the filter.
    """

    # Config keys whose values are connection (data source) names.
    ds_name_fields = ("data_source_name",)
    worker_install_require = ["pigeon[mongo]"]

    @property
    def time_column_tz(self):
        """Timezone name configured for the time column (``"UTC"`` if unset)."""
        return self.config.get("time_column_tz", "UTC")

    def determine_time_range(self):
        """Return the schedule window as naive datetimes in ``time_column_tz``.

        Returns:
            A ``(start, end)`` tuple: the schedule time range converted to
            :attr:`time_column_tz`, with ``tzinfo`` stripped afterwards.
        """
        start_date, end_date = self.get_schedule_time_range()
        # Convert to the time column's timezone before dropping tzinfo.
        start_date = date_time.astimezone(start_date, tz=self.time_column_tz)
        end_date = date_time.astimezone(end_date, tz=self.time_column_tz)

        return start_date.replace(tzinfo=None), end_date.replace(tzinfo=None)

    def execute_impl(self, *args, **kwargs):
        """Build the dumper options from the config and run ``MongoDBDumper``.

        Returns:
            Whatever ``MongoDBDumper.execute()`` returns.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        hf = self.create_handler_factory()
        dump_options = extract_dict(self.rendered_config, keys=["collection", "filter", "projection"])
        dump_options.update({"connector": ds.connector, "handler_factories": [hf], "database": ds.database})

        # A projection of null or '' is treated as None, i.e. all fields are included.
        proj = dump_options.get("projection")
        dump_options["projection"] = json.loads(proj) if proj else None

        # ``filter`` is optional in the schema, so read it with .get() like
        # ``projection`` instead of indexing; a missing/empty filter matches all.
        raw_filter = dump_options.get("filter")
        flt = json_util.loads(raw_filter) if raw_filter else {}

        # NOTE(review): the original indentation is not recoverable from this
        # view; this block is assumed to sit at method level — confirm.
        if not self.dag.is_once and self.config.incremental_by_time:
            start, end = self.determine_time_range()
            time_flt = {self.config.time_column: {"$gte": start, "$lt": end}}
            # Overrides any user-supplied condition on the same column.
            flt.update(time_flt)

        dump_options["filter"] = flt

        logger.info("Dump options: %s", dump_options)
        dumper = MongoDBDumper(**dump_options)
        return dumper.execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON schema (with ``ui:*`` hints) describing this task's config."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("MongoDB Connection"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "mongodb",
                        ],
                    },
                },
                "collection": {
                    "type": "string",
                    "title": _l("MongoDB Collection"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "filter": {
                    "type": "string",
                    "title": _l("Query Filter"),
                    "default": "{}",
                    "description": _l(
                        "MongoDB query filter in JSON format. Will be deserialized using bson.json_util and passed to find() method. "
                        "Supports MongoDB query operators like $gt, $lt, $in etc. See MongoDB documentation for details."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                "projection": {
                    "type": "string",
                    "title": _l("Field Selection"),
                    "description": _l(
                        "Specify which fields to return in JSON format. Empty value returns all fields. Passed directly to MongoDB find() function."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "json",
                    },
                },
                # Deep-copied so callers mutating the schema do not alter the shared template.
                "transform": copy.deepcopy(utils.TRANSFORM),
                "incremental_by_time": {
                    "type": "boolean",
                    "title": _l("Enable Time-based Incremental Sync"),
                    "default": False,
                    "description": _l("Sync data incrementally based on a time column"),
                    "ui:widget": "BaseCheckbox",
                    "ui:options": {
                        "label": _l("Enable Time-based Incremental Sync"),
                    },
                },
                "time_column": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Name"),
                    "default": "snapshot_time",
                    "description": _l(
                        "Name of the time column used for incremental sync. Column should be indexed for better performance."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "time_column_tz": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Timezone"),
                    "default": "UTC",
                    "enum": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                    "enumNames": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                },
                "time_auto_round": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "boolean",
                    # Wrapped in _l() like every other title so it is translatable.
                    "title": _l("Auto Round Time Range"),
                    "default": True,
                    "description": _l(
                        "Automatically round time ranges to appropriate intervals. For example:\n"
                        "- Daily tasks running at 01:23 will sync previous day's data from 00:00 to 00:00\n"
                        "- Weekly tasks will round to Monday 00:00\n"
                        "- Monthly tasks will round to 1st day 00:00\n"
                        "If disabled, exact execution times will be used (e.g. 01:23 to 01:23)"
                    ),
                },
            },
            "required": ["data_source_name", "collection"],
        }