recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,130 @@
1
+ import csv
2
+ import logging
3
+ from typing import Any
4
+
5
+ from recurvedata.core.translation import _l
6
+ from recurvedata.operators.transfer_operator import const
7
+ from recurvedata.operators.transfer_operator.task import LoadTask
8
+ from recurvedata.pigeon.utils import fs
9
+
10
+ logger = logging.getLogger(__name__)
11
+ GOOGLE_SHEET_MAX_ROWS = 1000000
12
+ GOOGLE_SHEET_MAX_COLUMNS = 18278
13
+
14
+
15
class GoogleSheetLoadTask(LoadTask):
    """Load a local CSV dump into a Google Sheet via a service-account connection.

    Enforces the Google Sheets hard limits on row and column counts before
    uploading, both for the CSV itself and (in append mode) for the combined
    size of the existing sheet plus the new data.
    """

    ds_name_fields = ("google_service_account",)
    should_write_header = True
    worker_install_require = ["gspread"]

    @staticmethod
    def check_csv_content(filename: str) -> tuple[int, int]:
        """Count rows/columns of *filename* and enforce Google Sheets limits.

        Returns:
            (row_count, col_count), where col_count is the widest row seen.

        Raises:
            ValueError: if the row count, or any row's column count, exceeds
                the Google Sheets maximums.
        """
        row_count = 0
        col_count = 0
        # newline="" is required when handing a file object to csv.reader so
        # that embedded newlines inside quoted fields are parsed correctly.
        with open(filename, "r", newline="") as file:
            reader = csv.reader(file)
            for row in reader:
                row_count += 1
                # Track the widest row, not just the first one: previously a
                # ragged CSV with wide rows after the header passed unchecked.
                if len(row) > col_count:
                    col_count = len(row)
                    if col_count > GOOGLE_SHEET_MAX_COLUMNS:
                        raise ValueError(
                            f"CSV file contains {col_count} columns, which exceeds the maximum allowed "
                            f"{GOOGLE_SHEET_MAX_COLUMNS} columns in Google Sheets."
                        )
                if row_count > GOOGLE_SHEET_MAX_ROWS:
                    raise ValueError(
                        f"CSV file contains {row_count} rows, which exceeds the maximum allowed "
                        f"{GOOGLE_SHEET_MAX_ROWS} rows in Google Sheets."
                    )
        return row_count, col_count

    def execute_impl(self, *args: Any, **kwargs: Any) -> None:
        """Validate the dump file, then load it into the configured sheet."""
        import pandas as pd

        if fs.is_file_empty(self.filename):
            logger.warning("File %s does not exist or has no content, skipping.", self.filename)
            return

        ds = self.must_get_connection_by_name(self.config["google_service_account"])
        service_account = ds.recurve_connector
        _, sheet_id = service_account.parse_sheet_url(self.config["file_url"])
        sheet = service_account.get_sheet(self.config["file_url"], sheet_id)

        logger.info(f'Loading to {self.config["file_url"]}, gid {sheet_id}')

        # Perform all necessary checks
        csv_row_count, csv_col_count = self.check_csv_content(self.filename)
        current_sheet_rows, current_sheet_cols = sheet.row_count, sheet.col_count

        if self.config["mode"] == const.LOAD_APPEND:
            # In append mode the limits apply to existing + new data combined.
            csv_row_count += current_sheet_rows
            csv_col_count = max(current_sheet_cols, csv_col_count)

        if csv_row_count > GOOGLE_SHEET_MAX_ROWS:
            raise ValueError(
                f"Appending the CSV file will exceed the maximum allowed {GOOGLE_SHEET_MAX_ROWS} rows in Google Sheets."
            )
        if csv_col_count > GOOGLE_SHEET_MAX_COLUMNS:
            raise ValueError(
                f"Appending the CSV file will exceed the maximum allowed {GOOGLE_SHEET_MAX_COLUMNS} columns in Google Sheets."
            )

        # Load the CSV file into a DataFrame after checking the row count.
        # keep_default_na=False keeps empty cells as "" instead of NaN.
        df = pd.read_csv(self.filename, keep_default_na=False)
        df.fillna("", inplace=True)

        try:
            service_account.load_df_to_sheet(df, sheet, self.config["mode"], value_input_option="USER_ENTERED")
            logger.info(
                f'Data loaded successfully into {self.config["file_url"]}, mode: {self.config["mode"]}, '
                f"rows: {csv_row_count}, cols: {csv_col_count}"
            )
        except Exception as e:
            logger.error(f'Failed to load data into {self.config["file_url"]}: {e}')
            raise

    @classmethod
    def config_schema(cls) -> dict[str, Any]:
        """Return the JSON schema describing this task's configuration form."""
        schema = {
            "type": "object",
            "properties": {
                "google_service_account": {
                    "type": "string",
                    "title": _l("Google Service Account Connection"),
                    "description": _l(
                        "Select the Google Service Account connection with write permissions to the target spreadsheet"
                    ),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": ["google_service_account"],
                    },
                },
                "file_url": {
                    "type": "string",
                    "title": _l("Google Sheet URL"),
                    "description": _l(
                        "URL of the target Google Sheet in format: "
                        "https://docs.google.com/spreadsheets/d/{Spreadsheet ID}/edit#gid={Sheet GID}. "
                        "If no sheet GID is specified, the first sheet will be used."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "mode": {
                    "type": "string",
                    "title": _l("Import Mode"),
                    "enum": [const.LOAD_OVERWRITE, const.LOAD_APPEND],
                    "enumNames": [const.LOAD_OVERWRITE, const.LOAD_APPEND],
                    "default": const.LOAD_OVERWRITE,
                    "description": _l(
                        "OVERWRITE: Replace existing data with new data. " "APPEND: Add new data after existing data."
                    ),
                },
            },
            "required": ["google_service_account", "file_url", "mode"],
        }
        return schema
@@ -0,0 +1,158 @@
1
+ import copy
2
+ import glob
3
+ import json
4
+ import os
5
+
6
+ try:
7
+ from recurvedata.pigeon.loader.csv_to_hive import CSVToHiveLoader
8
+ from recurvedata.pigeon.utils import fs
9
+ except ImportError:
10
+ pass
11
+
12
+ from recurvedata.core.translation import _l
13
+ from recurvedata.operators.transfer_operator import utils
14
+ from recurvedata.operators.transfer_operator.task import LoadTask
15
+
16
+
17
class HiveLoadTask(LoadTask):
    """Load a dumped CSV file (or its unmerged chunk files) into a Hive table."""

    ds_name_fields = ("hive_data_source_name",)
    ds_types = ("hive",)
    default_dumper_handler_options = {
        "hive": True,
        "merge_files": False,  # do not merge intermediate files, pass in file pattern
    }
    worker_install_require = ["pigeon[hive_impala]"]

    def execute_impl(self, *args, **kwargs):
        """Assemble CSVToHiveLoader options from the rendered config and run it."""
        hive_ds = self.must_get_connection_by_name(self.config["hive_data_source_name"])

        options = self.rendered_config.copy()
        for key in ("hive_data_source_name", "impala_data_source_name"):
            options.pop(key, None)

        raw_partition = options.pop("partition", None)
        if raw_partition:
            # The partition spec arrives as a JSON string from the form.
            options["partition"] = json.loads(raw_partition)

        chunk_files = glob.glob(f"{self.filename}.[0-9]*")
        if os.path.exists(self.filename) and not chunk_files:
            # The dumper merged everything into a single file.
            target = self.filename
        else:
            # Unmerged dump: hand the loader the list of chunk files. An empty
            # upstream dump produces no usable chunks, so fall back to
            # [self.filename] to guarantee the list is never empty.
            if all(fs.is_file_empty(path) for path in chunk_files):
                chunk_files = [self.filename]
            target = chunk_files

        options["filename"] = target
        options["hive_connector"] = hive_ds.connector
        options["delete_file"] = True

        impala_ds = self.get_connection_by_name(self.config["impala_data_source_name"])
        if impala_ds:
            # Optional Impala connector speeds up post-load metadata refresh.
            options["impala_connector"] = impala_ds.connector

        return CSVToHiveLoader(**options).execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON schema for the Hive load form, plus shared load fields."""
        properties = {
            "hive_data_source_name": {
                "type": "string",
                "title": _l("Hive Connection"),
                "ui:field": "ProjectConnectionSelectorField",
                "ui:options": {"supportTypes": cls.ds_types},
            },
            "impala_data_source_name": {
                "type": "string",
                "title": _l("Impala Connection"),
                "description": _l("Optional Impala connection for faster data loading"),
                "ui:field": "ProjectConnectionSelectorField",
                "ui:options": {"supportTypes": ["impala"]},
            },
            "database": {
                "type": "string",
                "title": _l("Database Name"),
                "description": _l("Name of the Hive database to load data into. Supports template variables."),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "table": {
                "type": "string",
                "title": _l("Table Name"),
                "description": _l("Name of the Hive table to load data into. Supports template variables."),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "create_table_ddl": {
                "type": "string",
                "title": _l("Table Creation SQL"),
                "description": _l(
                    "SQL statement to create the table if it doesn't exist. "
                    "PARQUET storage format is recommended for better performance. "
                ),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "code", "lang": "sql", "sqlLang": "hive"},
            },
            "partition": {
                "type": "string",
                "title": _l("Partition Specification"),
                "description": _l(
                    "JSON object specifying the partition to load data into. "
                    "For T+1 tasks, use {'dt': '{{ yesterday_dt }}'} to load yesterday's partition. "
                    "Supports template variables."
                ),
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {"type": "plain"},
            },
            "compression_codec": {
                "type": "string",
                "title": _l("Compression Method"),
                "enum": ["snappy", "none", "gzip"],
                "enumNames": ["snappy", "none", "gzip"],
                "description": _l(
                    "Data compression format. 'none' for no compression, 'gzip' for maximum compression, "
                    "'snappy' for balanced compression/performance."
                ),
                "default": "snappy",
            },
        }
        # Shared load options are merged in via a deep copy so per-form
        # mutation cannot leak back into the shared template.
        properties.update(copy.deepcopy(utils.LOAD_COMMON))
        return {
            "type": "object",
            "properties": properties,
            # NOTE: frontend uses vue-json-schema, which has a bug where enum fields must be required...
            "required": [
                "hive_data_source_name",
                "impala_data_source_name",
                "database",
                "table",
                "mode",
                "compression_codec",
            ],
        }
@@ -0,0 +1,105 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ from recurvedata.core.translation import _l
5
+ from recurvedata.operators.transfer_operator import const
6
+ from recurvedata.operators.transfer_operator.task import LoadTask
7
+ from recurvedata.operators.transfer_operator.utils import allowed_modes
8
+
9
+ try:
10
+ from recurvedata.pigeon.loader.csv_to_microsoft_fabric import CSVToMsFabricLoader
11
+ except ImportError:
12
+ pass
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class MicrosoftFabricLoadTask(LoadTask):
18
+ ds_name_fields = ("data_source_name",)
19
+ ds_types = ("microsoft_fabric",)
20
+ worker_install_require = ["pigeon[azure]"]
21
+
22
+ def execute_impl(self, *args: Any, **kwargs: Any) -> None:
23
+ ds = self.must_get_connection_by_name(self.config["data_source_name"])
24
+ load_options: dict[str, Any] = self.rendered_config.copy()
25
+ for k in ["data_source_name"]:
26
+ load_options.pop(k, None)
27
+ columns = load_options.get("columns", "")
28
+ columns = [x.strip() for x in columns.split(",")] if columns.strip(" ,") else []
29
+ load_options["lineterminator"] = "\r\n" if self.dump_task_type == "PythonDumpTask" else "0x0D0A"
30
+ load_options.update(
31
+ {
32
+ "filename": self.filename,
33
+ "connector": ds.connector,
34
+ "delete_file": True,
35
+ "using_insert": False,
36
+ "columns": columns,
37
+ "database": ds.database,
38
+ "schema": ds.data.get("schema"),
39
+ "compress": True, # Enable compression for better performance
40
+ "blob_options": ds.data.get("blob_options", {}),
41
+ }
42
+ )
43
+ logger.info(load_options)
44
+ loader = CSVToMsFabricLoader(**load_options)
45
+ return loader.execute()
46
+
47
+ @classmethod
48
+ def config_schema(cls):
49
+ schema = {
50
+ "type": "object",
51
+ "properties": {
52
+ "data_source_name": {
53
+ "type": "string",
54
+ "title": _l("Microsoft Fabric Connection"),
55
+ "description": _l("The Microsoft Fabric data source to load data into"),
56
+ "ui:field": "ProjectConnectionSelectorField",
57
+ "ui:options": {
58
+ "supportTypes": cls.ds_types,
59
+ },
60
+ },
61
+ "table": {
62
+ "type": "string",
63
+ "title": _l("Target Table"),
64
+ "description": _l("Name of the table to load data into"),
65
+ "ui:field": "CodeEditorWithReferencesField",
66
+ "ui:options": {
67
+ "type": "plain",
68
+ },
69
+ },
70
+ "create_table_ddl": {
71
+ "type": "string",
72
+ "title": _l("Table Creation SQL"),
73
+ "description": _l("SQL statement to create the target table if it doesn't exist"),
74
+ "ui:field": "CodeEditorWithReferencesField",
75
+ "ui:options": {
76
+ "type": "code",
77
+ "lang": "sql",
78
+ "sqlLang": "sql",
79
+ },
80
+ },
81
+ "mode": {
82
+ "type": "string",
83
+ "title": _l("Load Mode"),
84
+ "description": _l("How to handle existing data in the target table"),
85
+ "enum": list(allowed_modes),
86
+ "enumNames": list(allowed_modes),
87
+ "default": const.LOAD_OVERWRITE,
88
+ },
89
+ "primary_keys": {
90
+ "ui:hidden": '{{parentFormData.mode !== "MERGE"}}',
91
+ "type": "string",
92
+ "title": _l("Primary Keys"),
93
+ "description": _l(
94
+ "Comma-separated list of columns used for deduplication in MERGE mode. "
95
+ "Should be primary or unique key columns."
96
+ ),
97
+ "ui:field": "CodeEditorWithReferencesField",
98
+ "ui:options": {
99
+ "type": "plain",
100
+ },
101
+ },
102
+ },
103
+ "required": ["data_source_name", "table"],
104
+ }
105
+ return schema
@@ -0,0 +1,153 @@
1
+ import logging
2
+
3
+ try:
4
+ from recurvedata.pigeon.loader.csv_to_mssql import CSVToMsSQLLoader
5
+ except ImportError:
6
+ pass
7
+
8
+ from recurvedata.core.translation import _l
9
+ from recurvedata.operators.transfer_operator import const
10
+ from recurvedata.operators.transfer_operator.task import LoadTask
11
+ from recurvedata.operators.transfer_operator.utils import allowed_modes
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class MsSQLLoadTask(LoadTask):
    """Load a CSV file into a Microsoft SQL Server (or Azure SQL) table.

    Delegates the actual data transfer to ``CSVToMsSQLLoader``; this task only
    resolves the connection and translates its rendered config into loader
    options.
    """

    # Config field(s) that name the connection this task uses.
    ds_name_fields = ("data_source_name",)
    # Connection types accepted by the connection selector.
    ds_types = ("mssql", "azure_mssql")
    worker_install_require = ["pigeon[azure]"]

    def execute_impl(self, *args, **kwargs):
        """Build loader options from the rendered config and run the load.

        Returns:
            The result of ``CSVToMsSQLLoader.execute()``.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        load_options = self.rendered_config.copy()
        # The connection name identifies the data source; it is not a loader option.
        load_options.pop("data_source_name", None)
        # `or ""` guards against an explicit None value — .get()'s default only
        # applies when the key is missing entirely.
        load_options["columns"] = self._parse_columns(load_options.get("columns") or "")
        load_options.update(
            {
                "filename": self.filename,
                "connector": ds.connector,
                "delete_file": True,
                # Auto-detect on the loader side: prefer bulk file loading over row INSERTs.
                "using_insert": False,
                "database": ds.database,
                "schema": ds.data.get("schema"),
            }
        )
        logger.info("CSVToMsSQLLoader options: %s", load_options)
        loader = CSVToMsSQLLoader(**load_options)
        return loader.execute()

    @staticmethod
    def _parse_columns(columns):
        """Split a comma-separated column string, dropping blank entries.

        Handles stray whitespace and empty segments (e.g. ``"a, ,b,"``),
        returning ``[]`` when no real column names are present.
        """
        return [name.strip() for name in columns.split(",") if name.strip()]

    @classmethod
    def config_schema(cls):
        """Return the JSON schema that drives the task-configuration UI."""
        schema = {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("MSSQL Connection"),
                    "description": _l("The MSSQL data source to load data into"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": cls.ds_types,
                    },
                },
                "table": {
                    "type": "string",
                    "title": _l("Target Table"),
                    "description": _l("Name of the table to load data into"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "create_table_ddl": {
                    "type": "string",
                    "title": _l("Table Creation SQL"),
                    "description": _l("SQL statement to create the target table if it doesn't exist"),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "sql",
                        "sqlLang": "sql",
                    },
                },
                "mode": {
                    "type": "string",
                    "title": _l("Load Mode"),
                    "description": _l("How to handle existing data in the target table"),
                    "enum": list(allowed_modes),
                    "enumNames": list(allowed_modes),
                    "default": const.LOAD_OVERWRITE,
                },
                "primary_keys": {
                    # Only meaningful when deduplicating, so hide unless mode is MERGE.
                    "ui:hidden": '{{parentFormData.mode !== "MERGE"}}',
                    "type": "string",
                    "title": _l("Primary Keys"),
                    "description": _l(
                        "Comma-separated list of columns used for deduplication in MERGE mode. "
                        "Should be primary or unique key columns."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
            },
            "required": ["data_source_name", "table"],
        }
        return schema