recurvedata_lib-0.1.487-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/loader/csv_to_microsoft_fabric.py
@@ -0,0 +1,242 @@
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
+
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+ from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+
+ if TYPE_CHECKING:
+     from recurvedata.pigeon.connector.microsoft_fabric import MsFabricConnector
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+
+ class CSVToMsFabricLoader(BaseLoader, CSVToDBAPIMixin):
+     """Loader for Microsoft Fabric that bulk loads data using the COPY command.
+
+     This loader provides Microsoft Fabric specific data loading capabilities.
+     It uses the COPY command for efficient data loading and supports several
+     loading modes (OVERWRITE, MERGE, APPEND).
+     """
+
+     def __init__(
+         self,
+         table: str,
+         filename: str,
+         connector: "MsFabricConnector",
+         schema: Optional[str] = None,
+         create_table_ddl: Optional[str] = None,
+         mode: str = const.LOAD_MERGE,
+         primary_keys: Optional[Union[str, List[str]]] = None,
+         columns: Optional[Union[str, List[str]]] = None,
+         compress: bool = True,
+         delete_file: bool = True,
+         dedup: bool = False,
+         dedup_uniq_keys: Optional[Union[str, List[str]]] = None,
+         dedup_orderby: Optional[Union[str, List[str]]] = None,
+         pre_queries: Optional[Union[str, List[str]]] = None,
+         post_queries: Optional[Union[str, List[str]]] = None,
+         lineterminator: Optional[str] = "0x0D0A",
+         *args: Any,
+         **kwargs: Any,
+     ):
+         """Initialize the loader.
+
+         Args:
+             table: Target table name
+             filename: Source file path
+             connector: MsFabricConnector instance
+             schema: Schema name
+             create_table_ddl: SQL to create the target table
+             mode: Loading mode (OVERWRITE/MERGE/APPEND)
+             primary_keys: Primary key columns for MERGE mode
+             columns: Column list to load
+             compress: Whether to compress data before loading
+             delete_file: Whether to delete the source file after loading
+             dedup: Whether to deduplicate data
+             dedup_uniq_keys: Columns for deduplication
+             dedup_orderby: ORDER BY clause for deduplication
+             pre_queries: Queries to run before loading
+             post_queries: Queries to run after loading
+             lineterminator: Line terminator passed to the COPY command
+         """
+         if "." in table:
+             self.schema, self.table = table.split(".")
+         else:
+             self.schema = schema or "dbo"
+             self.table = table
+
+         self.connector = connector
+         self.filename = filename
+         self.create_table_ddl = create_table_ddl
+         self.compress = compress
+         self.delete_file = delete_file
+
+         if mode not in allowed_modes:
+             raise ValueError(f"mode should be one of ({allowed_modes})")
+
+         self.mode = mode
+         self.primary_keys = ensure_str_list(primary_keys)
+         self.columns = ensure_str_list(columns)
+
+         # dedup settings
+         self.dedup = dedup
+         self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
+         self.dedup_orderby = dedup_orderby
+         if self.dedup and not self.dedup_uniq_keys:
+             raise ValueError("dedup_uniq_keys should not be empty if dedup is true")
+
+         self.pre_queries = ensure_query_list(pre_queries) or []
+         self.post_queries = ensure_query_list(post_queries) or []
+         self.lineterminator = lineterminator
+
+         super().__init__()
+
+     @property
+     def staging_table(self) -> str:
+         return f"{self.table}_staging"
+
+     @property
+     def full_staging_table_name(self) -> str:
+         return f"{self.schema}.{self.staging_table}"
+
+     @property
+     def full_table_name(self) -> str:
+         return f"{self.schema}.{self.table}"
+
+     @property
+     def quoted_full_staging_table(self) -> str:
+         return self.connector.quote_identifier(self.full_staging_table_name)
+
+     @property
+     def quoted_full_table_name(self) -> str:
+         return self.connector.quote_identifier(self.full_table_name)
+
+     def execute_impl(self) -> None:
+         """Execute the data loading process."""
+         if fs.is_file_empty(self.filename):
+             self.logger.error("file does not exist or has no content: %s", self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+             return
+
+         self._prepare_target_table()
+         self._prepare_staging_table()
+         self._load_to_staging()
+         self._merge_into_target_table()
+
+         # clean up
+         if self.delete_file:
+             self.logger.info("delete local file %s", self.filename)
+             fs.remove_files_safely(self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+
+     def _prepare_staging_table(self) -> None:
+         """Prepare the staging table for data loading."""
+         schema, table = self.full_staging_table_name.split(".")
+         drop = self._make_drop_table_query(schema, table)
+         ddl = f"SELECT TOP 0 * INTO {self.quoted_full_staging_table} FROM {self.quoted_full_table_name}"
+         self.connector.execute([drop, ddl])
+
+     def _make_drop_table_query(self, schema: str, table: str) -> str:
+         """Generate SQL that drops a table if it exists."""
+         if "." in table:
+             schema, table = table.split(".")
+         if not schema:
+             schema = self.schema
+         full_table = f"{schema}.{table}"
+         query = f"""
+         IF EXISTS (
+             SELECT * FROM sys.tables
+             WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
+         )
+         DROP TABLE {self.connector.quote_identifier(full_table)}
+         """
+         return query
+
+     def _load_to_staging(self) -> None:
+         """Load data into the staging table using the COPY command."""
+         self.logger.info(f"load {self.filename} into staging table {self.full_staging_table_name}")
+         self.connector.load_csv_bulk(
+             table=self.full_staging_table_name,
+             filename=self.filename,
+             columns=self.columns,
+             compress=self.compress,
+             lineterminator=self.lineterminator,
+         )
+
+         if self.dedup:
+             dedup_query = self._construct_dedup_query()
+             self.connector.execute(dedup_query, autocommit=False, commit_on_close=True)
+
+     def _construct_dedup_query(self) -> str:
+         """Construct the deduplication query."""
+         partition_cols = [self.connector.quote_identifier(col) for col in self.dedup_uniq_keys]
+         partition_by = ", ".join(partition_cols)
+         # get_columns returns a list; keep it as one so the SELECT below
+         # iterates over column names rather than characters of a joined string
+         columns = self.connector.get_columns(schema=self.schema, table=self.staging_table)
+         tmp_table = f"{self.full_staging_table_name}_tmp"
+         quoted_tmp_table = self.connector.quote_identifier(tmp_table)
+         quoted_bak_table = self.connector.quote_identifier(f"{self.staging_table}_bak")
+
+         queries = f"""
+         {self._make_drop_table_query(self.schema, tmp_table)};
+
+         CREATE TABLE {quoted_tmp_table} AS
+         SELECT {', '.join(self.connector.quote_identifier(x) for x in columns)}
+         FROM (
+             SELECT *, ROW_NUMBER() OVER (PARTITION BY {partition_by} ORDER BY {self.dedup_orderby}) rn
+             FROM {self.quoted_full_staging_table}
+         ) AS t
+         WHERE rn = 1;
+
+         RENAME OBJECT {self.quoted_full_staging_table} TO {quoted_bak_table};
+         RENAME OBJECT {quoted_tmp_table} TO {self.staging_table};
+         DROP TABLE {quoted_bak_table};
+         """
+         return queries
+
+     def _merge_into_target_table(self) -> None:
+         """Merge data from the staging table into the target table."""
+         target = self.quoted_full_table_name
+         staging = self.quoted_full_staging_table
+
+         append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
+         if self.mode == const.LOAD_OVERWRITE:
+             queries = [f"TRUNCATE TABLE {target}", append_sql]
+         elif self.mode == const.LOAD_MERGE:
+             queries = self._ingest_by_merging()
+         else:
+             # APPEND
+             queries = [append_sql]
+
+         queries.append(f"DROP TABLE {staging}")
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running Microsoft Fabric queries...")
+         self.connector.execute(queries, autocommit=True, commit_on_close=True)
+         self.logger.info("done.")
+
+     def _ingest_by_merging(self) -> List[str]:
+         """Construct the queries used in MERGE mode."""
+         merge_table = f"{self.full_table_name}_merge"
+         quote = self.connector.quote_identifier
+         join = " AND ".join([f"a.{quote(x)} = b.{quote(x)}" for x in self.primary_keys])
+
+         drop_merge_table = self._make_drop_table_query(self.schema, merge_table)
+         queries = f"""
+         {drop_merge_table};
+
+         CREATE TABLE {quote(merge_table)} WITH (DISTRIBUTION = ROUND_ROBIN)
+         AS
+         SELECT a.*
+         FROM {self.quoted_full_table_name} AS a
+         LEFT JOIN {self.quoted_full_staging_table} AS b ON {join}
+         WHERE b.{quote(self.primary_keys[0])} IS NULL
+         UNION ALL
+         SELECT * FROM {self.quoted_full_staging_table};
+
+         TRUNCATE TABLE {self.quoted_full_table_name};
+         INSERT INTO {self.quoted_full_table_name} SELECT * FROM {quote(merge_table)};
+
+         {drop_merge_table};
+         """
+         return queries.split(";")
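
For orientation, here is a minimal usage sketch of the Fabric loader above. The connector construction and the execute() entry point (presumably provided by BaseLoader, which wraps execute_impl) are assumptions based on context, not confirmed by this diff; table and column names are hypothetical.

# Hypothetical usage sketch; connector arguments and BaseLoader.execute()
# are assumed, not shown in this diff.
from recurvedata.pigeon import const
from recurvedata.pigeon.connector.microsoft_fabric import MsFabricConnector
from recurvedata.pigeon.loader.csv_to_microsoft_fabric import CSVToMsFabricLoader

connector = MsFabricConnector(...)  # connection settings omitted
loader = CSVToMsFabricLoader(
    table="dbo.fact_orders",          # "schema.table"; schema defaults to "dbo"
    filename="/tmp/fact_orders.csv",
    connector=connector,
    mode=const.LOAD_MERGE,
    primary_keys=["order_id"],
    dedup=True,
    dedup_uniq_keys=["order_id"],
    dedup_orderby="updated_at DESC",  # keep the most recent row per key
)
loader.execute()  # assumed BaseLoader entry point that calls execute_impl()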
recurvedata/pigeon/loader/csv_to_mssql.py
@@ -0,0 +1,174 @@
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.connector.mssql import SQLServerConnector
+ from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+ from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+ STAGING_TABLE_NAME_PLACEHOLDER = "<TABLE>"
+
+
+ class CSVToMsSQLLoader(BaseLoader, CSVToDBAPIMixin):
+     def __init__(
+         self,
+         database,
+         table,
+         filename,
+         connector: SQLServerConnector,
+         schema=None,
+         create_table_ddl=None,
+         staging_create_table_ddl=None,
+         mode=const.LOAD_OVERWRITE,
+         primary_keys=None,
+         skiprows=0,
+         columns=None,
+         using_insert=True,
+         insert_batch_size=500,
+         insert_concurrency=1,
+         delete_file=False,
+         pre_queries=None,
+         post_queries=None,
+         *args,
+         **kwargs,
+     ):
+         self.database = database
+         self.table = table
+
+         if "." in table:
+             self.schema, self.table = table.split(".")
+         else:
+             self.schema = schema or "dbo"
+             self.table = table
+
+         connector.database = self.database
+         self.connector = connector
+         self.filename = filename
+         self.create_table_ddl = create_table_ddl
+         # Fully replicating the table structure (including constraints and
+         # indexes) is complex, so an explicit DDL for the staging table may be
+         # supplied; use the special placeholder <TABLE> for the table name.
+         self.staging_create_table_ddl = staging_create_table_ddl
+         if self.staging_create_table_ddl and STAGING_TABLE_NAME_PLACEHOLDER not in self.staging_create_table_ddl:
+             raise ValueError(f"use {STAGING_TABLE_NAME_PLACEHOLDER} as the table name placeholder")
+
+         if mode not in allowed_modes:
+             raise ValueError("mode should be one of ({})".format(allowed_modes))
+
+         self.mode = mode
+         self.primary_keys = ensure_str_list(primary_keys)
+         if self.mode == const.LOAD_MERGE and not self.primary_keys:
+             raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))
+
+         # self.columns = columns or self.csvfile.header
+         # self.skiprows = int(skiprows or self.csvfile.has_header)
+         self.columns = columns
+         self.skiprows = int(skiprows)
+         self.using_insert = using_insert
+         self.insert_batch_size = insert_batch_size
+         self.insert_concurrency = insert_concurrency
+         self.delete_file = delete_file
+
+         self.pre_queries = ensure_query_list(pre_queries) or []
+         self.post_queries = ensure_query_list(post_queries) or []
+
+         super().__init__()
+
+     @property
+     def staging_table(self):
+         return f"{self.schema}.{self.table}_staging"
+
+     @property
+     def full_table_name(self):
+         return f"{self.schema}.{self.table}"
+
+     @property
+     def quoted_staging_table(self):
+         return self.connector.quote_identifier(self.staging_table)
+
+     @property
+     def quoted_full_table_name(self):
+         return self.connector.quote_identifier(self.full_table_name)
+
+     def execute_impl(self):
+         if fs.is_file_empty(self.filename):
+             self.logger.error("file does not exist or has no content: %s", self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+             return
+
+         self._prepare_target_table()
+         self._prepare_staging_table()
+         self._load_to_staging()
+         self._merge_into_target_table()
+
+         # clean up
+         if self.delete_file:
+             self.logger.info("delete local file %s", self.filename)
+             fs.remove_files_safely(self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+
+     def _prepare_staging_table(self):
+         schema, table = self.staging_table.split(".")
+         if self.staging_create_table_ddl:
+             ddl: str = self.staging_create_table_ddl.replace(STAGING_TABLE_NAME_PLACEHOLDER, self.quoted_staging_table)
+             ddl = ddl.rstrip(";")
+         else:
+             ddl = f"SELECT TOP 0 * INTO {self.quoted_staging_table} FROM {self.quoted_full_table_name}"
+
+         query = f"""
+         IF EXISTS (
+             SELECT * FROM sys.tables
+             WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
+         )
+         DROP TABLE {self.quoted_staging_table};
+
+         {ddl}
+         """
+         self.connector.execute(query)
+
+     def _load_to_staging(self):
+         self.logger.info("load %s into staging table %s", self.filename, self.staging_table)
+         self.connector.load_csv(
+             table=self.staging_table,
+             filename=self.filename,
+             schema=self.schema,
+             columns=self.columns,
+             skiprows=self.skiprows,
+             using_insert=self.using_insert,
+             null_values=("NULL", r"\N", ""),
+             batch_size=self.insert_batch_size,
+             concurrency=self.insert_concurrency,
+         )
+
+     def _merge_into_target_table(self):
+         target = self.quoted_full_table_name
+         staging = self.quoted_staging_table
+
+         queries = []
+         if self.mode == const.LOAD_OVERWRITE:
+             queries.append(f"TRUNCATE TABLE {target}")
+             append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
+             queries.append(append_sql)
+         elif self.mode == const.LOAD_MERGE:
+             joins = []
+             for field in self.primary_keys:
+                 field = self.connector.quote_identifier(field)
+                 join = f"{target}.{field} = {staging}.{field}"
+                 joins.append(join)
+
+             join_conditions = " AND ".join(joins)
+             # Delete existing records that match the staging rows on the primary keys
+             delete_sql = f"DELETE {target} FROM {target} INNER JOIN {staging} ON {join_conditions}"
+             queries.append(delete_sql)
+
+             # Insert all data from the staging table into the target table
+             insert_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
+             queries.append(insert_sql)
+         else:
+             # APPEND mode
+             append_sql = f"INSERT INTO {target} SELECT * FROM {staging}"
+             queries.append(append_sql)
+
+         queries.append(f"DROP TABLE {staging}")
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running SQL Server queries...")
+         self.connector.execute(queries, autocommit=False, commit_on_close=True)
+         self.logger.info("done.")
recurvedata/pigeon/loader/csv_to_mysql.py
@@ -0,0 +1,180 @@
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.connector import new_mysql_connector
+ from recurvedata.pigeon.csv import CSV
+ from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+ from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+ from recurvedata.pigeon.utils.sql import bak_table_of, staging_table_of
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+
+ class CSVToMySQLLoader(BaseLoader, CSVToDBAPIMixin):
+     def __init__(
+         self,
+         database,
+         table,
+         filename,
+         connector=None,
+         create_table_ddl=None,
+         mode=const.LOAD_OVERWRITE,
+         primary_keys=None,
+         skiprows=0,
+         columns=None,
+         using_insert=False,
+         insert_batch_size=1000,
+         insert_concurrency=1,
+         delete_file=False,
+         tidb_dml_batch_size=500,
+         pre_queries=None,
+         post_queries=None,
+         *args,
+         **kwargs,
+     ):
+         self.database = database
+         self.table = table
+
+         if isinstance(filename, CSV):
+             filename = filename.path
+         self.filename = filename
+         self.csvfile = CSV(self.filename)
+
+         if connector is None:
+             connector = new_mysql_connector(database=self.database)
+         else:
+             connector.database = self.database
+         self.connector = connector
+
+         self.create_table_ddl = create_table_ddl
+
+         if mode not in allowed_modes:
+             raise ValueError("mode should be one of ({})".format(allowed_modes))
+
+         self.mode = mode
+         self.primary_keys = ensure_str_list(primary_keys)
+         if self.mode == const.LOAD_MERGE and not self.primary_keys:
+             raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))
+
+         # self.columns = columns or self.csvfile.header
+         # self.skiprows = int(skiprows or self.csvfile.has_header)
+         self.columns = columns
+         self.skiprows = int(skiprows)
+         self.using_insert = using_insert
+         self.insert_batch_size = insert_batch_size
+         self.insert_concurrency = insert_concurrency
+         self.delete_file = delete_file
+
+         # https://pingcap.com/docs-cn/sql/tidb-specific/#tidb-dml-batch-size
+         self.tidb_dml_batch_size = tidb_dml_batch_size
+
+         self.pre_queries = ensure_query_list(pre_queries) or []
+         self.post_queries = ensure_query_list(post_queries) or []
+
+         super().__init__()
+
+     @property
+     def staging_table(self):
+         return staging_table_of(self.table)
+
+     def execute_impl(self):
+         if fs.is_file_empty(self.filename):
+             self.logger.error("file does not exist or has no content: %s", self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+             return
+
+         self._prepare_target_table()
+         self._prepare_staging_table()
+         self._load_to_staging()
+         self._merge_into_target_table()
+
+         # clean up
+         if self.delete_file:
+             self.logger.info("delete local file %s", self.filename)
+             fs.remove_files_safely(self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+
+     def _prepare_staging_table(self):
+         queries = """
+         DROP TABLE IF EXISTS {staging};
+         CREATE TABLE {staging} LIKE {table};
+         """.format(
+             staging=self.staging_table, table=self.table
+         )
+         self.connector.execute(queries, autocommit=True)
+
+     def _load_to_staging(self):
+         self.connector.load_csv(
+             table=self.staging_table,
+             filename=self.csvfile.path,
+             columns=self.columns,
+             lineterminator=self.csvfile.dialect.lineterminator,
+             skiprows=self.skiprows,
+             using_insert=self.using_insert,
+             null_values=("NULL", r"\N", ""),
+             batch_size=self.insert_batch_size,
+             concurrency=self.insert_concurrency,
+         )
+
+     def _merge_into_target_table(self):
+         queries = []
+         if self.connector.is_tidb():
+             queries.append("SET autocommit=1")
+             queries.append("SET @@session.tidb_batch_delete=ON")
+             queries.append("SET @@session.tidb_batch_insert=ON")
+             if self.tidb_dml_batch_size:
+                 queries.append(f"SET @@session.tidb_dml_batch_size={self.tidb_dml_batch_size}")
+
+         if self.mode == const.LOAD_MERGE:
+             queries.extend(self._ingest_by_merging())
+         elif self.mode == const.LOAD_OVERWRITE:
+             bak_table = bak_table_of(self.table)
+             queries.append(f"DROP TABLE IF EXISTS {bak_table}")
+             queries.append(f"RENAME TABLE {self.table} TO {bak_table}")
+             queries.append(f"RENAME TABLE {self.staging_table} TO {self.table}")
+             queries.append(f"DROP TABLE IF EXISTS {bak_table}")
+         else:
+             queries.append(f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}")
+             queries.append(f"DROP TABLE {self.staging_table}")
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running MySQL queries within a transaction")
+         self.connector.execute(queries, autocommit=False, commit_on_close=True)
+
+     def _ingest_by_merging(self):
+         """Merge with deduplication based on the specified primary_keys."""
+         # First, deduplicate the staging table on primary_keys using a window function
+         pk_columns = ", ".join(self.primary_keys)
+
+         # Get all columns from the staging table (excluding the rn column added below)
+         cols = self.connector.get_columns(self.staging_table)
+         cols_str = ", ".join(self.connector.quote_identifier(x) for x in cols)
+
+         # Create a temporary table holding the deduplicated data
+         tmp_table = f"{self.staging_table}_dedup"
+         dedup_sql = f"""
+         DROP TABLE IF EXISTS {tmp_table};
+         CREATE TABLE {tmp_table} LIKE {self.staging_table};
+         INSERT INTO {tmp_table}
+         SELECT {cols_str} FROM (
+             SELECT *, ROW_NUMBER() OVER (PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
+             FROM {self.staging_table}
+         ) t
+         WHERE rn = 1;
+         """
+
+         # Replace the staging table with the deduplicated data
+         replace_sql = f"""
+         DROP TABLE {self.staging_table};
+         RENAME TABLE {tmp_table} TO {self.staging_table};
+         """
+
+         # Delete records from the target table that share primary keys with the staging table
+         join_condition = " AND ".join([f"a.{pk} = b.{pk}" for pk in self.primary_keys])
+         delete_sql = f"""
+         DELETE a FROM {self.table} a
+         INNER JOIN {self.staging_table} b ON {join_condition}
+         """
+
+         # Insert the deduplicated data into the target table
+         insert_sql = f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}"
+         drop_sql = f"DROP TABLE {self.staging_table}"
+
+         return [dedup_sql, replace_sql, delete_sql, insert_sql, drop_sql]
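
A minimal usage sketch of the MySQL loader. Note that OVERWRITE mode swaps tables via RENAME TABLE rather than truncating, so the target is replaced near-atomically. How new_mysql_connector resolves credentials when connector is omitted is not shown in this diff; that, the execute() entry point, and all identifiers below are assumptions.

# Hypothetical usage; credential resolution in new_mysql_connector is assumed.
from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_mysql import CSVToMySQLLoader

loader = CSVToMySQLLoader(
    database="analytics",
    table="fact_orders",
    filename="/tmp/fact_orders.csv",  # a plain path, or a pigeon CSV object
    mode=const.LOAD_OVERWRITE,        # RENAME-based swap, not TRUNCATE
)
loader.execute()  # assumed BaseLoader entry point that calls execute_impl()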