recurvedata_lib-0.1.487-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,248 @@
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.connector import new_postgresql_connector
+ from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+ from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+
+ allowed_modes = (const.LOAD_RENAME_OVERWRITE, const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+
+ class CSVToPostgresqlLoader(BaseLoader, CSVToDBAPIMixin):
+     def __init__(
+         self,
+         database,
+         table,
+         filename,
+         connector=None,
+         schema=None,
+         create_table_ddl=None,
+         mode=const.LOAD_OVERWRITE,
+         primary_keys=None,
+         using_insert=False,
+         insert_batch_size=1000,
+         insert_concurrency=1,
+         delete_file=False,
+         dedup=False,
+         dedup_uniq_keys=None,
+         dedup_orderby=None,
+         pre_queries=None,
+         post_queries=None,
+         *args,
+         **kwargs,
+     ):
+ """Loads a csv file into a Redshift table. Internally using a S3 bucket.
33
+
34
+ :param database: the target database name
35
+ :type database: str
36
+ :param table: target table name, should not contains database portion
37
+ :type table: str
38
+ :param filename: the absolute path to csv file
39
+ :type filename: str
40
+ :param connector: a PostgresConnector object used to query PG
41
+ :type connector: pigeon.connector.PostgresConnector
42
+ :param create_table_ddl: create table
43
+ :type create_table_ddl: str
44
+ :param mode: one of (LOAD_OVERWRITE, LOAD_MERGE, LOAD_APPEND)
45
+ :param primary_keys: columns that identifies a unique row, e.g. ['dt', 'product_id'].
46
+ Required if mode is LOAD_MERGE
47
+ :type primary_keys: list
48
+ :param delete_file: delete the CSV file after loading, default is True
49
+ :type delete_file: bool
50
+ :param dedup: remove duplicated records from staging table before being merged into target
51
+ :type dedup: bool
52
+ :param dedup_uniq_keys: columns that identifies a unique row.
53
+ :type dedup_uniq_keys: list
54
+ :param dedup_orderby: determine which row should be kept.
55
+ e.g. "to keep the row has minimal timestamp", then set `dedup_orderby='timestamp ASC'
56
+ :param pre_queries: queries executed before loading
57
+ :type pre_queries: list | str
58
+ :param post_queries: queries after loading
59
+ :type post_queries: list | str
60
+ """
61
+         self.database = database
+
+         if "." in table:
+             self.schema, self.table = table.split(".", 1)
+         else:
+             self.schema = schema or "public"
+             self.table = table
+
+         if connector is None:
+             connector = new_postgresql_connector(database=self.database)
+         else:
+             connector.database = self.database
+         self.connector = connector
+
+         self.filename = filename
+         self.create_table_ddl = create_table_ddl
+
+         if mode not in allowed_modes:
+             raise ValueError("mode should be one of ({})".format(allowed_modes))
+
+         self.mode = mode
+         self.primary_keys = ensure_str_list(primary_keys)
+         if self.mode == const.LOAD_MERGE and not self.primary_keys:
+             raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))
+
+         self.using_insert = using_insert
+         self.insert_batch_size = insert_batch_size
+         self.insert_concurrency = insert_concurrency
+         self.delete_file = delete_file
+
+         self.dedup = dedup
+         self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
+         self.dedup_orderby = dedup_orderby
+         if self.dedup and not self.dedup_uniq_keys:
+             raise ValueError("dedup_uniq_keys should not be empty")
+         if not self.dedup_orderby:
+             self.dedup_orderby = ", ".join(self.dedup_uniq_keys)
+
+         self.pre_queries = ensure_query_list(pre_queries) or []
+         self.post_queries = ensure_query_list(post_queries) or []
+
+         super().__init__()
+
+     def execute_impl(self):
+         if fs.is_file_empty(self.filename):
+             self.logger.error("file does not exist or has no content: %s", self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+             return
+         self._prepare_target_table()
+         self._prepare_staging_table()
+         self._merge_into_target_table()
+
+         # clean up local files
+         if self.delete_file:
+             self.logger.info("delete local file %s", self.filename)
+             fs.remove_files_safely(self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+
+     @property
+     def full_table_name(self):
+         return f"{self.schema}.{self.table}"
+
+     @property
+     def staging_table(self):
+         return f"{self.schema}.{self.table}_staging"
+
+     @property
+     def bak_table(self):
+         return f"{self.schema}.{self.table}_bak"
+
+     def _prepare_staging_table(self):
+         queries = """
+         DROP TABLE IF EXISTS {st};
+         CREATE TABLE {st} (LIKE {ft});
+         """.format(
+             st=self.staging_table, ft=self.full_table_name
+         )
+         self.connector.execute(queries, autocommit=True)
+
+         self.connector.load_csv(
+             table=self.staging_table,
+             filename=self.filename,
+             using_insert=self.using_insert,
+             null_values=("NULL", r"\N", ""),
+             batch_size=self.insert_batch_size,
+             concurrency=self.insert_concurrency,
+         )
+
+         if self.dedup:
+             dedup_query = self._construct_dedup_query(partition_keys=self.dedup_uniq_keys, order_by=self.dedup_orderby)
+             self.connector.execute(dedup_query, autocommit=False)
+
+     def _construct_dedup_query(self, partition_keys=None, order_by=None):
+         """Construct deduplication query with specified partition keys and order by clause"""
+         if partition_keys is None:
+             partition_keys = self.dedup_uniq_keys
+         if order_by is None:
+             order_by = self.dedup_orderby
+
+         partition_cols = []
+         for col in partition_keys:
+             partition_cols.append(self.connector.quote_identifier(col))
+         partition_by = ", ".join(partition_cols)
+
+         cols = self.connector.get_columns(self.staging_table)
+         tmp_table = f"{self.staging_table}_tmp"
+
+         query = f"""
+         DROP TABLE IF EXISTS {tmp_table};
+         CREATE TABLE {tmp_table} AS
+         SELECT {', '.join(self.connector.quote_identifier(x) for x in cols)}
+         FROM (
+             SELECT *, ROW_NUMBER() OVER(PARTITION BY {partition_by} ORDER BY {order_by}) AS rn
+             FROM {self.staging_table}
+         ) t
+         WHERE rn = 1;
+
+         TRUNCATE TABLE {self.staging_table};
+         INSERT INTO {self.staging_table} SELECT * FROM {tmp_table};
+         DROP TABLE IF EXISTS {tmp_table};
+         """
+         return query
+
+     def _merge_into_target_table(self):
+         queries = []
+         pure_bak_table = self.bak_table.split(".")[-1]
+         pure_full_table = self.full_table_name.split(".")[-1]
+         if self.mode == const.LOAD_OVERWRITE:
+             queries.append(f"DROP TABLE IF EXISTS {self.bak_table}")
+             queries.append(f"ALTER TABLE {self.full_table_name} RENAME TO {pure_bak_table}")
+             queries.append(f"ALTER TABLE {self.staging_table} RENAME TO {pure_full_table}")
+             queries.append(f"DROP TABLE IF EXISTS {self.bak_table}")
+         elif self.mode == const.LOAD_MERGE:
+             # Deduplicate the staging table on primary_keys before merging,
+             # ordering by the primary_keys themselves
+             order_by = ", ".join(self.connector.quote_identifier(col) for col in self.primary_keys)
+             dedup_query = self._construct_dedup_query(partition_keys=self.primary_keys, order_by=order_by)
+             queries.append(dedup_query)
+
+             joins = []
+             for field in self.primary_keys:
+                 join = "{target}.{field} = {staging}.{field}".format(
+                     target=self.full_table_name, staging=self.staging_table, field=field
+                 )
+                 joins.append(join)
+
+             join_conditions = " AND ".join(joins)
+             delete_sql = "DELETE FROM {target} USING {staging} WHERE {join_conditions}".format(
+                 target=self.full_table_name, staging=self.staging_table, join_conditions=join_conditions
+             )
+             queries.append(delete_sql)
+
+             # Insert data from the staging table into the target table
+             insert_sql = "INSERT INTO {target} SELECT * FROM {source}".format(
+                 target=self.full_table_name, source=self.staging_table
+             )
+             queries.append(insert_sql)
+             queries.append("DROP TABLE {}".format(self.staging_table))
+         else:
+             # LOAD_APPEND mode
+             append_sql = "INSERT INTO {target} SELECT * FROM {source}".format(
+                 target=self.full_table_name, source=self.staging_table
+             )
+             queries.append(append_sql)
+             queries.append("DROP TABLE {}".format(self.staging_table))
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running PostgreSQL queries...")
+         try:
+             self.connector.execute(queries, autocommit=False, commit_on_close=True)
+         except Exception:
+             self.logger.exception("failed to run queries")
+             raise
+         finally:
+             if (
+                 self.mode == const.LOAD_OVERWRITE
+                 and not self.connector.has_table(self.full_table_name)
+                 and self.connector.has_table(self.bak_table)
+             ):
+                 rename_sql = "ALTER TABLE {} RENAME TO {}".format(self.bak_table, pure_full_table)
+                 self.connector.execute(rename_sql, autocommit=False, commit_on_close=True)
+
+         try:
+             self.logger.info("running analyze")
+             analyze_queries = "VACUUM {t}; ANALYZE {t}".format(t=self.full_table_name)
+             self.connector.execute(analyze_queries, autocommit=True)
+         except Exception:
+             self.logger.exception("failed to run analyze queries")
@@ -0,0 +1,240 @@
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.connector import new_redshift_connector
+ from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+ from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+
+ class CSVToRedshiftLoader(BaseLoader, CSVToDBAPIMixin):
+     def __init__(
+         self,
+         database,
+         table,
+         filename,
+         redshift_connector=None,
+         schema=None,
+         create_table_ddl=None,
+         mode=const.LOAD_OVERWRITE,
+         primary_keys=None,
+         delete_file=False,
+         dedup=False,
+         dedup_uniq_keys=None,
+         dedup_orderby=None,
+         pre_queries=None,
+         post_queries=None,
+         *args,
+         **kwargs,
+     ):
+ """Loads a csv file into a Redshift table. Internally using a S3 bucket.
30
+
31
+ :param database: the target database name
32
+ :type database: str
33
+ :param table: target table name, should not contains database portion
34
+ :type table: str
35
+ :param filename: the absolute path to csv file
36
+ :type filename: str
37
+ :param redshift_connector: a RedshiftConnector object used to query Redshift
38
+ :type redshift_connector: pigeon.database.RedshiftConnector
39
+ :param create_table_ddl: create table
40
+ :type create_table_ddl: str
41
+ :param mode: one of (LOAD_OVERWRITE, LOAD_MERGE, LOAD_APPEND)
42
+ :param primary_keys: columns that identifies a unique row, e.g. ['dt', 'product_id'].
43
+ Required if mode is LOAD_MERGE
44
+ :type primary_keys: list
45
+ :param delete_file: delete the CSV file after loading, default is True
46
+ :type delete_file: bool
47
+ :param dedup: remove duplicated records from staging table before being merged into target
48
+ :type dedup: bool
49
+ :param dedup_uniq_keys: columns that identifies a unique row.
50
+ :type dedup_uniq_keys: list
51
+ :param dedup_orderby: determine which row should be kept.
52
+ e.g. "to keep the row has minimal timestamp", then set `dedup_orderby='timestamp ASC'
53
+ :param pre_queries: queries executed before loading
54
+ :type pre_queries: list | str
55
+ :param post_queries: queries after loading
56
+ :type post_queries: list | str
57
+ """
58
+         self.database = database
+
+         if "." in table:
+             self.schema, self.table = table.split(".", 1)
+         else:
+             self.schema = schema or "public"
+             self.table = table
+
+         if redshift_connector is None:
+             redshift_connector = new_redshift_connector(database=self.database)
+         else:
+             redshift_connector.database = self.database
+         self.redshift = redshift_connector
+
+         self.filename = filename
+         self.create_table_ddl = create_table_ddl
+
+         if mode not in allowed_modes:
+             raise ValueError("mode should be one of ({})".format(allowed_modes))
+
+         self.mode = mode
+         self.primary_keys = ensure_str_list(primary_keys)
+         if self.mode == const.LOAD_MERGE and not self.primary_keys:
+             raise ValueError("primary_keys should not be empty in mode {}".format(const.LOAD_MERGE))
+
+         self.delete_file = delete_file
+
+         self.dedup = dedup
+         self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
+         self.dedup_orderby = dedup_orderby
+         if self.dedup and not self.dedup_uniq_keys:
+             raise ValueError("dedup_uniq_keys should not be empty")
+         if not self.dedup_orderby:
+             self.dedup_orderby = ", ".join(self.dedup_uniq_keys)
+
+         self.pre_queries = ensure_query_list(pre_queries) or []
+         self.post_queries = ensure_query_list(post_queries) or []
+
+         super().__init__()
+
+     def execute_impl(self):
+         if fs.is_file_empty(self.filename):
+             self.logger.error("file does not exist or has no content: %s", self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+             return
+         self._prepare_target_table()
+         self._prepare_staging_table()
+         self._merge_into_target_table()
+
+         # clean up local files
+         if self.delete_file:
+             self.logger.info("delete local file %s", self.filename)
+             fs.remove_files_safely(self.filename)
+             fs.remove_files_safely(fs.schema_filename(self.filename))
+
+     @property
+     def connector(self):
+         return self.redshift
+
+     @property
+     def full_table_name(self):
+         return f"{self.schema}.{self.table}"
+
+     @property
+     def staging_table(self):
+         return f"{self.schema}.{self.table}_staging"
+
+     @property
+     def bak_table(self):
+         return f"{self.schema}.{self.table}_bak"
+
+     def _prepare_staging_table(self):
+         queries = """
+         DROP TABLE IF EXISTS {st};
+         CREATE TABLE {st} (LIKE {ft});
+         """.format(
+             st=self.staging_table, ft=self.full_table_name
+         )
+         self.redshift.execute(queries, autocommit=True)
+
+         self.redshift.load_csv(table=self.staging_table, filename=self.filename, null_values=("NULL", r"\N", ""))
+
+         if self.dedup:
+             dedup_query = self._construct_dedup_query(partition_keys=self.dedup_uniq_keys, order_by=self.dedup_orderby)
+             self.redshift.execute(dedup_query, autocommit=False, commit_on_close=True)
+
+     def _construct_dedup_query(self, partition_keys=None, order_by=None):
+         """Construct deduplication query with specified partition keys and order by clause"""
+         if partition_keys is None:
+             partition_keys = self.dedup_uniq_keys
+         if order_by is None:
+             order_by = self.dedup_orderby
+
+         partition_cols = []
+         for col in partition_keys:
+             partition_cols.append(self.connector.quote_identifier(col))
+         partition_by = ", ".join(partition_cols)
+
+         cols = self.connector.get_columns(self.staging_table)
+         tmp_table = f"{self.staging_table}_tmp"
+
+         query = f"""
+         DROP TABLE IF EXISTS {tmp_table};
+         CREATE TABLE {tmp_table} AS
+         SELECT {', '.join(self.connector.quote_identifier(x) for x in cols)}
+         FROM (
+             SELECT *, ROW_NUMBER() OVER(PARTITION BY {partition_by} ORDER BY {order_by}) AS rn
+             FROM {self.staging_table}
+         ) t
+         WHERE rn = 1;
+
+         TRUNCATE TABLE {self.staging_table};
+         INSERT INTO {self.staging_table} SELECT * FROM {tmp_table};
+         DROP TABLE IF EXISTS {tmp_table};
+         """
+         return query
+
+     def _merge_into_target_table(self):
+         queries = []
+         pure_bak_table = self.bak_table.split(".")[-1]
+         pure_full_table = self.full_table_name.split(".")[-1]
+         if self.mode == const.LOAD_OVERWRITE:
+             queries.append(f"DROP TABLE IF EXISTS {self.bak_table}")
+             queries.append(f"ALTER TABLE {self.full_table_name} RENAME TO {pure_bak_table}")
+             queries.append(f"ALTER TABLE {self.staging_table} RENAME TO {pure_full_table}")
+             queries.append(f"DROP TABLE IF EXISTS {self.bak_table}")
+         elif self.mode == const.LOAD_MERGE:
+             # Deduplicate the staging table on primary_keys before merging,
+             # ordering by the primary_keys themselves
+             order_by = ", ".join(self.connector.quote_identifier(col) for col in self.primary_keys)
+             dedup_query = self._construct_dedup_query(partition_keys=self.primary_keys, order_by=order_by)
+             queries.append(dedup_query)
+
+             joins = []
+             for field in self.primary_keys:
+                 join = "{target}.{field} = {staging}.{field}".format(
+                     target=self.full_table_name, staging=self.staging_table, field=field
+                 )
+                 joins.append(join)
+
+             join_conditions = " AND ".join(joins)
+             # Delete existing records that match the primary keys
+             delete_sql = "DELETE FROM {target} USING {staging} WHERE {join_conditions}".format(
+                 target=self.full_table_name, staging=self.staging_table, join_conditions=join_conditions
+             )
+             queries.append(delete_sql)
+
+             # Insert data from the staging table into the target table
+             insert_sql = "INSERT INTO {target} SELECT * FROM {source}".format(
+                 target=self.full_table_name, source=self.staging_table
+             )
+             queries.append(insert_sql)
+             queries.append("DROP TABLE {}".format(self.staging_table))
+         else:
+             # LOAD_APPEND mode
+             append_sql = "INSERT INTO {target} SELECT * FROM {source}".format(
+                 target=self.full_table_name, source=self.staging_table
+             )
+             queries.append(append_sql)
+             queries.append("DROP TABLE {}".format(self.staging_table))
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running Redshift queries...")
+         try:
+             self.redshift.execute(queries, autocommit=False, commit_on_close=True)
+         except Exception:
+             self.logger.exception("failed to run queries")
+             raise
+         finally:
+             if (
+                 self.mode == const.LOAD_OVERWRITE
+                 and not self.redshift.has_table(self.full_table_name)
+                 and self.redshift.has_table(self.bak_table)
+             ):
+                 rename_sql = "ALTER TABLE {} RENAME TO {}".format(self.bak_table, pure_full_table)
+                 self.redshift.execute(rename_sql, autocommit=False, commit_on_close=True)
+
+         try:
+             self.logger.info("running analyze")
+             analyze_queries = "VACUUM {t}; ANALYZE {t}".format(t=self.full_table_name)
+             self.redshift.execute(analyze_queries, autocommit=True)
+         except Exception:
+             self.logger.exception("failed to run analyze queries")