recurvedata_lib-0.1.487-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,51 @@
+import csv
+
+from recurvedata.pigeon.loader.base import BaseLoader
+from recurvedata.pigeon.schema import Schema
+from recurvedata.pigeon.utils import fs
+
+
+class CSVToElasticSearchLoader(BaseLoader):
+    def __init__(
+        self,
+        index,
+        doc_type,
+        filename,
+        connector,
+        id_field=None,
+        generate_id=False,
+        delete_file=False,
+        csv_options=None,
+    ):
+        self.index = index
+        self.doc_type = doc_type
+        self.filename = filename
+        self.id_field = id_field
+        self.generate_id = generate_id
+        self.delete_file = delete_file
+        self.es = connector
+
+        self.csv_options = csv_options or {"quoting": csv.QUOTE_ALL, "doublequote": True}
+
+        super().__init__()
+
+    def execute_impl(self):
+        schema_file = fs.schema_filename(self.filename)
+        if fs.exists(schema_file):
+            schema = Schema.load(schema_file)
+        else:
+            schema = None
+
+        self.es.load_csv(
+            self.filename,
+            self.index,
+            self.doc_type,
+            schema,
+            id_field=self.id_field,
+            generate_id=self.generate_id,
+            **self.csv_options,
+        )
+
+        if self.delete_file:
+            fs.remove_files_safely(self.filename)
+            fs.remove_files_safely(schema_file)
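
A hedged usage sketch for the loader above (by its 51-line size and class name, this hunk corresponds to recurvedata/pigeon/loader/csv_to_es.py in the file list). The connector object and the top-level entry point are assumptions: neither the Elasticsearch connector construction nor BaseLoader's public API appears in this hunk.

# Hypothetical usage sketch; only the CSVToElasticSearchLoader signature comes from the hunk above.
from recurvedata.pigeon.loader.csv_to_es import CSVToElasticSearchLoader

es_connector = ...  # an Elasticsearch connector exposing load_csv(filename, index, doc_type, schema, ...)

loader = CSVToElasticSearchLoader(
    index="events",
    doc_type="_doc",
    filename="/tmp/events.csv",
    connector=es_connector,
    id_field="event_id",   # use this CSV column as the document id
    delete_file=True,      # remove the CSV and its schema file after loading
)
loader.execute_impl()      # assumption: BaseLoader may also expose a public execute() wrapper

If a companion schema file (fs.schema_filename(filename)) exists next to the CSV, it is loaded and passed to the connector; otherwise the connector receives schema=None.
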
@@ -0,0 +1,169 @@
+from typing import TYPE_CHECKING, List, Union
+
+from recurvedata.pigeon import const
+from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+from recurvedata.pigeon.utils import ensure_query_list, ensure_str_list, fs
+from recurvedata.pigeon.utils.sql import bak_table_of, reconcile_table_of, staging_table_of
+
+if TYPE_CHECKING:
+    from recurvedata.pigeon.connector.google_bigquery import GoogleBigqueryConnector
+
+allowed_modes = (
+    const.LOAD_OVERWRITE,
+    const.LOAD_MERGE,
+    const.LOAD_APPEND
+)
+
+
+class CSVToGoogleBigqueryLoader(BaseLoader, CSVToDBAPIMixin):
+    def __init__(
+        self,
+        table: str,
+        filename: str,
+        google_bigquery_connector: 'GoogleBigqueryConnector' = None,
+        dataset: str = None,
+        create_table_ddl: str = None,
+        mode: str = const.LOAD_OVERWRITE,
+        primary_keys: Union[str, List[str]] = None,
+        columns: Union[str, List[str]] = None,
+        skiprows: int = 0,
+        delete_file: bool = True,
+        pre_queries: str = None,
+        post_queries: str = None,
+        *args, **kwargs
+    ):
+        self.table = table
+        self.dataset = dataset
+
+        self.google_bigquery = google_bigquery_connector
+
+        # determine table name of target table and staging table
+        self.filename = filename  # full file path
+        self.skiprows = skiprows
+
+        # determine table ddl stuff
+        self.create_table_ddl = create_table_ddl
+
+        # merge stuff
+        if mode not in allowed_modes:
+            raise ValueError(f'mode should be one of ({allowed_modes})')
+
+        self.mode = mode
+        self.primary_keys = ensure_str_list(primary_keys)
+        if self.mode == const.LOAD_MERGE and not self.primary_keys:
+            raise ValueError('primary_keys should not be empty in mode {}'.format(const.LOAD_MERGE))
+
+        self.columns = ensure_str_list(columns)
+
+        self.pre_queries = ensure_query_list(pre_queries) or []
+        self.post_queries = ensure_query_list(post_queries) or []
+
+        self.delete_file = delete_file
+
+        super().__init__()
+
+    def execute_impl(self):
+        if fs.is_file_empty(self.filename):
+            self.logger.error('file not exists or has no content. %s', self.filename)
+            fs.remove_files_safely(fs.schema_filename(self.filename))
+            return
+
+        self._prepare_target_table()
+        self._prepare_staging_table()
+        self._merge_into_target_table()
+
+        # do cleaning things
+        if self.delete_file:
+            self.logger.info('delete local file %s', self.filename)
+            fs.remove_files_safely(self.filename)
+            fs.remove_files_safely(fs.schema_filename(self.filename))
+
+    @property
+    def connector(self):
+        return self.google_bigquery
+
+    @property
+    def staging_table(self):
+        return staging_table_of(self.table)
+
+    @property
+    def full_staging_table_name(self):
+        return f'{self.dataset}.{self.staging_table}'
+
+    @property
+    def full_table_name(self):
+        return f'{self.dataset}.{self.table}'
+
+    def _prepare_target_table(self):
+        # add schema for azure data warehouse, dataset for google bigquery
+        if self.connector.has_table(table=self.table, schema=getattr(self, 'schema', None),
+                                    dataset=getattr(self, 'dataset', None)):
+            return
+
+        self.logger.info('table not found, try to create it')
+        ddl = self._infer_create_table_ddl()
+        if not ddl:
+            raise ValueError('table not found, create_table_ddl is required')
+        ddl = ddl.strip().rstrip(';')
+        self.logger.info('create table ddl: %s\n', ddl)
+        with self.connector.cursor() as cursor:
+            cursor.execute(ddl)
+
+    def _prepare_staging_table(self):
+        dataset, table = self.full_staging_table_name.split('.')
+        drop = f'DROP TABLE IF EXISTS {self.full_staging_table_name}'
+        staging_ddl = f'CREATE TABLE IF NOT EXISTS {self.full_staging_table_name} LIKE {self.full_table_name}'
+        self.connector.execute([drop, staging_ddl], auto_commit=False, commit_on_close=True)
+
+        self.logger.info(f'load {self.filename} into staging table {self.full_staging_table_name}')
+        self.connector.load_csv(table=self.full_staging_table_name,
+                                filename=self.filename,
+                                schema=self.connector.get_schema(table, dataset),
+                                skiprows=self.skiprows)
+
+    def _merge_into_target_table(self):
+        target = self.full_table_name
+        staging = self.full_staging_table_name
+
+        append_sql = f'INSERT INTO {target} SELECT * FROM {staging}'
+        if self.mode == const.LOAD_OVERWRITE:
+            queries = [f'TRUNCATE TABLE {target}', append_sql]
+        elif self.mode == const.LOAD_MERGE:
+            queries = self._ingest_by_merging()
+        else:
+            # APPEND
+            queries = [append_sql]
+
+        queries.append(f'DROP TABLE {staging}')
+
+        queries = self.pre_queries + queries + self.post_queries
+        self.logger.info('running Google Bigquery queries...')
+        self.connector.execute(queries)
+        self.logger.info('done.')
+
+    def _ingest_by_merging(self):
+        reconcile = reconcile_table_of(self.table)
+        bak = bak_table_of(self.table)
+
+        quote = self.connector.quote_identifier
+        join = ' AND '.join([f'a.{quote(x)} = b.{quote(x)}' for x in self.primary_keys])
+
+        queries = f"""
+        DROP TABLE IF EXISTS {self.dataset}.{reconcile};
+        CREATE TABLE IF NOT EXISTS {self.dataset}.{reconcile} LIKE {self.full_table_name};
+
+        INSERT INTO {self.dataset}.{reconcile}
+        SELECT a.*
+        FROM {self.full_table_name} AS a
+        LEFT JOIN {self.full_staging_table_name} AS b ON {join}
+        WHERE b.{quote(self.primary_keys[0])} IS NULL
+        UNION ALL
+        SELECT * FROM {self.full_staging_table_name};
+
+        ALTER TABLE {self.full_table_name} RENAME TO {bak};
+        ALTER TABLE {self.dataset}.{reconcile} RENAME TO {self.table};
+
+        DROP TABLE IF EXISTS {self.dataset}.{bak};
+        DROP TABLE IF EXISTS {self.dataset}.{reconcile};
+        """
+        return queries.split(';')
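
A hedged usage sketch for the BigQuery loader above (the 169-line hunk matching recurvedata/pigeon/loader/csv_to_google_bigquery.py). Only the class and argument names come from the hunk; the connector construction and the dataset/table values are illustrative assumptions.

# Hypothetical usage sketch; the GoogleBigqueryConnector instance is assumed to be created elsewhere.
from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_google_bigquery import CSVToGoogleBigqueryLoader

bq = ...  # a GoogleBigqueryConnector from recurvedata.pigeon.connector.google_bigquery

loader = CSVToGoogleBigqueryLoader(
    table="orders",
    dataset="analytics",
    filename="/tmp/orders.csv",
    google_bigquery_connector=bq,
    mode=const.LOAD_MERGE,       # upsert semantics
    primary_keys=["order_id"],   # required when mode is LOAD_MERGE
)
loader.execute_impl()            # assumption: BaseLoader may also expose a public execute() wrapper

In LOAD_MERGE mode the loader stages the CSV into {dataset}.{staging_table}, builds a reconcile table from the target rows that have no match in staging plus all staged rows, then swaps it in with ALTER TABLE ... RENAME, as _ingest_by_merging above shows.
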
@@ -0,0 +1,468 @@
+import functools
+import glob
+import json
+import os
+import tempfile
+from json.decoder import JSONDecodeError
+from typing import TYPE_CHECKING, Dict, List, Union
+
+import cytoolz as toolz
+from slugify import slugify
+
+from recurvedata.pigeon import const
+from recurvedata.pigeon.connector import new_hive_connector, new_impala_connector
+from recurvedata.pigeon.csv import CSV
+from recurvedata.pigeon.handler.csv_handler import convert_csv_to_hive_textfile
+from recurvedata.pigeon.loader.base import BaseLoader, CSVToDBAPIMixin
+from recurvedata.pigeon.utils import ensure_list, ensure_query_list, ensure_str_list, fs, silent
+from recurvedata.pigeon.utils.sql import reconcile_table_of, staging_table_of
+
+if TYPE_CHECKING:
+    from recurvedata.pigeon.connector.hive_impala import HiveConnector, ImpalaConnector
+
+allowed_modes = (
+    const.LOAD_OVERWRITE,
+    const.LOAD_MERGE,
+    const.LOAD_APPEND
+)
+
+AUTO = object()
+
+
+def _enable_connection_pooling(method):
+    @functools.wraps(method)
+    def inner(self: 'CSVToHiveLoader', *args, **kwargs):
+        self.hive.enable_connection_pooling(reset_on_return=False)
+        self.impala.enable_connection_pooling(reset_on_return=False)
+        try:
+            return method(self, *args, **kwargs)
+        except BaseException as e:
+            raise e
+        finally:
+            self.hive.dispose()
+            self.impala.dispose()
+
+    return inner
+
+
+class CSVToHiveLoader(BaseLoader, CSVToDBAPIMixin):
+    def __init__(
+        self,
+        database: str,
+        table: str,
+        filename: str,
+        hive_connector: 'HiveConnector' = None,
+        impala_connector: 'ImpalaConnector' = None,
+        create_table_ddl: str = None,
+        dynamic_partition: str = None,
+        partition: Dict = None,
+        mode: str = const.LOAD_OVERWRITE,
+        primary_keys: List[str] = None,
+        using_impala: bool = AUTO,
+        delete_file: bool = False,
+        dedup: bool = False,
+        dedup_uniq_keys: List[str] = None,
+        dedup_orderby: str = None,
+        pre_queries: Union[str, List[str]] = None,
+        post_queries: Union[str, List[str]] = None,
+        is_std_csv: bool = False,
+        has_header: bool = False,
+        csv_options: Dict = None,
+        compression_codec: str = "snappy",
+        dumper_meta: Dict = None,
+        refresh_impala_metadata: bool = True
+    ):
+        """Loads a CSV file into a Hive table.
+
+        :param database: the target database name
+        :param table: target table name, should not contain the database portion
+        :param filename: the absolute path to the CSV file, can be a single string or a list of strings
+        :param hive_connector: a HiveConnector object used to query Hive
+        :param impala_connector: an ImpalaConnector object used to query Impala
+        :param create_table_ddl: CREATE TABLE DDL used to create the target table if it does not exist
+        :param dynamic_partition: dynamic partition spec, a comma-separated string like 'site,month'
+        :param partition: static partition spec, should be a dict like {'dt': '2017-01-01'}
+        :param mode: one of (LOAD_OVERWRITE, LOAD_MERGE, LOAD_APPEND)
+        :param primary_keys: columns that identify a unique row, e.g. ['dt', 'product_id']. Required if mode is LOAD_MERGE
+        :param using_impala: whether to use Impala to merge data. Possible values:
+            - `AUTO` (default): determined by whether the table has complex type fields.
+            - `True`: use Impala; will fail if the table has complex type fields
+            - `False` and other values: fall back to Hive
+        :param delete_file: delete the CSV file after loading, default is False
+        :param dedup: remove duplicated records from the staging table before merging into the target
+        :param dedup_uniq_keys: columns that identify a unique row.
+        :param dedup_orderby: determines which row should be kept,
+            e.g. to keep the row with the minimal timestamp, set `dedup_orderby='timestamp ASC'`
+        :param pre_queries: queries executed before loading
+        :param post_queries: queries executed after loading
+        :param is_std_csv: whether the input file is a standard CSV file; if False it is treated as a Hive TextFile
+        :param compression_codec: compression codec, one of {none, snappy, gzip}
+        :param dumper_meta: dumper output and options such as check_dumper_row
+        """
+        self.database = database
+        self.table = table
+
+        if hive_connector is None:
+            hive_connector = new_hive_connector(database=self.database)
+        else:
+            hive_connector.database = self.database
+        self.hive = hive_connector
+
+        if impala_connector is None:
+            impala_connector = new_impala_connector(database=self.database)
+        else:
+            impala_connector.database = self.database
+        self.impala = impala_connector
+        self.refresh_impala_metadata = refresh_impala_metadata
+
+        self.filename = filename
+        self._local_data_files = self._determine_local_data_files()
+        self._schema_filename = self._infer_schema_filename()
+
+        self.is_std_csv = is_std_csv
+        self.has_header = has_header
+        self.csv_options = csv_options or {}
+
+        self.create_table_ddl = create_table_ddl
+        # the original `partition` argument is stored as `static_partition`; the constructor keeps the old name to avoid touching a large amount of calling code
+        self.static_partition = partition
+        if dynamic_partition:
+            self.dynamic_partition = dynamic_partition.split(',')
+        else:
+            self.dynamic_partition = None
+        if self.static_partition and self.dynamic_partition:
+            raise ValueError('Partition mode only be static or dynamic')
+        if self.dynamic_partition and not self.connector.is_table_partitioned(self.database, self.table):
+            if not create_table_ddl or 'partitioned by' not in self.create_table_ddl.lower():
+                raise ValueError("Table not found or is not partitioned, create_table_ddl is required and "
+                                 "assign partition columns when use dynamic partition mode")
+
+        if mode not in allowed_modes:
+            raise ValueError('mode should be one of ({})'.format(allowed_modes))
+
+        self.mode = mode
+        self.primary_keys = ensure_str_list(primary_keys)
+        if self.mode == const.LOAD_MERGE and not self.primary_keys:
+            raise ValueError('primary_keys should not be empty in mode {}'.format(const.LOAD_MERGE))
+        if self.mode == const.LOAD_MERGE and (self.static_partition or self.dynamic_partition):
+            raise ValueError('merge into partitioned table is not supported')
+
+        self.delete_file = delete_file
+        self.using_impala = using_impala
+
+        self.dedup = dedup
+        self.dedup_uniq_keys = ensure_str_list(dedup_uniq_keys)
+        self.dedup_orderby = dedup_orderby
+        if self.dedup and not self.dedup_uniq_keys:
+            raise ValueError('dedup_uniq_keys should not be empty')
+        if not self.dedup_orderby:
+            self.dedup_orderby = ', '.join(self.dedup_uniq_keys)
+
+        self.pre_queries = ensure_query_list(pre_queries) or []
+        self.post_queries = ensure_query_list(post_queries) or []
+        self.compression_codec = compression_codec
+        self.dumper_meta = dumper_meta
+
+        super().__init__()
+
+    @property
+    def schema_filename(self) -> str:
+        return self._schema_filename
+
+    @_enable_connection_pooling
+    def execute_impl(self):
+        if all([fs.is_file_empty(x) for x in self._local_data_files]):
+            self.logger.error('file not exists or has no content. %s', self.filename)
+            self._cleanup()
+            return
+
+        self._prepare_target_table()
+        self._check_target_table_cols_num()
+        self._prepare_staging_table()
+        self._merge_into_target_table()
+        if self.refresh_impala_metadata:
+            self._compute_stats()
+
+        if self.delete_file:
+            self._cleanup()
+
+    @property
+    def slugify_partition(self) -> str:
+        if self.static_partition is None:
+            return ''
+        names = [slugify(str(value), separator='') for _, value in self.static_partition.items()]
+        return '_'.join(names)
+
+    @property
+    def staging_table(self) -> str:
+        if not self.static_partition:
+            table_name = staging_table_of(self.table)
+        else:
+            table_name = staging_table_of(f'{self.table}_{self.slugify_partition}')
+        return table_name[:120]
+
+    @property
+    def reconciled_table(self) -> str:
+        if not self.static_partition:
+            table_name = reconcile_table_of(self.table)
+        else:
+            table_name = reconcile_table_of(f'{self.table}_{self.slugify_partition}')
+        return table_name[:120]
+
+    @property
+    def connector(self) -> 'HiveConnector':
+        return self.hive
+
+    def _determine_local_data_files(self) -> List[str]:
+        if isinstance(self.filename, str) and os.path.isdir(self.filename):
+            raise TypeError('filename should be either a single path or a list of paths, directory is not supported')
+
+        # ignore the empty or non-existent files
+        files = [x for x in ensure_list(self.filename) if not x.endswith('.schema') and not fs.is_file_empty(x)]
+
+        # make sure the first file is not empty
+        files.sort(key=lambda x: os.path.getsize(x), reverse=True)
+        return files
+
+    def _infer_schema_filename(self) -> str:
+        if self._local_data_files:
+            f = self._local_data_files[0]
+        elif self.filename:
+            f = ensure_list(self.filename)[0]
+        else:
+            return None
+        return fs.schema_filename(os.path.splitext(f)[0])
+
+    def _cleanup(self):
+        fs.remove_files_safely(self.filename)
+        fs.remove_files_safely(self._schema_filename)
+
+    def _check_target_table_cols_num(self):
+        # get the column count of the target table
+        if not self.static_partition:
+            exclude = None
+        else:
+            exclude = self.static_partition.keys()
+        target_table_cols = self.connector.get_columns(table=self.table, database=self.database, exclude=exclude)
+
+        # parse the schema file to get the number of fields
+        if not fs.is_file_empty(self._schema_filename):
+            with open(self._schema_filename) as f:
+                try:
+                    schema_fields = json.load(f)
+                    if len(schema_fields) == len(target_table_cols):
+                        return
+                except JSONDecodeError:
+                    pass
+
+        # parse the CSV data file to get the number of columns
+        if self.is_std_csv:
+            cf = CSV(self._local_data_files[0], **self.csv_options)
+            with cf.reader(as_dict=False) as reader:
+                row = next(reader)
+            schema_fields_num = len(row)
+        else:
+            # Hive-formatted CSV
+            with open(self._local_data_files[0]) as f:
+                line = next(f)
+            schema_fields_num = len(line.split(const.HIVE_FIELD_DELIMITER))
+        if schema_fields_num != len(target_table_cols):
+            raise Exception(f'number of columns mismatch, target table has {len(target_table_cols)} columns,'
+                            f' while data file has {schema_fields_num}')
+
+    def _prepare_staging_table(self):
+        staging_table = self.hive.quote_identifier(self.staging_table)
+        queries = [
+            f"DROP TABLE IF EXISTS {staging_table} PURGE;"
+        ]
+        exclude_columns = self.static_partition.keys() if self.static_partition else None
+        staging_ddl = self.hive.generate_load_staging_table_ddl(staging_table, self.table, self.database,
+                                                                exclude_columns=exclude_columns)
+        queries.append(staging_ddl)
+        self.hive.execute(queries)
+
+        path_to_load = self._local_data_files
+        if self.is_std_csv:
+            self.logger.info('got standard CSV file, convert to Hive text file before loading')
+            prefix = os.path.splitext(os.path.basename(self._local_data_files[0]))[0]
+            tmp_folder = tempfile.mkdtemp(prefix=f'{prefix}_', dir=os.path.dirname(self._local_data_files[0]))
+            if os.path.exists(tmp_folder):
+                self.logger.warning(f'tmp folder {tmp_folder} already exists, will overwrite any files if exist')
+                fs.remove_folder_safely(tmp_folder)
+            os.makedirs(tmp_folder, exist_ok=True)
+
+            for cf in self._local_data_files:
+                convert_csv_to_hive_textfile(cf, folder=tmp_folder, replace=False,
+                                             has_header=self.has_header, **self.csv_options)
+            path_to_load = glob.glob(os.path.join(tmp_folder, '*'))
+            self.logger.info(f'the real files to be loaded into {self.staging_table} are {path_to_load}')
+
+        self.hive.load_local_file(self.staging_table, path_to_load)
+
+        if self._determine_using_impala():
+            self.impala.execute(f'INVALIDATE METADATA {self.impala.quote_identifier(self.staging_table)}')
+
+        self._check_staging_table_rows()
+
+        # remove the temp files
+        if path_to_load != self._local_data_files:
+            self.logger.info(f'delete {path_to_load} after being loaded to {self.staging_table}')
+            fs.remove_folder_safely(os.path.dirname(path_to_load[0]))
+
+    def _construct_dedup_query(self) -> str:
+        partition_cols = []
+        for col in self.dedup_uniq_keys:
+            partition_cols.append(self.hive.quote_identifier(col))
+        partition_by = ', '.join(partition_cols)
+
+        cols = self.hive.get_columns(self.staging_table)
+        staging_table = self.hive.quote_identifier(self.staging_table)
+
+        query = f'''
+            WITH t AS (
+                SELECT *, ROW_NUMBER() OVER(PARTITION BY {partition_by} ORDER BY {self.dedup_orderby}) AS rnk
+                FROM {staging_table}
+            )
+            INSERT OVERWRITE TABLE {staging_table}
+            SELECT {', '.join(self.hive.quote_identifier(x) for x in cols)}
+            FROM t WHERE rnk = 1
+        '''
+        return query
+
+    def _get_compression_sqls(self) -> List[str]:
+        using_impala = self._determine_using_impala()
+        compression_sqls = []
+        if using_impala:
+            allow_text = "SET ALLOW_UNSUPPORTED_FORMATS=True"
+            set_codec = "SET COMPRESSION_CODEC = {}".format(self.compression_codec)
+            compression_sqls = [allow_text, set_codec]
+        else:
+            if self.compression_codec != "none" and self._is_low_hive_version():
+                set_codec = "SET parquet.compression = {}".format(self.compression_codec)
+                compression_sqls = [set_codec]
+        return compression_sqls
+
+    def _merge_into_target_table(self):
+        if self.dedup:
+            self.pre_queries.append(self._construct_dedup_query())
+
+        if self.mode in (const.LOAD_OVERWRITE, const.LOAD_APPEND):
+            queries = self._ingest_by_overwriting_appending()
+        else:
+            queries = self._ingest_by_merging()
+
+        queries.append('DROP TABLE IF EXISTS {} PURGE'.format(self.hive.quote_identifier(self.staging_table)))
+        all_queries = self.pre_queries + queries + self.post_queries
+        self._execute_merge_queries(all_queries)
+
+    def _ingest_by_overwriting_appending(self) -> List[str]:
+        compression_sqls = self._get_compression_sqls()
+        insert_mode = {
+            const.LOAD_OVERWRITE: 'OVERWRITE',
+            const.LOAD_APPEND: 'INTO'
+        }
+        partition = ''
+        if self.static_partition:
+            spec = ', '.join([f'{self.hive.quote_identifier(k)}={repr(v)}' for k, v in self.static_partition.items()])
+            partition = f'PARTITION ({spec})'
+        elif self.dynamic_partition:
+            spec = ', '.join(self.hive.quote_identifier(p) for p in self.dynamic_partition)
+            partition = f'PARTITION ({spec})'
+
+        queries = []
+        if not self._determine_using_impala():
+            queries.append('SET hive.exec.dynamic.partition.mode=nonstrict')
+        sql = 'INSERT {mode} TABLE {table} {partition} SELECT * FROM {staging}'.format(
+            mode=insert_mode[self.mode], partition=partition,
+            table=self.hive.quote_identifier(self.table),
+            staging=self.hive.quote_identifier(self.staging_table))
+        queries.append(sql)
+        return compression_sqls + queries
+
+    def _ingest_by_merging(self) -> List[str]:
+        reconcile = self.reconciled_table
+        join = ' AND '.join(
+            [f'a.{self.hive.quote_identifier(x)} = b.{self.hive.quote_identifier(x)}' for x in self.primary_keys])
+        sql = '''
+            DROP TABLE IF EXISTS {reconcile} PURGE;
+            CREATE TABLE {reconcile} STORED AS PARQUET AS
+            SELECT a.* FROM {table} a LEFT OUTER JOIN {staging} b ON {join} WHERE b.{pk} IS NULL
+            UNION ALL
+            SELECT * FROM {staging};
+            {compression_sqls};
+            INSERT OVERWRITE TABLE {table} SELECT * FROM {reconcile};
+            DROP TABLE IF EXISTS {reconcile} PURGE;
+        '''.format(reconcile=self.hive.quote_identifier(reconcile),
+                   table=self.hive.quote_identifier(self.table),
+                   staging=self.hive.quote_identifier(self.staging_table),
+                   compression_sqls=";".join(self._get_compression_sqls()),
+                   # bak=self.hive.quote_identifier('{}_bak'.format(self.table)),
+                   join=join,
+                   pk=f'{self.hive.quote_identifier(self.primary_keys[0])}')
+        queries = sql.split(';')
+        return queries
+
+    def _execute_merge_queries(self, queries: List[str]):
+        using_impala = self._determine_using_impala()
+        if using_impala:
+            # staging_update_meta = f'INVALIDATE METADATA {self.impala.quote_identifier(self.staging_table)}'
+            # self.impala.execute(staging_update_meta)
+            self.impala.refresh(self.table, compute_stats=False)
+            self.impala.execute(queries)
+        else:
+            if self.dynamic_partition:
+                allow_dynamic_partition_queries_list = ['SET hive.exec.dynamic.partition=true',
+                                                        'SET hive.exec.dynamic.partition.mode=nonstrict']
+                queries = allow_dynamic_partition_queries_list + queries
+
+            self.hive.execute(queries)
+
+    @toolz.memoize
+    def _is_low_hive_version(self):
+        """
+        For Hive versions below 2.3.0, Parquet compression can only be changed dynamically via SET parquet.compression = "xx";
+        from 2.3.0 onwards it can only be specified when the table is created.
+        """
+        result = self.hive.fetchall('SELECT version()')
+        self.logger.info(f"current hive's version: {result[0][0]}")
+        return result[0][0] < "2.3.0"
+
+    @toolz.memoize
+    def _determine_using_impala(self) -> bool:
+        if self.impala is None:
+            self.logger.info('impala connector is not set')
+            return False
+
+        if self.using_impala is True:
+            self.logger.info('`using_impala` is set to True by caller')
+            return True
+
+        if self.using_impala is AUTO:
+            self.logger.info('`using_impala` is set to AUTO, checking complex type fields')
+            if not self.hive.has_complex_type_fields(self.table):
+                self.logger.info('found no complex type fields, happy to use Impala')
+                return True
+            self.logger.info('detected complex type fields, fallback to using Hive')
+        return False
+
+    @silent()
+    def _compute_stats(self):
+        self.impala.refresh(self.table, True)
+
+    def _check_staging_table_rows(self):
+        if not self.dumper_meta:
+            return
+        check_dumper_row: bool = self.dumper_meta.get('check_dumper_row', True)
+        dumper_rows: int = self.dumper_meta.get('dumper_output_rows')
+        if not (check_dumper_row and dumper_rows):
+            return
+        staging_table = self.impala.quote_identifier(self.staging_table)
+        if self._determine_using_impala():
+            # self.impala.execute(f'INVALIDATE METADATA {staging_table}')
+            staging_table_cnt, = self.impala.fetchone(f'SELECT COUNT(1) AS cnt FROM {staging_table}')
+        else:
+            staging_table_cnt, = self.hive.fetchone(f'SELECT COUNT(1) AS cnt FROM {staging_table}')
+        if staging_table_cnt != dumper_rows:
+            raise ValueError(f'staging table {staging_table} cnt {staging_table_cnt} != dumper_rows {dumper_rows} '
+                             'maybe something wrong when load csv to staging table, please retry')
+        self.logger.info(f'staging_table {staging_table} cnt {staging_table_cnt} equals with dumper_output')
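
A hedged usage sketch for the Hive loader above (the 468-line hunk matching recurvedata/pigeon/loader/csv_to_hive.py). Values are illustrative; the Hive and Impala connectors are created via new_hive_connector/new_impala_connector when none are passed, exactly as the constructor shows.

# Hypothetical usage sketch; only the constructor signature comes from the hunk above.
from recurvedata.pigeon import const
from recurvedata.pigeon.loader.csv_to_hive import CSVToHiveLoader

loader = CSVToHiveLoader(
    database="dw",
    table="fact_orders",
    filename=["/data/fact_orders_000.csv", "/data/fact_orders_001.csv"],
    partition={"dt": "2024-01-01"},  # static partition spec, stored as static_partition
    mode=const.LOAD_OVERWRITE,
    is_std_csv=True,                 # convert standard CSV to Hive TextFile format before loading
    delete_file=True,
)
loader.execute_impl()                # assumption: BaseLoader may also expose a public execute() wrapper

The loader stages the files into a staging table, optionally deduplicates there, then either overwrites/appends into the (possibly partitioned) target or rebuilds it through a reconcile table in merge mode, preferring Impala when the table has no complex-type fields.
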