recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,571 @@
1
+ import csv
2
+ import itertools
3
+ import pickle
4
+ import re
5
+ import threading
6
+ from queue import Full, Queue
7
+
8
+ import sqlalchemy
9
+ import sqlalchemy.engine.url
10
+ import sqlparse
11
+ from sqlalchemy.pool import QueuePool
12
+
13
+ from recurvedata.pigeon.schema import Schema
14
+ from recurvedata.pigeon.utils import LoggingMixin, replace_null_values, trim_prefix, trim_suffix
15
+ from recurvedata.pigeon.utils.timing import TimeCounter
16
+
17
+
18
class NullCursor(LoggingMixin):
    """
    NullCursor implements DBAPI Cursor but does nothing at all.

    Statements passed to it are only rendered and logged; every fetch
    reports that there are no rows.
    """

    def execute(self, operation, args=None, **kwargs):
        """Log the statement instead of running it and return 0.

        When ``args`` is not None it is interpolated with ``operation % args``
        purely to build the log line. Extra keyword arguments are ignored.
        """
        sql = operation if args is None else operation % args
        self.logger.info(sql)
        return 0

    def executemany(self, query, args):
        """Log ``query`` once per parameter set and return 0."""
        if not args:
            # Return an int on this path as well, so callers always get the
            # same type back (this used to fall through and return None).
            return 0
        return sum(self.execute(query, arg) for arg in args)

    def fetchone(self):
        """Return None: there is never a row to fetch."""
        return None

    def fetchmany(self, size=None):
        """Return an empty list.

        PEP 249 specifies that ``fetchmany`` returns an empty sequence when no
        rows are available; returning None broke callers that iterate over or
        take ``len()`` of the result.
        """
        return []

    def fetchall(self):
        """Return an empty list."""
        return []

    def __iter__(self):
        # fetchone() always returns the sentinel, so the iterator is empty.
        return iter(self.fetchone, None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Log that the cursor is being closed; there is nothing to release."""
        self.logger.info("closing null cursor")
56
+
57
+
58
class _ShowTableLikeMixin(object):
    """Mixin that implements ``has_table`` with ``SHOW TABLES LIKE``.

    The host class must provide ``cursor()`` and a ``database`` attribute.
    """

    def has_table(self, table, database=None, cursor=None, **kwargs):
        """Return True if ``SHOW TABLES LIKE '<table>'`` yields any row.

        :param table: table name; interpolated verbatim into the statement,
            so it must come from a trusted source
        :param database: when given and different from ``self.database``, a
            ``USE <database>`` statement is issued on the cursor first
        :param cursor: cursor to run on; when omitted one is obtained from
            ``self.cursor()`` and closed before returning
        """
        owns_cursor = cursor is None
        if owns_cursor:
            cursor = self.cursor()

        try:
            if database is not None and database != self.database:
                cursor.execute("USE {}".format(database))
            cursor.execute("SHOW TABLES LIKE '{}'".format(table))
            rv = cursor.fetchall()
        finally:
            # Close the cursor we opened even when a statement fails; a cursor
            # supplied by the caller stays under the caller's control.
            if owns_cursor:
                cursor.close()
        return bool(rv)
72
+
73
+
74
class _ConnectionPoolMixin(object):
    """Mixin adding optional SQLAlchemy ``QueuePool`` based connection pooling.

    The host class must implement ``connect_impl(autocommit, *args, **kwargs)``
    and expose a ``logger`` attribute. Pooling is off until
    ``enable_connection_pooling`` is called.
    """

    def enable_connection_pooling(self, **pool_kwargs):
        """Turn pooling on; ``pool_kwargs`` are forwarded to every ``QueuePool``."""
        self.pooling_enabled = True
        self._pool_kwargs = pool_kwargs
        self._pools = {}  # we do not use lock, threadsafe is not guaranteed

    def get_connection_pooling_first(self, autocommit=False, *args, **kwargs):
        """Return a connection, drawn from a pool when pooling is enabled.

        One pool is kept per distinct ``(autocommit, args, kwargs)``
        combination; the arguments therefore have to be picklable when
        pooling is enabled.
        """
        if not getattr(self, "pooling_enabled", False):
            # autocommit is passed positionally: the previous
            # ``connect_impl(autocommit=autocommit, *args)`` form raised
            # "got multiple values for argument" as soon as args was non-empty.
            return self.connect_impl(autocommit, *args, **kwargs)

        # The pickled bytes are used directly as the key; hashing them first
        # only added the possibility of two argument sets sharing a pool.
        pool_id = pickle.dumps((autocommit, args, kwargs))
        pool = self._pools.get(pool_id)
        if pool is None:

            def creator():
                return self.connect_impl(autocommit, *args, **kwargs)

            pool = QueuePool(creator=creator, **self._pool_kwargs)
            self._pools[pool_id] = pool
        return pool.connect()

    def dispose(self):
        """Dispose every pool created so far; a no-op if pooling was never enabled.

        Failures are logged rather than raised, and a failure on one pool does
        not prevent the remaining pools from being disposed.
        """
        pools = getattr(self, "_pools", None)
        if not pools:
            return
        for p in list(pools.values()):
            if not isinstance(p, QueuePool):
                continue
            try:
                p.dispose()
            except Exception as e:
                self.logger.error(f"dispose error: {e}")
103
+
104
+
105
class ClosingCursor:
    """Cursor wrapper that closes both the cursor and its connection.

    A cursor is opened on ``connection`` at construction time. Attribute
    access that is not defined here is delegated to that cursor. Used as a
    context manager, it commits on a clean exit (when ``commit_on_close`` is
    true) and always closes the cursor and the connection.
    """

    def __init__(self, connection, commit_on_close=True):
        self.connection = connection
        self._cursor = connection.cursor()
        self._commit_on_close = commit_on_close

    def __getattr__(self, name):
        # __getattr__ only runs for attributes missing on the wrapper itself.
        # If _cursor is the missing one (e.g. __init__ did not finish),
        # delegating would recurse forever, so fail explicitly instead.
        if name == "_cursor":
            raise AttributeError(name)
        return getattr(self._cursor, name)

    def __enter__(self):
        return self

    def __iter__(self):
        # the Iterable check will not invoke __getattr__
        # we must delegate it explicitly
        return iter(self._cursor)

    def __exit__(self, exc_type, exc, traceback):
        try:
            if exc_type is None and self._commit_on_close:
                self.connection.commit()
        finally:
            # Release resources even if the commit itself fails.
            self.close()

    def close(self):
        """Close the cursor, then the connection (even if the cursor close fails)."""
        try:
            self._cursor.close()
        finally:
            self.connection.close()
130
+
131
+
132
class DBAPIConnector(LoggingMixin, _ConnectionPoolMixin):
    """Base class for connectors built on a DBAPI driver.

    Subclasses implement ``connect_impl`` (and usually ``has_table``,
    ``to_canonical_type`` / ``from_canonical_type`` and ``generate_ddl``) and
    override the class-level settings below to describe their dialect.
    """

    _log_query = True  # when False, _log() drops the message
    _sqla_driver = None  # SQLAlchemy driver name used to build the engine URL
    _sqla_url_query = {}  # extra query parameters for the engine URL
    _identifier_start_quote = "`"
    _identifier_end_quote = "`"
    _param_placeholder = "%s"  # placeholder used in generated INSERT statements
    _default_port = None
    _default_database = None

    def __init__(self, host, port=None, database=None, user=None, password=None, schema=None, *args, **kwargs):
        """Store connection settings; no connection is opened here.

        Extra positional and keyword arguments are kept on ``self.args`` and
        ``self.kwargs`` and replayed by ``clone``.
        """
        self.host = host
        self.port = port or self._default_port
        self.database = database or self._default_database
        self.user = user
        self.password = password
        self.args = args
        self.kwargs = kwargs
        self.schema = schema

    def connect(self, autocommit=False, *args, **kwargs):
        """Returns a DBAPI connection"""
        return self.get_connection_pooling_first(autocommit, *args, **kwargs)

    def connect_impl(self, autocommit=False, *args, **kwargs):
        """Open a new DBAPI connection; must be implemented by subclasses."""
        raise NotImplementedError("connect must be implemented by subclasses")

    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, **kwargs):
        """Returns a DBAPI cursor

        With ``dryrun`` a ``NullCursor`` is returned and no connection is
        opened. Otherwise the result is a ``ClosingCursor`` bound to a
        connection from ``connect``; remaining keyword arguments are passed
        to ``connect``.
        """
        if dryrun:
            return NullCursor()
        conn = self.connect(autocommit, **kwargs)
        try:
            return ClosingCursor(conn, commit_on_close=commit_on_close)
        except Exception:
            # Opening the cursor failed: do not leave the connection dangling.
            conn.close()
            raise

    closing_cursor = cursor

    def execute(self, query, parameters=None, **cursor_options):
        """Execute one or more sql queries in a same session.

        ``query`` may be a string or a list of strings; each is split into
        single statements with ``sqlparse.split``. The same ``parameters``
        are passed to every statement. ``cursor_options`` go to ``cursor``.
        """
        if isinstance(query, list):
            queries = list(itertools.chain.from_iterable(map(sqlparse.split, query)))
        else:
            queries = sqlparse.split(query)

        with self.cursor(**cursor_options) as cursor:
            for q in queries:
                # remove the trailing `;`
                q = q.rstrip(";")
                if not q:
                    continue
                self._log(q)
                if parameters is not None:
                    cursor.execute(q, parameters)
                else:
                    cursor.execute(q)

    def fetchone(self, query, parameters=None):
        """Run ``query`` and return the cursor's ``fetchone()`` result."""
        return self._fetch_query("one", query, parameters)

    def fetchmany(self, query, parameters=None, size=None):
        """Run ``query`` and return the cursor's ``fetchmany(size=size)`` result."""
        return self._fetch_query("many", query, parameters, size=size)

    def fetchall(self, query, parameters=None):
        """Run ``query`` and return the cursor's ``fetchall()`` result."""
        return self._fetch_query("all", query, parameters)

    def _fetch_query(self, howmany, query, parameters=None, size=None):
        """Shared implementation of fetchone / fetchmany / fetchall.

        ``howmany`` is ``"many"`` or ``"all"``; any other value means one row.
        """
        self._log(query)
        with self.cursor() as cursor:
            if parameters is not None:
                cursor.execute(query, parameters)
            else:
                cursor.execute(query)

            if howmany == "many":
                rv = cursor.fetchmany(size=size)
            elif howmany == "all":
                rv = cursor.fetchall()
            else:
                rv = cursor.fetchone()
        return rv

    def _log(self, msg, *args, **kwargs):
        """Log ``msg`` at INFO level on its own line unless query logging is off."""
        if not self._log_query:
            return
        self.logger.info("\n" + str(msg), *args, **kwargs)

    def _get_sqlalchemy_uri(self):
        """Render the SQLAlchemy URL for this connector, password included."""
        url_cls = sqlalchemy.engine.url.URL
        fields = dict(
            drivername=self._sqla_driver,
            host=self.host,
            port=self.port,
            username=self.user,
            password=self.password,
            database=self.database,
            query=self._sqla_url_query,
        )
        # URL.create()/render_as_string() are the supported spellings since
        # SQLAlchemy 1.4; direct construction and __to_string__ are deprecated.
        # Fall back to the legacy forms on older releases.
        if hasattr(url_cls, "create"):
            url = url_cls.create(**fields)
        else:
            url = url_cls(**fields)
        if hasattr(url, "render_as_string"):
            return url.render_as_string(hide_password=False)
        return url.__to_string__(hide_password=False)

    def create_engine(self, engine_kwargs=None):
        """Returns a SQLAlchemy engine"""
        if engine_kwargs is None:
            engine_kwargs = {}
        return sqlalchemy.create_engine(self._get_sqlalchemy_uri(), **engine_kwargs)

    def get_pandas_df(self, query, parameters=None, **kwargs):
        """Run ``query`` through a throwaway engine and return a DataFrame.

        Extra keyword arguments are forwarded to ``pandas.read_sql_query``.
        """
        import pandas as pd

        query = sqlalchemy.text(query)  # if '%' in query, it will error without sqlalchemy.text in sqlalchemy 2.0
        con = self.create_engine()
        try:
            df = pd.read_sql_query(sql=query, con=con, params=parameters, **kwargs)
        finally:
            con.dispose()
        return df

    def has_table(self, table, database=None, **kwargs):
        """Return whether ``table`` exists; must be implemented by subclasses."""
        raise NotImplementedError

    def clone(self):
        """Return a new connector of the same class with the same settings."""
        if self.args:
            # Extra positional arguments can only follow the six leading
            # parameters positionally; mixing them with host=... keywords
            # raised "got multiple values for argument 'host'".
            return self.__class__(
                self.host,
                self.port,
                self.database,
                self.user,
                self.password,
                self.schema,
                *self.args,
                **self.kwargs,
            )
        return self.__class__(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.password,
            schema=self.schema,
            **self.kwargs,
        )

    def get_columns(self, table, database=None, exclude=None):
        """Return the column names of ``table``, minus any listed in ``exclude``.

        :raises ValueError: if ``has_table`` reports the table as missing
        """
        if database is None:
            database = self.database
        with self.cursor() as cursor:
            if not self.has_table(table, database, cursor=cursor):
                raise ValueError("Table {!r} not exists in {!r}".format(table, database))
            # LIMIT 0 fetches no rows but still populates cursor.description
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(database), self.quote_identifier(table))
            )
            cursor.fetchall()
            cols = self.get_columns_from_cursor(cursor)
        if exclude:
            cols = [x for x in cols if x not in exclude]
        return cols

    @staticmethod
    def _strip_table_prefix(name):
        """Return the second dot-separated part of ``name``, or ``name`` if it has no dot."""
        if "." in name:
            return name.split(".")[1]
        return name

    @staticmethod
    def get_columns_from_cursor(cursor):
        """Return column names from ``cursor.description`` with any ``table.`` prefix dropped."""
        return [DBAPIConnector._strip_table_prefix(item[0]) for item in cursor.description]

    def quote_identifier(self, v):
        """Quote every dot-separated part of ``v`` with the dialect's quote characters.

        Quotes already present at the edges of a part are trimmed first, so
        quoting an already quoted identifier does not double the quotes.
        """
        parts = []
        for x in v.split("."):
            x = trim_prefix(x, self._identifier_start_quote)
            x = trim_suffix(x, self._identifier_end_quote)
            parts.append(f"{self._identifier_start_quote}{x}{self._identifier_end_quote}")
        return ".".join(parts)

    def cursor_to_schema(self, cursor):
        """Build a ``Schema`` from ``cursor.description``.

        Index 1 of each description entry is used as the type code and
        index 3 as the size.
        """
        schema = Schema()
        for item in cursor.description:
            name = self._strip_table_prefix(item[0])
            type_code = item[1]
            size = item[3]

            ttype = self.to_canonical_type(type_code, size)
            schema.add_field_by_attrs(name, ttype, size)
        return schema

    @staticmethod
    def to_canonical_type(type_code, size):
        """Map a driver type code to a canonical type; implemented by subclasses."""
        raise NotImplementedError()

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Map a canonical type to a column type string; implemented by subclasses."""
        raise NotImplementedError()

    def generate_create_table_ddl(self, name, schema, **kwargs):
        """Return a ``CREATE TABLE`` statement for ``schema``.

        A field with a truthy ``comment`` gets a ``COMMENT`` clause whose
        literal is the Python ``repr`` of the comment.
        """
        cols = []
        for f in schema:
            col_name = self.quote_identifier(f.name)
            if f.comment:
                cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)} COMMENT {f.comment!r}")
            else:
                cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)}")

        col_types = ",\n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n)"
        return ddl

    def generate_ddl(self, table, database=None, if_exists=True):
        """Return the DDL of an existing table; implemented by subclasses."""
        raise NotImplementedError

    def load_csv_by_inserting(
        self,
        table,
        filename,
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        null_values=("NULL", r"\N"),
        null_replacer=None,
        batch_size=1000,
        values_hook=None,
        concurrency=1,
        **kwargs,
    ):
        """Load a CSV file into ``table`` with batched ``INSERT`` statements.

        :param columns: target column names; when empty the INSERT has no column list
        :param skiprows: number of leading lines of the file to skip
        :param null_values: passed with ``null_replacer`` to ``replace_null_values`` for each row
        :param batch_size: rows per ``executemany`` call
        :param values_hook: optional callable applied to every row before insertion
        :param concurrency: number of writer threads; values <= 1 insert from the calling thread
        :param kwargs: extra options merged into the ``csv.reader`` options
        """
        csv_options = dict(
            delimiter=delimiter, quotechar=quotechar, lineterminator=lineterminator, escapechar=escapechar
        )
        csv_options.update(kwargs)

        if values_hook is None:
            values_hook = lambda x: x  # noqa: E731

        if concurrency <= 1:
            # fallback to use the main thread itself to avoid the overhead of queue
            self._insert_in_serial(
                table, filename, columns, csv_options, skiprows, null_values, null_replacer, batch_size, values_hook
            )
        else:
            self._insert_in_parallel(
                table,
                filename,
                columns,
                csv_options,
                skiprows,
                null_values,
                null_replacer,
                batch_size,
                values_hook,
                concurrency,
            )

    def _insert_in_serial(
        self,
        table,
        filename,
        columns,
        csv_options,
        skiprows=0,
        null_values=("NULL", r"\N"),
        null_replacer=None,
        batch_size=1000,
        values_hook=None,
    ):
        """Read the CSV file and insert it batch by batch from the calling thread."""
        if values_hook is None:
            # The default used to be called as-is and crashed on the first row.
            values_hook = lambda x: x  # noqa: E731

        cursor = self.cursor()
        try:
            counter = TimeCounter(name="main", log_threshold=batch_size * 10, logger=self.logger)
            with open(filename, newline="") as fd:
                for _ in range(skiprows or 0):
                    fd.readline()

                reader = csv.reader(fd, **csv_options)
                rows = []
                for row in reader:
                    row = replace_null_values(row, null_values, null_replacer)
                    row = values_hook(row)
                    rows.append(row)
                    counter.incr(1)
                    if len(rows) == batch_size:
                        self._bulk_insert(cursor, table, columns, rows)
                        rows = []

                # flush the last, partial batch (a no-op when it is empty)
                self._bulk_insert(cursor, table, columns, rows)

            counter.show_stat()
        finally:
            # Close the cursor even when reading or inserting fails.
            cursor.close()

    @staticmethod
    def _raise_if_workers_failed(exc_queue, concurrency):
        """Raise RuntimeError, chained to the first worker error, if any worker failed."""
        if exc_queue.empty():
            return
        failed = exc_queue.qsize()
        first_error = exc_queue.get_nowait()
        raise RuntimeError(f"{failed} of {concurrency} workers failed") from first_error

    def _put_checking_workers(self, data_queue, exc_queue, item, concurrency):
        """Put ``item`` on ``data_queue``, checking for failed workers every 2 minutes."""
        while True:
            try:
                # wait up to 2 minutes before checking state of workers
                # terminate immediately if any worker fails
                data_queue.put(item, block=True, timeout=120)
                return
            except Full:
                self._raise_if_workers_failed(exc_queue, concurrency)

    def _insert_in_parallel(
        self,
        table,
        filename,
        columns,
        csv_options,
        skiprows=0,
        null_values=("NULL", r"\N"),
        null_replacer=None,
        batch_size=1000,
        values_hook=None,
        concurrency=1,
    ):
        """Read the CSV file here and insert batches from ``concurrency`` worker threads.

        :raises RuntimeError: if any worker reported an error
        """
        if values_hook is None:
            values_hook = lambda x: x  # noqa: E731

        data_queue = Queue(maxsize=2 * concurrency)
        exc_queue = Queue()
        # start workers
        workers = []
        for _ in range(concurrency):
            t = threading.Thread(
                target=self._write_worker,
                args=(table, columns, batch_size, data_queue, exc_queue),
                daemon=True,  # replaces the deprecated setDaemon(True)
            )
            t.start()
            workers.append(t)

        # send tasks to queue
        counter = TimeCounter(name="main", log_threshold=batch_size * 10, logger=self.logger)
        with open(filename, newline="") as fd:
            for _ in range(skiprows or 0):
                fd.readline()

            reader = csv.reader(fd, **csv_options)

            rows = []
            for row in reader:
                row = replace_null_values(row, null_values, null_replacer)
                row = values_hook(row)
                counter.incr(1)
                rows.append(row)
                if len(rows) == batch_size:
                    self._put_checking_workers(data_queue, exc_queue, rows, concurrency)
                    rows = []

            if rows:
                # Same guarded put as above: a plain blocking put would hang
                # forever if the queue is full and every worker has died.
                self._put_checking_workers(data_queue, exc_queue, rows, concurrency)

        self.logger.info("sending finish signal to all workers")
        for _ in workers:
            self._put_checking_workers(data_queue, exc_queue, None, concurrency)

        self.logger.info("waiting for workers to exit")
        for t in workers:
            t.join()

        self._raise_if_workers_failed(exc_queue, concurrency)

        counter.show_stat()

    def _write_worker(self, table, cols, batch_size, data_queue: Queue, exc_queue: Queue):
        """Worker loop: insert batches from ``data_queue`` until a ``None`` sentinel arrives.

        Any error is put on ``exc_queue`` and ends the worker.
        """
        log_threshold = 5 * batch_size
        try:
            cursor = self.cursor()
        except Exception as e:
            # Report the failure; otherwise the producer would never learn
            # that this worker is gone.
            self.logger.exception("failed to open cursor, worker exits")
            exc_queue.put(e)
            return

        counter = TimeCounter(name="worker", log_threshold=log_threshold, logger=self.logger)
        try:
            while True:
                rows = data_queue.get()
                if rows is None:
                    break

                counter.incr(len(rows))
                try:
                    self._bulk_insert(cursor, table, cols, rows)
                except Exception as e:
                    self.logger.exception("failed to insert %d rows, break", len(rows))
                    # stop at the first error
                    exc_queue.put(e)
                    break

            counter.show_stat()
        finally:
            cursor.close()
        self.logger.info("ready to exit.")

    def _bulk_insert(self, cursor, table, cols, rows):
        """Insert ``rows`` with one ``executemany`` call and commit; no-op for empty ``rows``.

        The number of placeholders is taken from the first row. ``table`` is
        interpolated as given, while column names are quoted.
        """
        if not rows:
            return

        col_count = len(rows[0])

        if cols:
            field_names = "({})".format(", ".join([self.quote_identifier(x) for x in cols]))
        else:
            field_names = ""

        placeholders = ", ".join([self._param_placeholder] * col_count)
        sql = f"INSERT INTO {table} {field_names} VALUES ({placeholders})"

        cursor.executemany(sql, rows)
        cursor.connection.commit()

    def add_leading_comment(self, query, comment):
        """Prefix every statement in ``query`` with ``/* comment */`` and rejoin with ``;``."""
        tokens = []
        for q in sqlparse.split(query.strip()):
            tokens.append(self._add_leading_comment_impl(q.strip().rstrip(";"), comment))
        return ";\n".join(tokens)

    def _add_leading_comment_impl(self, query, comment):
        """Return ``query`` preceded by the sanitized ``comment`` on its own line."""
        comment = self._safe_comment(comment)
        return "/* {} */\n{}".format(comment, query)

    def _safe_comment(self, comment):
        """Make ``comment`` safe to embed in a ``/* ... */`` block on a single line."""
        # Strip any */ or /* from the comment so that it cannot terminate or
        # nest the surrounding comment block and break the statement.
        comment = re.sub(pattern=r"\*\/|\/\*", repl="", string=comment)
        return ", ".join(comment.split("\n"))

    # Dialect predicates: the base class answers False to all of them.

    def is_mysql(self):
        return False

    def is_impala(self):
        return False

    def is_hive(self):
        return False

    def is_postgres(self):
        return False

    def is_redshift(self):
        return False

    def is_mssql(self):
        return False

    def is_azure_synapse(self):
        return False

    def is_clickhouse(self):
        return False

    def is_clickhouse_native(self):
        return False

    def is_phoenix(self):
        return False

    def is_google_bigquery(self):
        return False