recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,51 @@
1
+ """
2
+ Azure Synapse Analytics(previous Azure SQL Data Warehouse)
3
+
4
+ doc:https://docs.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-overview-what-is
5
+ """
6
+
7
+ import pandas as pd
8
+
9
+ from recurvedata.pigeon.connector._registry import register_connector_class
10
+ from recurvedata.pigeon.connector.mssql import AzureSQLServerConnector
11
+
12
+
13
@register_connector_class(["azure_synapse", "azure_dw"])
class AzureSynapseConnector(AzureSQLServerConnector):
    """Connector for Azure Synapse Analytics, layered on the Azure SQL Server connector.

    Registered under the ``azure_synapse`` and ``azure_dw`` connector names.
    """

    _autocommit = True

    def is_azure_synapse(self):
        """Return ``True``: this connector targets Azure Synapse."""
        return True

    def load_csv(
        self,
        table,
        filename,
        schema="dbo",
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        """Load a CSV file into ``table`` by delegating to ``load_csv_bulk``.

        The CSV dialect arguments and any extra ``kwargs`` are forwarded to
        ``load_csv_bulk`` as keyword options.
        """
        bulk_options = {
            "columns": columns,
            "delimiter": delimiter,
            "quotechar": quotechar,
            "lineterminator": lineterminator,
            "escapechar": escapechar,
            "skiprows": skiprows,
            **kwargs,
        }
        self.load_csv_bulk(table, filename, schema, **bulk_options)

    def get_pandas_df(self, query, parameters=None, **kwargs):
        """Run ``query`` and return the result as a pandas DataFrame.

        A dedicated engine is created for the call and disposed afterwards,
        whether or not the query succeeds.
        """
        # Without AUTOCOMMIT the query fails with: "An attempt to complete a
        # transaction has failed. No corresponding transaction found."
        engine = self.create_engine({"isolation_level": "AUTOCOMMIT"})
        try:
            return pd.read_sql_query(sql=query, con=engine, params=parameters, **kwargs)
        finally:
            engine.dispose()
@@ -0,0 +1,151 @@
1
+ import retrying
2
+ from cassandra import ReadTimeout, cqltypes
3
+ from cassandra.auth import PlainTextAuthProvider
4
+ from cassandra.cluster import Cluster, default_lbp_factory
5
+ from cassandra.encoder import Encoder
6
+ from cassandra.policies import ConstantReconnectionPolicy, RetryPolicy
7
+ from cassandra.query import bind_params
8
+
9
+ from recurvedata.pigeon.connector._registry import register_connector_class
10
+ from recurvedata.pigeon.schema import Schema, types
11
+ from recurvedata.pigeon.utils import LoggingMixin
12
+
13
+
14
class NullSession(LoggingMixin):
    """
    NullSession implements some methods of Cassandra Session, but does nothing at all.

    It is the dry-run stand-in for a real session: statements are rendered
    and logged instead of being sent to a cluster.
    """

    def execute(self, query, parameters=None, *args, **kwargs):
        """Log the statement that would have been executed and return ``None``.

        :param query: the CQL statement (with ``%``-style placeholders when
            ``parameters`` is given)
        :param parameters: optional sequence or dict of values to bind
        """
        if parameters is None:
            # bind_params iterates over the parameters, so calling it with the
            # default ``None`` would raise TypeError; log the raw query instead.
            query_string = query
        else:
            query_string = bind_params(query, parameters, Encoder())
        self.logger.info(query_string)
        return None

    def shutdown(self):
        """Log the shutdown; there is no underlying connection to close."""
        self.logger.info("shutting down null session")
        return None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()
33
+
34
+
35
class ClosingSession:
    """Session proxy that shuts down both the session and its cluster.

    Unknown attributes are forwarded to the wrapped session, so the wrapper
    can be used wherever the session itself is expected. It also works as a
    context manager: leaving the ``with`` block calls :meth:`shutdown`.
    """

    def __init__(self, session):
        self._session = session
        # Keep a handle on the owning cluster so it can be shut down as well.
        self._cluster = session.cluster

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()

    def __getattr__(self, name):
        # Only consulted for names not found on the wrapper itself.
        wrapped = self._session
        return getattr(wrapped, name)

    def shutdown(self):
        """Shut down the session first, then the cluster it belongs to."""
        for resource in (self._session, self._cluster):
            resource.shutdown()

    def close(self):
        """Alias for :meth:`shutdown`."""
        self.shutdown()
55
+
56
+
57
@register_connector_class("cassandra")
class CassandraConnector(object):
    """Connector for Apache Cassandra built on the DataStax Python driver.

    Holds the connection settings, opens clusters/sessions on demand and
    translates between CQL column types and pigeon's canonical types.
    """

    # CQL type class -> canonical type. Types not listed fall back to STRING.
    _cqltype_to_canonical_type = {
        cqltypes.BooleanType: types.BOOLEAN,
        cqltypes.ByteType: types.INT8,
        cqltypes.ShortType: types.INT16,
        cqltypes.Int32Type: types.INT32,
        cqltypes.IntegerType: types.INT64,
        cqltypes.LongType: types.INT64,
        cqltypes.TimeType: types.INT64,
        cqltypes.FloatType: types.FLOAT32,
        cqltypes.DoubleType: types.FLOAT64,
        # The CQL type class is ``DecimalType``; ``cqltypes.Decimal`` is the
        # stdlib ``decimal.Decimal`` value class and never matches a column type.
        cqltypes.DecimalType: types.FLOAT64,
        cqltypes.SimpleDateType: types.DATE,
        cqltypes.DateType: types.DATETIME,
        cqltypes.TimestampType: types.DATETIME,
        cqltypes.VarcharType: types.STRING,
        cqltypes.UUIDType: types.STRING,
        cqltypes.UTF8Type: types.STRING,
    }

    # Canonical type -> CQL type name. Unknown types fall back to the UTF8 type name.
    _canonical_type_to_cqltype = {
        types.BOOLEAN: cqltypes.BooleanType.typename,
        types.INT8: cqltypes.ByteType.typename,
        types.INT16: cqltypes.ShortType.typename,
        types.INT32: cqltypes.Int32Type.typename,
        types.INT64: cqltypes.LongType.typename,
        types.FLOAT32: cqltypes.FloatType.typename,
        types.FLOAT64: cqltypes.DoubleType.typename,
        types.DATE: cqltypes.SimpleDateType.typename,
        types.DATETIME: cqltypes.TimestampType.typename,
        types.STRING: cqltypes.UTF8Type.typename,
        types.JSON: cqltypes.UTF8Type.typename,
    }

    def __init__(self, host, port, database=None, user=None, password=None, *args, **kwargs):
        """Store the connection settings; no connection is opened here.

        :param host: a sequence of contact points, or a single string
            (comma-separated for several hosts)
        :param port: native protocol port, coerced to ``int``
        :param database: keyspace passed to ``cluster.connect``
        """
        self.host = host
        self.port = int(port)
        self.database = database
        self.user = user
        self.password = password
        self.args = args
        self.kwargs = kwargs

    def _contact_points(self):
        """Return ``self.host`` in the sequence form that ``Cluster`` expects."""
        if isinstance(self.host, str):
            # The driver rejects a bare string; accept "h1,h2" style values.
            return [part.strip() for part in self.host.split(",") if part.strip()]
        return self.host

    def connect(self, *args, **kwargs):
        """Build and return a ``Cluster`` for the configured hosts.

        Extra keyword arguments are forwarded to ``Cluster``; an explicit
        ``port`` keyword overrides the configured one.
        """
        auth = PlainTextAuthProvider(username=self.user, password=self.password)
        # NOTE(review): the configured port used to be ignored (driver default
        # was always used); confirm no deployment relied on that.
        kwargs.setdefault("port", self.port)
        cluster = Cluster(
            contact_points=self._contact_points(),
            auth_provider=auth,
            protocol_version=3,
            load_balancing_policy=default_lbp_factory(),
            default_retry_policy=RetryPolicy(),
            reconnection_policy=ConstantReconnectionPolicy(delay=1, max_attempts=10),
            *args,
            **kwargs,
        )
        return cluster

    def session(self, *args, **kwargs):
        """Open a cluster and return a session bound to ``self.database``."""
        cluster = self.connect(*args, **kwargs)
        try:
            return cluster.connect(self.database)
        except Exception:
            # Do not leak the cluster when the session cannot be created.
            cluster.shutdown()
            raise

    def closing_session(self, dryrun=False, *args, **kwargs):
        """Return a context-manageable session.

        With ``dryrun`` a ``NullSession`` is returned; otherwise a real
        session wrapped in ``ClosingSession``.
        """
        if dryrun:
            session = NullSession()
        else:
            real_session = self.session(*args, **kwargs)
            session = ClosingSession(real_session)
        return session

    def execute(self, query, parameters=None, timeout=20, retry=3):
        """Execute ``query`` in a fresh session, retrying on read timeouts.

        :param timeout: per-attempt timeout passed to ``session.execute``
        :param retry: maximum number of attempts
        """
        with self.closing_session() as session:
            retry_handler = retrying.Retrying(retry_on_exception=_retry_if_timeout, stop_max_attempt_number=retry)
            return retry_handler.call(_execute_query, session, query, parameters, timeout)

    def get_data_schema(self, result_set):
        """Build a ``Schema`` from a result set's column names and types."""
        schema = Schema()
        for name, ctype in zip(result_set.column_names, result_set.column_types):
            ttype = self.to_canonical_type(ctype)
            schema.add_field_by_attrs(name, ttype)
        return schema

    def to_canonical_type(self, ctype):
        """Map a CQL type class to a canonical type (STRING when unknown)."""
        return self._cqltype_to_canonical_type.get(ctype, types.STRING)

    def from_canonical_type(self, canonical_type, size):
        """Map a canonical type to a CQL type name; ``size`` is not used."""
        return self._canonical_type_to_cqltype.get(canonical_type, cqltypes.UTF8Type.typename)
144
+
145
+
146
def _retry_if_timeout(exc):
    """Retry predicate: ``True`` only when ``exc`` is a Cassandra ``ReadTimeout``."""
    is_read_timeout = isinstance(exc, ReadTimeout)
    return is_read_timeout
148
+
149
+
150
+ def _execute_query(session, query, parameters, timeout, *args, **kwargs):
151
+ return session.execute(query, parameters, timeout=timeout, *args, **kwargs)
@@ -0,0 +1,403 @@
1
+ import datetime
2
+ import functools
3
+ import json
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+
8
+ import cytoolz as toolz
9
+ import requests
10
+ from infi.clickhouse_orm import fields
11
+ from sqlalchemy_clickhouse import connector as clickhouse
12
+
13
+ from recurvedata.pigeon.connector._registry import register_connector_class
14
+ from recurvedata.pigeon.connector.dbapi import ClosingCursor, DBAPIConnector, NullCursor, _ShowTableLikeMixin
15
+ from recurvedata.pigeon.schema import types
16
+ from recurvedata.pigeon.utils import fs
17
+
18
+
19
+ # Patch sqlalchemy_clickhouse, use requests session (keep alive)
20
+ def _send(self, data, settings=None, stream=False):
21
+ if isinstance(data, str):
22
+ data = data.encode("utf-8")
23
+ if not hasattr(self, "_session"):
24
+ self._session = requests.session()
25
+ params = self._build_params(settings)
26
+ r = self._session.post(self.db_url, params=params, data=data, stream=stream)
27
+ if r.status_code != 200:
28
+ raise Exception(r.text)
29
+ return r
30
+
31
+
32
# Replace the driver's Database._send with the session-reusing version above.
clickhouse.Database._send = _send
33
+
34
+
35
class ParamEscaper(clickhouse.ParamEscaper):
    """Parameter escaper that additionally accepts ``None`` and date values."""

    def escape_item(self, item):
        """Return the SQL literal for a single parameter value.

        ``None`` becomes ``NULL``; numbers and strings use the inherited
        escaping helpers; dates are escaped as strings via ``str()``.

        :raises Exception: for any other kind of object
        """
        if item is None:
            return "NULL"
        if isinstance(item, (int, float)):
            return self.escape_number(item)
        if isinstance(item, str):
            return self.escape_string(item)
        if isinstance(item, datetime.date):
            # Rendered through str() and then quoted like any other string.
            return self.escape_string(str(item))
        raise Exception("Unsupported object {}".format(item))
47
+
48
+
49
# Hack: sqlalchemy_clickhouse's ParamEscaper does not support date types,
# so the module-level escaper is replaced with the subclass defined above.
clickhouse._escaper = ParamEscaper()
51
+
52
# ClickHouse type name -> pigeon canonical type.
_clickhouse_type_to_canonical_type = {
    # pigeon defines no unsigned ints, so each UInt is represented by a
    # "wider" INT to prevent overflow.
    # NOTE(review): UInt64 maps to INT64, i.e. it is not widened — confirm
    # that very large UInt64 values are not expected.
    "UInt8": types.INT16,
    "UInt16": types.INT32,
    "UInt32": types.INT64,
    "UInt64": types.INT64,
    "Int8": types.INT8,
    "Int16": types.INT16,
    "Int32": types.INT32,
    "Int64": types.INT64,
    "Float32": types.FLOAT32,
    "Float64": types.FLOAT64,
    "String": types.STRING,
    "FixedString": types.STRING,
    "Date": types.DATE,
    "DateTime": types.DATETIME,
    "Enum": types.STRING,
    "Array": types.JSON,
}
71
+
72
# pigeon canonical type -> ClickHouse type name.
# BOOLEAN is stored as UInt8 and JSON as String.
_canonical_type_to_clickhouse_type = {
    types.BOOLEAN: "UInt8",
    types.INT8: "Int8",
    types.INT16: "Int16",
    types.INT32: "Int32",
    types.INT64: "Int64",
    types.FLOAT32: "Float32",
    types.FLOAT64: "Float64",
    types.DATE: "Date",
    types.DATETIME: "DateTime",
    types.STRING: "String",
    types.JSON: "String",
}
85
+
86
# ClickHouse type name -> shared infi.clickhouse_orm field instance used for
# value conversion. Only scalar numeric types, String and Date are present;
# there is no entry for DateTime, FixedString, Enum or Array.
# ("filed" in the name is a historical misspelling of "field".)
_clickhouse_type_to_orm_filed = {
    "UInt8": fields.UInt8Field(),
    "UInt16": fields.UInt16Field(),
    "UInt32": fields.UInt32Field(),
    "UInt64": fields.UInt64Field(),
    "Int8": fields.Int8Field(),
    "Int16": fields.Int16Field(),
    "Int32": fields.Int32Field(),
    "Int64": fields.Int64Field(),
    "Float32": fields.Float32Field(),
    "Float64": fields.Float64Field(),
    "String": fields.StringField(),
    "Date": fields.DateField(),
}
100
+
101
# Patterns that extract the wrapped type from a ClickHouse wrapper type such
# as Nullable(String). In all three the captured group is named
# "inner_type_code"; the greedy ".*" captures up to the last ")".
nullable_type_p = re.compile(r"Nullable\((?P<inner_type_code>.*)\)")
array_type_p = re.compile(r"Array\((?P<inner_type_code>.*)\)")
low_cardinality_type_p = re.compile(r"LowCardinality\((?P<inner_type_code>.*)\)")
104
+
105
+
106
+ def _format_sql(operation, parameters=None):
107
+ if parameters is None or not parameters:
108
+ sql = operation
109
+ else:
110
+ sql = operation % clickhouse._escaper.escape_args(parameters)
111
+ return sql
112
+
113
+
114
class WrappedCursor(ClosingCursor):
    """Closing cursor that always exposes a ``description`` for row-returning queries."""

    @property
    def description(self):
        """Column description recorded by the most recent :meth:`execute`."""
        return self._description

    def execute(self, operation: str, parameters=None):
        """Execute ``operation`` and record the resulting column description."""
        expects_rows = self._determine_is_response(operation)
        self._cursor.execute(operation, parameters, expects_rows)

        # With its default query mode the sqlalchemy-clickhouse cursor has no
        # description when the result is empty; it can be obtained by running
        # the query with FORMAT JSON instead.
        self._description = self._cursor.description
        needs_lookup = not self._cursor.description and expects_rows
        if needs_lookup:
            self._description = self._get_cursor_description(operation, parameters)

    def _determine_is_response(self, query: str):
        """Roughly decide whether ``query`` is a SELECT-like statement.

        Returns ``False`` as soon as any non-query keyword occurs as a whole
        word anywhere in the text (case-insensitive), ``True`` otherwise.
        """
        non_query_keywords = ("INSERT", "CREATE", "ALTER", "DROP", "RENAME", "SET", "KILL QUERY", "ATTACH", "DETACH")
        return not any(re.search(f"\\b{kw}\\b", query, re.IGNORECASE) for kw in non_query_keywords)

    def _get_cursor_description(self, operation: str, parameters=None):
        """Re-run the query with ``FORMAT JSON`` and build a description from its ``meta``."""
        raw = self._cursor._db.raw(_format_sql(operation, parameters) + " FORMAT JSON")
        columns = json.loads(raw)["meta"]
        # Tuple layout: name, type_code, display_size, internal_size, precision, scale, null_ok
        return [(col["name"], col["type"], None, None, None, None, True) for col in columns]
147
+
148
+
149
class ClickHouseField(object):
    """A ClickHouse column (name + type code) that can cast Python values for loading.

    For wrapper types (``Array(...)``, ``Nullable(...)``, ``LowCardinality(...)``)
    the wrapped type is exposed as ``inner_type``; otherwise ``inner_type`` is ``None``.
    """

    def __init__(self, name, type_code):
        self.name = name
        self.type_code = type_code  # the ClickHouse type, e.g. Array(String)

        if self.is_array() or self.is_nullable() or self.is_low_cardinality():
            self.inner_type = self._infer_inner_type()
        else:
            self.inner_type = None

    def is_array(self):
        return self.type_code.startswith("Array")

    def is_nullable(self):
        return self.type_code.startswith("Nullable")

    def is_low_cardinality(self):
        return self.type_code.startswith("LowCardinality")

    def is_int(self):
        return self._real_type in ["UInt8", "UInt16", "UInt32", "UInt64", "Int8", "Int16", "Int32", "Int64"]

    def is_float(self):
        return self._real_type in ["Float32", "Float64"]

    def is_string(self):
        return self._real_type == "String"

    @property
    def _real_type(self):
        """The type with a ``Nullable(...)`` wrapper removed (other wrappers are kept)."""
        if self.is_nullable():
            return self.inner_type
        return self.type_code

    def _infer_inner_type(self):
        """Return the type wrapped by Array/Nullable/LowCardinality.

        :raises TypeError: when the field is not a wrapper type
        """
        if self.is_array():
            pattern = array_type_p
        elif self.is_nullable():
            pattern = nullable_type_p
        elif self.is_low_cardinality():
            pattern = low_cardinality_type_p
        else:
            raise TypeError("No inner type, use type_code instead")
        # Every pattern names its group "inner_type_code"; the Nullable branch
        # previously read a non-existent "type_code" group and raised KeyError.
        return pattern.search(self.type_code).groupdict()["inner_type_code"]

    def cast(self, value):
        """Convert ``value`` into the form expected when writing this column.

        ``None`` becomes ``""`` for string columns and stays ``None`` otherwise.
        Date/DateTime values are stringified, strings pass through, numeric
        values go through the matching ORM field (``""`` becomes ``0``), and
        arrays are rendered as a ClickHouse array literal.
        """
        if value is None:
            return "" if self.is_string() else None

        # Use the unwrapped type so Nullable(Date)/Nullable(DateTime) behave
        # like their plain counterparts.
        real_type = self._real_type

        if real_type == "DateTime":
            return str(self._convert_datetime(value))
        if real_type == "Date":
            return str(value)
        if self.is_string():
            return value

        if self.is_int() or self.is_float():
            if value == "":
                return 0
            return _clickhouse_type_to_orm_filed[real_type].to_python(value, timezone_in_use=None)

        # Array types
        if self.is_array():
            if isinstance(value, str):
                try:
                    value = json.loads(value)
                except ValueError:
                    # Best effort: an unparsable string is loaded as an empty array.
                    value = []

            if self.inner_type == "DateTime":
                # Convert element by element; stringifying the whole list
                # first made the loop iterate over single characters.
                value = [self._convert_datetime(x) for x in value]
                # The elements are strings now and the ORM table has no
                # DateTime entry, so they are serialized as quoted strings.
                # NOTE(review): confirm this literal format suits the loader.
                inner = _clickhouse_type_to_orm_filed["String"]
            else:
                inner = _clickhouse_type_to_orm_filed[self.inner_type]
            return fields.ArrayField(inner).to_db_string(value)

        # Other types are not handled specially yet; add support when needed.
        return _clickhouse_type_to_orm_filed[self.type_code].to_db_string(value)

    def _convert_datetime(self, value):
        """Render a datetime-like value as a string via ``str()``."""
        return str(value)

    def __repr__(self):
        return f"<ClickHouseField({repr(self.name)}, {repr(self.type_code)})>"
235
+
236
+
237
@register_connector_class(["clickhouse"])
class ClickHouseConnector(_ShowTableLikeMixin, DBAPIConnector):
    """Connector for ClickHouse reached through an ``http://host:port`` URL.

    CSV files are loaded with the ``clickhouse-client`` binary when it is found
    on ``PATH`` (using the separate ``tcp_port``), and with batched INSERT
    statements otherwise.
    """

    _sqla_driver = "clickhouse"
    _default_port = 8123
    _default_database = "default"

    # Characters escaped inside a single-quoted SQL string literal.
    _STRING_ESCAPES = str.maketrans(
        {
            "\\": "\\\\",
            "'": "\\'",
            "\n": "\\n",
            "\r": "\\r",
            "\t": "\\t",
        }
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Port handed to clickhouse-client; taken from the extra connector kwargs.
        self._tcp_port = self.kwargs.get("tcp_port", 9000)
        # Port used for the HTTP URL; falls back to the class default when unset.
        self._http_port = self.port or self._default_port

    @toolz.memoize
    def connect_impl(self, autocommit=False, *args, **kwargs):
        """Open the connection for this connector's host and database.

        The URL is built from ``_http_port`` so that a missing ``port`` falls
        back to the default port instead of producing ``http://host:None``.
        """
        db_url = f"http://{self.host}:{self._http_port}"
        return clickhouse.connect(db_name=self.database, db_url=db_url, username=self.user, password=self.password)

    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, **kwargs):
        """Return a cursor; with ``dryrun`` a ``NullCursor`` is returned instead.

        ``commit_on_close`` and extra keyword arguments are accepted but not
        used by this implementation.
        """
        if dryrun:
            return NullCursor()
        return WrappedCursor(self.connect(autocommit))

    def is_clickhouse(self):
        return True

    @staticmethod
    def to_canonical_type(type_code, size):
        """Map a ClickHouse type string to a canonical type.

        ``Nullable(...)`` and ``LowCardinality(...)`` wrappers are stripped,
        ``FixedString(N)`` and ``Array(...)`` are reduced to their family name,
        and unknown types fall back to ``types.STRING``. ``size`` is unused.
        """
        # NOTE(review): ClickHouseField reads the group "type_code" from
        # nullable_type_p while this method reads "inner_type_code" -- confirm
        # which name the pattern actually defines.
        if "nullable" in type_code.lower():
            matched = nullable_type_p.search(type_code)
            # The substring test above is looser than the pattern; skip the
            # unwrapping rather than fail on a None match.
            if matched:
                type_code = matched.groupdict()["inner_type_code"]
        if "lowcardinality" in type_code.lower():
            matched = low_cardinality_type_p.search(type_code)
            if matched:
                type_code = matched.groupdict()["inner_type_code"]
        if "FixedString" in type_code:
            type_code = "FixedString"
        if "Array" in type_code:
            type_code = "Array"
        return _clickhouse_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Map a canonical type to a ClickHouse type, defaulting to ``String``."""
        return _canonical_type_to_clickhouse_type.get(canonical_type, "String")

    @classmethod
    def _quote_string(cls, value):
        """Render ``value`` as a single-quoted, backslash-escaped SQL literal."""
        return "'" + str(value).translate(cls._STRING_ESCAPES) + "'"

    def generate_create_table_ddl(self, name, schema, **kwargs):
        """Generate a CREATE TABLE statement from a schema.

        Every column is declared ``Nullable``. The table engine is taken from
        ``kwargs["ENGINE"]``; when it is not given, ``Log`` is used.
        """
        cols = []
        for f in schema:
            col_def = f"{self.quote_identifier(f.name)} Nullable({self.from_canonical_type(f.type, f.size)})"
            if f.comment:
                # repr() was used here before; it switches to a double-quoted
                # literal when the text contains a single quote, so the comment
                # is escaped explicitly instead.
                col_def += f" COMMENT {self._quote_string(f.comment)}"
            cols.append(col_def)

        col_types = ",\n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n)"

        # Table Engines: https://clickhouse.yandex/docs/en/operations/table_engines/
        engine = kwargs.get("ENGINE", "Log")
        ddl += f" ENGINE = {engine}"
        return ddl

    def load_csv(
        self,
        table,
        filename,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        using_insert=False,
        **kwargs,
    ):
        """Load CSV file to ClickHouse table, support both batch INSERT by Python and clickhouse-client binary

        When ``skiprows`` is non-zero the file returned by ``fs.skip_lines`` is
        loaded in place of ``filename`` and removed afterwards, also when the
        load fails. ``using_insert=True`` forces the INSERT path even if
        ``clickhouse-client`` is installed.
        """
        infile = filename
        if skiprows:
            infile = fs.skip_lines(filename, skiprows)

        try:
            clickhouse_client_binary = shutil.which("clickhouse-client")
            if not using_insert and clickhouse_client_binary:
                self.logger.info("found clickhouse-client in %s, try to load file using it", clickhouse_client_binary)
                self._load_csv_by_clickhouse_client(clickhouse_client_binary, table, infile, delimiter)
            else:
                # fallback to perform INSERT
                self._load_csv_by_inserting(table, infile, delimiter, quotechar, lineterminator, escapechar, **kwargs)
        finally:
            if infile != filename:
                fs.remove_files_safely(infile)

    def _load_csv_by_clickhouse_client(self, binary, table, filename, delimiter=","):
        """Pipe ``filename`` into ``clickhouse-client`` as an ``INSERT ... FORMAT CSV``.

        The client is started without a shell, so credentials and paths that
        contain shell metacharacters are passed through verbatim, and the
        password is masked in the logged command line.

        Raises:
            subprocess.CalledProcessError: if the client exits with a non-zero status.
        """
        if "." not in table:
            table = f"{self.database}.{table}"

        command = [
            binary,
            "--host",
            str(self.host),
            "--port",
            str(self._tcp_port),
            "--user",
            str(self.user),
        ]
        # Leave the flag out entirely when no password is configured.
        if self.password:
            command += ["--password", str(self.password)]
        command += [
            f"--format_csv_delimiter={delimiter}",
            f"--query=INSERT INTO {table} FORMAT CSV",
        ]

        printable = list(command)
        if "--password" in printable:
            printable[printable.index("--password") + 1] = "******"
        self.logger.info(" ".join(printable))

        with open(filename, "rb") as stream:
            subprocess.check_call(command, stdin=stream)

    def _load_csv_by_inserting(self, table, filename, delimiter, quotechar, lineterminator, escapechar, **kwargs):
        """Load the file through ``load_csv_by_inserting`` with per-column casting.

        ``batch_size`` (default 10000) and ``concurrency`` (default 1) are read
        from ``kwargs``.
        """
        # https://clickhouse.yandex/docs/en/query_language/insert_into/
        # Performance Considerations
        # INSERT sorts the input data by primary key and splits them into partitions by a partition key
        # If you insert data into several partitions at once, it can significantly reduce the performance.
        # To avoid this:
        #
        # - Add data in fairly large batches, such as 100,000 rows at a time.
        # - Group data by month before uploading it to ClickHouse.
        batch_size = kwargs.get("batch_size") or 10000

        # https://clickhouse.yandex/docs/en/single/#strong-typing
        columns = self._get_columns_with_type(table)
        values_hook = functools.partial(self._handle_row, columns=columns)
        column_names = [x.name for x in columns]

        self.logger.info("columns: %s", columns)
        self.logger.info("batch size: %s", batch_size)
        self.load_csv_by_inserting(
            table=table,
            filename=filename,
            columns=column_names,
            delimiter=delimiter,
            quotechar=quotechar,
            lineterminator=lineterminator,
            escapechar=escapechar,
            skiprows=0,
            batch_size=batch_size,
            values_hook=values_hook,
            concurrency=kwargs.get("concurrency", 1),
        )

    def _handle_row(self, row, columns):
        """Cast each value of ``row`` with the column at the same position."""
        return tuple(col.cast(value) for col, value in zip(columns, row))

    def _get_columns_with_type(self, table):
        """Return a ``ClickHouseField`` for every column of ``table``.

        ``table`` may be qualified as ``database.table``, matching what the
        clickhouse-client loader accepts; otherwise the connector's database is
        used.
        """
        database = self.database
        if "." in table:
            database, table = table.split(".", 1)

        with self.cursor() as cursor:
            # LIMIT 0 returns no rows but still populates cursor.description.
            cursor.execute(f"SELECT * FROM {self.quote_identifier(database)}.{self.quote_identifier(table)} LIMIT 0")
            cursor.fetchall()
            cols = [ClickHouseField(x[0], x[1]) for x in cursor.description]
        return cols

    def generate_ddl(self, table, database=None, if_exists=True):
        """Return the CREATE TABLE statement of an existing table.

        With ``if_exists`` true (the default) the statement is rewritten to
        ``CREATE TABLE IF NOT EXISTS ...``.

        Raises:
            ValueError: if the table does not exist, or the statement returned
                by the server does not contain ``CREATE TABLE``.
        """
        if database is None:
            database = self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} not exists in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"SHOW CREATE TABLE {database}.{table}")
            # NOTE(review): reads the second column of the first row -- confirm
            # this matches the row shape produced by the wrapped cursor.
            statement = cursor.fetchall()[0][1]

        matched = re.search(r"CREATE TABLE (.*)", statement, flags=re.S)
        if matched is None:
            raise ValueError(f"Cannot find CREATE TABLE statement of {table!r} in {database!r}")
        if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
        return f"CREATE TABLE{if_exists_stmt}{matched.group(1)}"