recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,351 @@
1
+ # description : use clickhouse-driver [https://github.com/mymarilyn/clickhouse-driver]
2
+
3
+ import datetime
4
+ import functools
5
+ import json
6
+ import re
7
+ import shutil
8
+ import subprocess
9
+
10
+ import clickhouse_driver
11
+
12
+ from recurvedata.pigeon.connector._registry import register_connector_class
13
+ from recurvedata.pigeon.connector.dbapi import DBAPIConnector
14
+ from recurvedata.pigeon.schema import types
15
+ from recurvedata.pigeon.utils import fs
16
+
17
# ClickHouse type name -> pigeon canonical type.
# pigeon defines no unsigned integer types, so unsigned ClickHouse integers are
# represented by a "longer" signed INT to guard against overflow.
_clickhouse_type_to_canonical_type = dict(
    UInt8=types.INT16,
    UInt16=types.INT32,
    UInt32=types.INT64,
    UInt64=types.INT64,
    Int8=types.INT8,
    Int16=types.INT16,
    Int32=types.INT32,
    Int64=types.INT64,
    Float32=types.FLOAT32,
    Float64=types.FLOAT64,
    String=types.STRING,
    FixedString=types.STRING,
    Date=types.DATE,
    DateTime=types.DATETIME,
    Enum=types.STRING,
    Array=types.JSON,
)

# pigeon canonical type -> ClickHouse type name used when generating DDL.
_canonical_to_clickhouse_pairs = (
    (types.BOOLEAN, "UInt8"),
    (types.INT8, "Int8"),
    (types.INT16, "Int16"),
    (types.INT32, "Int32"),
    (types.INT64, "Int64"),
    (types.FLOAT32, "Float32"),
    (types.FLOAT64, "Float64"),
    (types.DATE, "Date"),
    (types.DATETIME, "DateTime"),
    (types.STRING, "String"),
    (types.JSON, "String"),
)
_canonical_type_to_clickhouse_type = dict(_canonical_to_clickhouse_pairs)
50
+
51
# Each pattern captures, as the named group ``inner_type_code``, whatever sits
# between the parentheses of one ClickHouse wrapper type. The capture is greedy,
# so it extends to the last ")" in the searched string.
nullable_type_p, array_type_p, low_cardinality_type_p = (
    re.compile(wrapper_pattern)
    for wrapper_pattern in (
        r"Nullable\((?P<inner_type_code>.*)\)",
        r"Array\((?P<inner_type_code>.*)\)",
        r"LowCardinality\((?P<inner_type_code>.*)\)",
    )
)
54
+
55
+
56
@register_connector_class(["clickhouse_native", "clickhouse"])
class ClickHouseConnector(DBAPIConnector):
    """ClickHouse connector that talks to the server through ``clickhouse_driver``.

    Besides the DB-API plumbing it offers DDL helpers, ClickHouse <-> canonical
    type mapping, and CSV loading (via the ``clickhouse-client`` binary when it
    is on PATH, otherwise via batched INSERTs).
    """

    _sqla_driver = "clickhouse+native"
    _default_port = 9000
    _default_database = "default"

    def is_clickhouse_native(self):
        """Return True: this connector is the native-driver flavour."""
        return True

    def connect_impl(self, autocommit=False, *args, **kwargs):
        """Open a ``clickhouse_driver`` DB-API connection.

        Connection settings come from the connector attributes, then
        ``self.kwargs``, then the call-site ``kwargs`` (later sources win).
        ``autocommit`` and positional ``args`` are accepted but not used here.
        """
        conn_kwargs = {
            "host": self.host,
            "port": self.port,
            "user": self.user,
            "password": self.password,
            "database": self.database,
            "compression": True,
        }
        conn_kwargs.update(self.kwargs)
        conn_kwargs.update(kwargs)
        return clickhouse_driver.connect(**conn_kwargs)

    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, stream=False, max_rows=0, **kwargs):
        """Return a ClickHouse DB-API cursor.

        stream: enable or disable results streaming
        max_rows: specifies the maximum number of rows to buffer at a time
        """
        ch_cursor = super().cursor(autocommit=autocommit, dryrun=dryrun, commit_on_close=commit_on_close, **kwargs)
        if stream:
            ch_cursor._cursor.set_stream_results(stream_results=stream, max_row_buffer=max_rows)
        return ch_cursor

    def has_table(self, table, database=None, **kwargs) -> bool:
        """Return whether ``table`` exists in ``database`` (default: ``self.database``)."""
        # check if table exists: https://clickhouse.com/docs/en/sql-reference/statements/exists/
        database = database or self.database
        rows = self.fetchall(f"EXISTS `{database}`.`{table}`")
        return bool(rows[0][0])

    def get_columns(self, table, database=None, exclude=None):
        """Return the column names of ``table``, omitting any name listed in ``exclude``.

        Raises:
            ValueError: if the table does not exist.
        """
        database = database or self.database
        if not self.has_table(table, database):
            raise ValueError("Table {!r} not exists in {!r}".format(table, database))
        excluded = tuple(exclude or ())
        with self.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(database), self.quote_identifier(table))
            )
            # Filter on the column *name*: comparing the description row itself
            # against a collection of names never matches, so nothing was excluded.
            cols = [x.name for x in cursor.description if x.name not in excluded]
        return cols

    def generate_ddl(self, table, database=None, if_exists=True):
        """Return the ``CREATE TABLE`` statement of an existing table.

        When ``if_exists`` is true the statement is rewritten to
        ``CREATE TABLE IF NOT EXISTS ...``.

        Raises:
            ValueError: if the table does not exist, or if the server's
                ``SHOW CREATE TABLE`` output is not a ``CREATE TABLE`` statement.
        """
        database = database or self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} not exists in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(database)}.{self.quote_identifier(table)}")
            create_stmt = cursor.fetchall()[0][0]

        match = re.search(r"CREATE TABLE (.*)", create_stmt, flags=re.S)
        if match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"SHOW CREATE TABLE for {table!r} in {database!r} did not return a CREATE TABLE statement")
        if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
        return f"CREATE TABLE{if_exists_stmt}{match.group(1)}"

    @staticmethod
    def to_canonical_type(type_code, size):
        """Map a ClickHouse type string to a canonical type (default: STRING).

        Outer ``Nullable(...)`` / ``LowCardinality(...)`` wrappers are peeled
        one layer at a time, so nested forms such as
        ``LowCardinality(Nullable(Int32))`` resolve to the wrapped type.
        ``size`` is accepted for interface compatibility and not used.
        """
        while True:
            # fullmatch only strips a wrapper that encloses the whole type, so the
            # greedy capture cannot leave a dangling ")" behind.
            wrapped = nullable_type_p.fullmatch(type_code) or low_cardinality_type_p.fullmatch(type_code)
            if wrapped is None:
                break
            type_code = wrapped.group("inner_type_code")

        # Array is checked first so that Array(FixedString(n)) is still an array.
        if "Array" in type_code:
            type_code = "Array"
        elif "FixedString" in type_code:
            type_code = "FixedString"
        return _clickhouse_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Map a canonical type to a ClickHouse type name (default: ``String``)."""
        return _canonical_type_to_clickhouse_type.get(canonical_type, "String")

    @staticmethod
    def _quote_string_literal(text):
        """Return ``text`` as a single-quoted SQL string literal with ``\\`` and ``'`` escaped."""
        escaped = str(text).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def generate_create_table_ddl(self, name, schema, **kwargs):
        """Build a CREATE TABLE statement from a schema.

        Every column is declared ``Nullable``. The table engine is read from
        ``kwargs["ENGINE"]``; when absent, ``Log`` is used.
        """
        cols = []
        for f in schema:
            col_def = f"{self.quote_identifier(f.name)} Nullable({self.from_canonical_type(f.type, f.size)})"
            if f.comment:
                # repr() switches to double quotes when the text contains "'",
                # which SQL reads as an identifier; always emit a '...' literal.
                col_def += f" COMMENT {self._quote_string_literal(f.comment)}"
            cols.append(col_def)

        col_types = ",\n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n)"

        # Table Engines: https://clickhouse.yandex/docs/en/operations/table_engines/
        engine = kwargs.get("ENGINE", "Log")
        ddl += f" ENGINE = {engine}"
        return ddl

    def _handle_row(self, row, columns):
        """Cast each value of ``row`` with the matching field in ``columns``; return a tuple."""
        return tuple(col.cast(value) for col, value in zip(columns, row))

    def _get_columns_with_type(self, table):
        """Return a ``ClickHouseField`` per column of ``table`` in ``self.database``."""
        with self.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM {}.{} LIMIT 0".format(self.quote_identifier(self.database), self.quote_identifier(table))
            )
            cursor.fetchall()
            cols = [ClickHouseField(x.name, x.type_code) for x in cursor.description]
        return cols

    def _bulk_insert(self, cursor, table, cols, rows):
        """Insert ``rows`` into ``table`` with one ``executemany`` call, then commit.

        Does nothing when ``rows`` is empty. When ``cols`` is empty the INSERT
        carries no explicit column list.
        """
        if not rows:
            return
        field_names = "({})".format(", ".join([self.quote_identifier(x) for x in cols])) if cols else ""
        sql = f"INSERT INTO {table} {field_names} VALUES"
        cursor.executemany(sql, rows)
        cursor.connection.commit()

    def load_csv(
        self,
        table,
        filename,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        using_insert=False,
        **kwargs,
    ):
        """Load a CSV file into a ClickHouse table.

        Uses the ``clickhouse-client`` binary when it is found on PATH and
        ``using_insert`` is false; otherwise falls back to batched INSERTs
        issued from Python.

        When ``skiprows`` is non-zero, ``fs.skip_lines`` produces the file that
        is actually loaded; that file is removed afterwards, even if loading fails.
        """
        infile = filename
        if skiprows:
            infile = fs.skip_lines(filename, skiprows)

        try:
            clickhouse_client_binary = shutil.which("clickhouse-client")
            if clickhouse_client_binary and not using_insert:
                self.logger.info("found clickhouse-client in %s, try to load file using it", clickhouse_client_binary)
                # Load ``infile`` (rows already skipped), not the untouched original.
                self._load_csv_by_clickhouse_client(clickhouse_client_binary, table, infile, delimiter)
            else:
                # fallback to perform INSERT
                self._load_csv_by_inserting(table, infile, delimiter, quotechar, lineterminator, escapechar, **kwargs)
        finally:
            if infile != filename:
                fs.remove_files_safely(infile)

    def _load_csv_by_clickhouse_client(self, binary, table, filename, delimiter=","):
        """Pipe ``filename`` into ``clickhouse-client`` as ``INSERT ... FORMAT CSV``.

        The client is started from an argument list (no shell), so connection
        values, the table name and the file path are never interpreted by a
        shell. The password is masked in the logged command.

        Raises:
            subprocess.CalledProcessError: if the client exits with a non-zero status.
        """
        if "." not in table:
            table = f"{self.database}.{table}"

        args = [binary, "--host", str(self.host), "--port", str(self.port), "--user", str(self.user)]
        logged_args = list(args)
        if self.password:
            # Skip the option entirely for an empty password, so the flag can
            # never swallow the next argument as its value.
            args += ["--password", str(self.password)]
            logged_args += ["--password", "***"]
        tail = [f"--format_csv_delimiter={delimiter}", f"--query=INSERT INTO {table} FORMAT CSV"]
        args += tail
        logged_args += tail

        self.logger.info("%s < %s", " ".join(logged_args), filename)
        with open(filename, "rb") as csv_file:
            subprocess.check_call(args, stdin=csv_file)

    def _load_csv_by_inserting(self, table, filename, delimiter, quotechar, lineterminator, escapechar, **kwargs):
        """Load the CSV through ``load_csv_by_inserting``, casting each row to the column types.

        ``kwargs`` may carry ``batch_size`` (default 10000) and ``concurrency``
        (default 1).
        """
        # https://clickhouse.yandex/docs/en/query_language/insert_into/
        # Performance Considerations
        # INSERT sorts the input data by primary key and splits them into partitions by a partition key
        # If you insert data into several partitions at once, it can significantly reduce the performance.
        # To avoid this:
        #
        # - Add data in fairly large batches, such as 100,000 rows at a time.
        # - Group data by month before uploading it to ClickHouse.
        batch_size = kwargs.get("batch_size") or 10000

        # https://clickhouse.yandex/docs/en/single/#strong-typing
        columns = self._get_columns_with_type(table)
        values_hook = functools.partial(self._handle_row, columns=columns)
        column_names = [x.name for x in columns]

        self.logger.info("columns: %s", columns)
        self.logger.info("batch size: %s", batch_size)
        self.load_csv_by_inserting(
            table=table,
            filename=filename,
            columns=column_names,
            delimiter=delimiter,
            quotechar=quotechar,
            lineterminator=lineterminator,
            escapechar=escapechar,
            # Leading rows were already dropped by load_csv before this call.
            skiprows=0,
            batch_size=batch_size,
            values_hook=values_hook,
            concurrency=kwargs.get("concurrency", 1),
        )
257
+
258
+
259
class ClickHouseField:
    """Convert values for one ClickHouse column into the matching Python type.

    Attributes:
        name: column name.
        type_code: ClickHouse type string, e.g. ``"Nullable(Int32)"``.
        inner_type: the type enclosed by an outermost ``Array(...)``,
            ``Nullable(...)`` or ``LowCardinality(...)``; ``None`` otherwise.
    """

    _INT_TYPES = ("UInt8", "UInt16", "UInt32", "UInt64", "Int8", "Int16", "Int32", "Int64")
    _FLOAT_TYPES = ("Float32", "Float64")
    _TEMPORAL_TYPES = ("DateTime", "Date")
    # Wrappers that do not change the Python type of the wrapped value.
    _TRANSPARENT_WRAPPERS = ("Nullable", "LowCardinality")

    def __init__(self, name, type_code):
        self.name = name
        self.type_code = type_code

        if self.is_array() or self.is_nullable() or self.is_low_cardinality():
            self.inner_type = self._infer_inner_type()
        else:
            self.inner_type = None

    @classmethod
    def get_converters(cls, columns_with_type: dict):
        """Build ``{column name: ClickHouseField}`` from ``{column name: type code}``."""
        # The constructor needs both the name and the type code.
        return {name: cls(name, type_code) for name, type_code in columns_with_type.items()}

    def is_array(self):
        return self.type_code.startswith("Array")

    def is_nullable(self):
        return self.type_code.startswith("Nullable")

    def is_low_cardinality(self):
        return self.type_code.startswith("LowCardinality")

    @staticmethod
    def _strip_wrapper(type_code, wrapper):
        """Return the text inside ``wrapper(...)`` or ``None`` if ``type_code`` is not so wrapped.

        The content runs up to the last ``)`` in ``type_code``.
        """
        prefix = f"{wrapper}("
        closing = type_code.rfind(")")
        if type_code.startswith(prefix) and closing >= len(prefix):
            return type_code[len(prefix):closing]
        return None

    @classmethod
    def _unwrap(cls, type_code):
        """Peel every outer ``Nullable`` / ``LowCardinality`` layer off ``type_code``."""
        while True:
            for wrapper in cls._TRANSPARENT_WRAPPERS:
                inner = cls._strip_wrapper(type_code, wrapper)
                if inner is not None:
                    type_code = inner
                    break
            else:
                return type_code

    @property
    def _real_type(self):
        """The column type with ``Nullable`` / ``LowCardinality`` wrappers removed."""
        return self._unwrap(self.type_code)

    def _accepts_null(self):
        """Return whether the column type carries a ``Nullable(...)`` wrapper."""
        return "Nullable(" in self.type_code

    def is_int(self):
        return self._real_type in self._INT_TYPES

    def is_float(self):
        return self._real_type in self._FLOAT_TYPES

    def is_string(self):
        return self._real_type == "String"

    def _infer_inner_type(self):
        """Return the type enclosed by the outermost Array/Nullable/LowCardinality wrapper.

        Raises:
            TypeError: if ``type_code`` is not of the form ``Wrapper(...)``.
        """
        for wrapper in ("Array", "Nullable", "LowCardinality"):
            inner = self._strip_wrapper(self.type_code, wrapper)
            if inner is not None:
                return inner
        raise TypeError("No inner type, use type_code instead")

    def _convert_datetime(self, value, type_code):
        """Parse ``value`` as a datetime when ``type_code`` is ``DateTime``, else as a date."""
        if type_code == "DateTime":
            return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
        return datetime.datetime.strptime(value, "%Y-%m-%d").date()

    def cast(self, value):
        """Convert ``value`` to the Python type expected for this column.

        - ``None`` becomes ``""`` for string columns and stays ``None`` otherwise.
        - ``Date`` / ``DateTime`` strings are parsed; an empty string becomes
          ``None`` when the column is nullable; non-string values pass through.
        - Integer and float columns turn ``""`` into ``0``.
        - Array columns decode JSON text (undecodable text becomes ``[]``) and
          parse ``Date`` / ``DateTime`` string elements.
        - Any other type is returned unchanged.
        """
        real_type = self._real_type

        if value is None:
            return "" if real_type == "String" else None

        if real_type in self._TEMPORAL_TYPES:
            if value == "" and self._accepts_null():
                return None
            if isinstance(value, str):
                return self._convert_datetime(value, real_type)
            return value

        if real_type == "String":
            return value

        if real_type in self._INT_TYPES or real_type in self._FLOAT_TYPES:
            if value == "":
                return 0
            return int(value) if real_type in self._INT_TYPES else float(value)

        # Array handling
        if self.is_array():
            if isinstance(value, str):
                try:
                    value = json.loads(value)
                except ValueError:
                    # Best effort: undecodable text is loaded as an empty array.
                    value = []

            element_type = self._unwrap(self.inner_type)
            if element_type in self._TEMPORAL_TYPES:
                value = [self._convert_datetime(x, element_type) if isinstance(x, str) else x for x in value]
            return value

        # Other types are left untouched for now; extend when needed.
        return value

    def __repr__(self):
        return f"<ClickHouseField({repr(self.name)}, {repr(self.type_code)})>"