recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333):
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,166 @@
1
+ import json
2
+ import subprocess
3
+ import time
4
+
5
+ from recurvedata.pigeon.connector._registry import register_connector_class
6
+ from recurvedata.pigeon.connector.mysql import MySQLConnector
7
+ from recurvedata.pigeon.schema import types
8
+
9
# Maps each canonical pigeon type to the Doris column type used for it.
_canonical_type_to_doris_type = {
    # BOOLEAN is mapped to the same column type as INT8
    types.BOOLEAN: "TINYINT",
    # integer types
    types.INT8: "TINYINT",
    types.INT16: "SMALLINT",
    types.INT32: "INT",
    types.INT64: "BIGINT",
    # floating point types
    types.FLOAT32: "FLOAT",
    types.FLOAT64: "DOUBLE",
    # temporal types
    types.DATE: "DATE",
    types.DATETIME: "DATETIME",
    # textual types; JSON is stored as a plain string
    types.STRING: "STRING",
    types.JSON: "STRING",
}
22
+
23
+
24
@register_connector_class(["doris"])
class DorisConnector(MySQLConnector):
    """Connector for Doris built on top of the MySQL connector.

    SQL access is inherited from ``MySQLConnector``; CSV loading is done by
    shelling out to ``curl`` against the ``_stream_load`` HTTP endpoint of the
    host on ``http_port``.
    """

    _sqla_driver = "doris+pymysql"
    _default_port = 9030
    _default_fe_http_port = 8030

    def __init__(self, host, port=None, http_port=None, database=None, user=None, password=None, *args, **kwargs):
        """Create the connector.

        Args:
            host: Doris host name, also used for the stream load URL.
            port: Port forwarded to ``MySQLConnector``.
            http_port: HTTP port used for stream load; falls back to
                ``_default_fe_http_port`` when falsy.
            database: Default database.
            user: User name, also used for stream load authentication.
            password: Password, also used for stream load authentication.
            *args: Extra positional arguments forwarded to ``MySQLConnector``.
            **kwargs: Extra keyword arguments forwarded to ``MySQLConnector``.
        """
        self.http_port = http_port or self._default_fe_http_port
        super().__init__(*args, host=host, port=port, database=database, user=user, password=password, **kwargs)

    @property
    def load_strict_mode(self):
        """Whether stream load runs with ``strict_mode:true`` (``False`` until set)."""
        return getattr(self, "_load_strict_mode", False)

    @load_strict_mode.setter
    def load_strict_mode(self, mode: bool):
        self._load_strict_mode = mode

    @property
    def max_filter_ratio(self):
        """Value sent in the ``max_filter_ratio`` stream load header (``0`` until set)."""
        return getattr(self, "_max_filter_ratio", 0)

    @max_filter_ratio.setter
    def max_filter_ratio(self, ratio: float):
        # Clamp into [0, 1]; values inside the range are stored unchanged.
        self._max_filter_ratio = min(max(ratio, 0), 1)

    def has_table(self, table, database=None, cursor=None, **kwargs):
        """Check whether ``table`` exists, retrying before giving up.

        The parent check is attempted up to six times, sleeping 1, 4, 9, 16 and
        25 seconds between attempts, so that a table that was just created has
        time to become visible.

        Returns:
            True as soon as one attempt finds the table, False if all fail.
        """
        retry_num = 6
        for attempt in range(retry_num):
            if super().has_table(table, database, cursor, **kwargs):
                return True
            is_last_attempt = attempt == retry_num - 1
            if not is_last_attempt:
                # wait for table to be created and visible
                time.sleep((attempt + 1) ** 2)
        return False

    def _load_csv_mysql(
        self,
        table,
        filename,
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        """Stream load data from a CSV file into ``table``.

        Args:
            table: Table name, optionally qualified as ``database.table``;
                an unqualified name uses ``self.database``.
            filename: Path of the CSV file to upload.
            columns, delimiter, quotechar, lineterminator, escapechar,
            skiprows, **kwargs: Accepted for signature compatibility; this
                implementation does not use them.

        Raises:
            Exception: If the response status is not ``"Success"``.
            subprocess.CalledProcessError: If the load ``curl`` command fails.
        """
        name_parts = table.split(".")
        if len(name_parts) == 1:
            db_name, table_name = self.database, table
        else:
            # Anything other than exactly ``db.table`` fails to unpack here.
            db_name, table_name = name_parts

        shell_cmd = self._format_load_shell(filename, db_name, table_name)

        # Credentials are passed through the environment (not the command
        # line); the child process receives only these variables.
        auth_env = {}
        if self.user is not None:
            auth_env["DORIS_USER"] = self.user
        if self.password is not None:
            auth_env["DORIS_PASSWORD"] = self.password

        output = subprocess.check_output(shell_cmd, env=auth_env, shell=True)
        # Log before parsing so the raw response is available if it is not JSON.
        res_txt = output.decode(errors="replace")
        self._log(res_txt)
        res = json.loads(res_txt)

        status = res.get("Status")
        if status == "Success":
            return
        # NOTE(review): any other status is treated as a failure; confirm
        # against the stream load documentation whether that is intended for
        # every non-"Success" status.

        message = res.get("Message")
        err_url = res.get("ErrorURL")
        if err_url:
            try:
                err_output = subprocess.check_output(["curl", err_url]).decode(errors="replace")
            except (subprocess.CalledProcessError, OSError) as exc:
                # Failing to fetch diagnostics must not hide the load failure.
                err_output = f"{message} (could not fetch {err_url}: {exc})"
        else:
            err_output = message if message is not None else res_txt
        self._log(f"error: {err_output}")
        raise Exception(f"load csv failed: Status={status}, Message={message}")

    def _format_load_shell(self, filename: str, db_name: str, table_name: str) -> str:
        """Format the curl command for Doris stream load.

        The command reads credentials from the ``DORIS_USER`` and
        ``DORIS_PASSWORD`` environment variables. The file path, column header
        and URL are shell-quoted so that spaces or shell metacharacters in
        them cannot alter the command.

        Args:
            filename: Path to the CSV file to load
            db_name: Target database name
            table_name: Target table name

        Returns:
            Formatted curl command string for stream loading data
        """
        import shlex

        # Clean table and db names
        db_name = db_name.strip("`")
        table_name = table_name.strip("`")

        # Build command components
        url = f"http://{self.host}:{self.http_port}/api/{db_name}/{table_name}/_stream_load"
        strict_mode = "true" if self.load_strict_mode else "false"
        column_header = "columns: " + ",".join(self.get_columns(table_name, db_name))

        command_parts = [
            # Double quotes keep the expanded credentials as a single argument.
            'curl --location-trusted -u "$DORIS_USER:$DORIS_PASSWORD"',
            '-H "Expect:100-continue"',
            f'-H "max_filter_ratio:{self.max_filter_ratio}"',
            '-H "column_separator:,"',
            '-H "enclose:\\""',
            '-H "trim_double_quotes:true"',
            f'-H "strict_mode:{strict_mode}"',
            '-H "escape:\'"',
            f"-H {shlex.quote(column_header)}",
            f"-T {shlex.quote(str(filename))} -XPUT",
            shlex.quote(url),
        ]
        return " ".join(command_parts)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        """Return the Doris column type for ``canonical_type``.

        Unknown types fall back to ``"STRING"``. ``size`` is not used.
        """
        return _canonical_type_to_doris_type.get(canonical_type, "STRING")
@@ -0,0 +1,176 @@
1
+ import base64
2
+ import hashlib
3
+ import pickle
4
+ from collections import defaultdict
5
+
6
+ from elasticsearch import Elasticsearch, helpers
7
+ from elasticsearch.exceptions import NotFoundError
8
+
9
+ from recurvedata.pigeon.connector._registry import register_connector_class
10
+ from recurvedata.pigeon.csv import CSV
11
+ from recurvedata.pigeon.schema import Schema, types
12
+ from recurvedata.pigeon.utils import LoggingMixin, ensure_str_list, replace_null_values
13
+
14
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html#_field_datatypes
15
+ _es_type_to_canonical_type = {
16
+ "boolean": types.BOOLEAN,
17
+ "byte": types.INT8,
18
+ "short": types.INT16,
19
+ "integer": types.INT32,
20
+ "long": types.INT64,
21
+ "half_float": types.FLOAT32,
22
+ "float": types.FLOAT32,
23
+ "double": types.FLOAT64,
24
+ "scaled_float": types.FLOAT64,
25
+ "date": types.DATETIME,
26
+ "text": types.STRING,
27
+ "keyword": types.STRING,
28
+ "ip": types.STRING,
29
+ "object": types.STRING,
30
+ "nested": types.STRING,
31
+ }
32
+
33
+ _canonical_type_to_es_type = {
34
+ types.BOOLEAN: "boolean",
35
+ types.INT8: "byte",
36
+ types.INT16: "short",
37
+ types.INT32: "integer",
38
+ types.INT64: "long",
39
+ types.FLOAT32: "float",
40
+ types.FLOAT64: "double",
41
+ types.DATETIME: "date",
42
+ types.STRING: "text",
43
+ }
44
+
45
+
46
@register_connector_class(["es", "elasticsearch"])
class ElasticSearchConnector(LoggingMixin):
    """Connector that wraps an ``Elasticsearch`` client.

    Supports scanning documents, reading an index mapping as a ``Schema`` and
    bulk loading a CSV file into an index.
    """

    def __init__(self, host, **kwargs):
        """Create the underlying client.

        Args:
            host: Passed as the first argument to ``Elasticsearch``.
            **kwargs: Forwarded to ``Elasticsearch``.
        """
        self.host = host
        self._es = Elasticsearch(self.host, **kwargs)

    def scan(self, query=None, index=None, doc_type=None, fields=None, **search_kwargs):
        """Iterate over matching documents via ``helpers.scan``.

        Args:
            query: Either a query body, or a string that is wrapped into a
                ``query_string`` query.
            index: Index to search.
            doc_type: Document type to search.
            fields: If truthy, passed as ``_source_include``.
            **search_kwargs: Forwarded to ``helpers.scan``.

        Returns:
            Whatever ``helpers.scan`` returns.
        """
        if isinstance(query, str):
            real_query = {"query": {"query_string": {"query": query}}}
        else:
            real_query = query

        # Build a new dict so the caller's keyword arguments are not mutated.
        scan_options = dict(search_kwargs, index=index, doc_type=doc_type)
        if fields:
            scan_options["_source_include"] = fields
        return helpers.scan(self._es, query=real_query, **scan_options)

    def get_mapping(self, index, doc_type):
        """Return the ``properties`` mapping of an index.

        Args:
            index: Index name.
            doc_type: Document type; when ``None`` the first type found in the
                mapping is used.

        Returns:
            The ``properties`` dict, or ``None`` (after logging the error) when
            the client raises ``NotFoundError``.
        """
        try:
            result = self._es.indices.get_mapping(index=index, doc_type=doc_type)
        except NotFoundError as e:
            self.logger.error(str(e))
            return None

        # Only the first index in the response is considered.
        mappings = list(result.values())[0]["mappings"]
        if doc_type is None:
            return list(mappings.values())[0]["properties"]
        return mappings[doc_type]["properties"]

    def get_schema(self, index, doc_type):
        """Build a ``Schema`` from the mapping of an index.

        Fields without an explicit ``type`` are treated as ``text``.

        Raises:
            ValueError: If no mapping could be found for ``index``.
        """
        mapping = self.get_mapping(index, doc_type)
        if mapping is None:
            # get_mapping already logged the cause; fail with a clear message
            # instead of an AttributeError on ``None.items()``.
            raise ValueError(f"mapping not found for index {index!r}, doc_type {doc_type!r}")

        schema = Schema()
        for name, attrs in mapping.items():
            es_type = attrs.get("type", "text").lower()
            schema.add_field_by_attrs(name, self.to_canonical_type(es_type))
        return schema

    @staticmethod
    def get_meta_field_type(name):
        """Return the canonical type of a meta field; raises ``KeyError`` for other names."""
        meta_field_types = {
            "_index": types.STRING,
            "_type": types.STRING,
            "_id": types.STRING,
            "_score": types.FLOAT64,
        }
        return meta_field_types[name]

    @staticmethod
    def to_canonical_type(es_type):
        """Map an Elasticsearch type name to a canonical type, defaulting to STRING."""
        return _es_type_to_canonical_type.get(es_type, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type):
        """Map a canonical type to an Elasticsearch type; raises ``KeyError`` if unmapped."""
        return _canonical_type_to_es_type[canonical_type]

    def load_csv(
        self,
        filename,
        index,
        doc_type="_doc",
        schema=None,
        id_field=None,
        generate_id=False,
        null_values=("NULL", r"\N"),
        null_replacer=None,
        **csv_options,
    ):
        """Bulk load the rows of a CSV file (with header) into ``index``.

        Args:
            filename: Path of the CSV file.
            index: Target index.
            doc_type: Value used for ``_type`` of every action.
            schema: Optional schema; fields found in it are cast via
                ``values_hook``.
            id_field: Column name(s) used to build ``_id``. A single column is
                used as-is, several columns are hashed with ``encode_id``.
            generate_id: If true, ``_id`` is the hash of all row values; this
                takes precedence over ``id_field``.
            null_values: Passed to ``replace_null_values``.
            null_replacer: Passed to ``replace_null_values``.
            **csv_options: Forwarded to ``CSV``.

        Raises:
            ValueError: If the file has no header or ``id_field`` names a
                column that is not in the header.
        """
        csv_proxy = CSV(filename, **csv_options)
        if not csv_proxy.has_header:
            raise ValueError(f"missing header in CSV file {filename}")

        # ensure id fields are present in header
        fields = None
        if id_field:
            fields = ensure_str_list(id_field)
            if any(x not in csv_proxy.header for x in fields):
                raise ValueError(f"{id_field} is invalid, only {csv_proxy.header} are support")

        typed_fields = {} if schema is None else {x.name: x for x in schema.fields}

        def actions_generator():
            counters = defaultdict(int)
            with csv_proxy.reader(as_dict=True) as reader:
                for doc in reader:
                    doc = replace_null_values(doc, null_values, null_replacer)
                    doc = self.values_hook(doc, typed_fields)

                    action = {"_index": index, "_type": doc_type, "_source": doc}

                    if fields:
                        if len(fields) == 1:
                            action["_id"] = doc[fields[0]]
                        else:
                            action["_id"] = self.encode_id([doc[x] for x in fields])
                    if generate_id:
                        # Intentionally evaluated after id_field: it overrides it.
                        action["_id"] = self.encode_id(doc.values())

                    counters["rows_read"] += 1
                    counters["rows_yield"] += 1
                    if counters["rows_yield"] % 10000 == 0:
                        self.logger.info("progress: %s", counters)

                    yield action

        # Consume the generator; the per-action results are not inspected.
        for _ in helpers.parallel_bulk(
            self._es, actions=actions_generator(), thread_count=8, chunk_size=1000, queue_size=8
        ):
            pass

    @staticmethod
    def encode_id(values):
        """Return a URL-safe base64 SHA-1 digest of the pickled ``values`` tuple.

        NOTE(review): the digest depends on the bytes ``pickle.dumps``
        produces; confirm ids stay stable if the default pickle protocol
        differs between the Python versions in use.
        """
        content = pickle.dumps(tuple(values))
        digest = hashlib.sha1(content).digest()
        return base64.urlsafe_b64encode(digest).decode()

    @staticmethod
    def values_hook(doc: dict, typed_fields: dict):
        """Cast the values of ``doc`` in place using ``typed_fields``.

        Keys without a typed field, and STRING fields, are left untouched.

        Returns:
            The same ``doc`` object.
        """
        for k, v in doc.items():
            field = typed_fields.get(k)
            if field is None or field.type in [types.STRING]:
                continue

            # Only existing keys are reassigned, so mutating during iteration is safe.
            doc[k] = field.cast(v)
        return doc