recurvedata_lib-0.1.487-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/__init__.py
@@ -0,0 +1,294 @@
+from functools import partial
+
+from recurvedata.pigeon.connector._registry import get_connector_class
+
+
+def new_azure_synapse_connector(connection=None, database=None):
+    """
+    only connection string accepted
+    database switching between azure data warehouses is not allowed.
+    """
+    from .azure_synapse import AzureSynapseConnector
+
+    conf = connection.copy()
+    if database:
+        conf["database"] = database
+    return AzureSynapseConnector(**conf)
+
+
+def new_azure_blob_connector(
+    conn_string: str = None,
+    account_url: str = None,
+    endpoint_suffix: str = "core.chinacloudapi.cn",
+    account_name: str = None,
+    sas_token: str = None,
+    **kwargs,
+):
+    """only connection string accepted"""
+    from .azure_blob import AzureBlobConnector
+
+    return AzureBlobConnector(
+        connection_string=conn_string,
+        account_url=account_url,
+        endpoint_suffix=endpoint_suffix,
+        account_name=account_name,
+        sas_token=sas_token,
+        **kwargs,
+    )
+
+
+def new_mysql_connector(connection=None, database=None, **kwargs):
+    """Factory function to create a new MySQLConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .mysql import MySQLConnector
+
+    conf = connection.copy()
+    return MySQLConnector(database=database, **conf)
+
+
+def new_tidb_connector(connection=None, database=None):
+    """Factory function to create a new TiDBConnector (MySQLConnector).
+
+    Similar to new_mysql_connector, but with different default connection parameters.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    return new_mysql_connector(connection, database)
+
+
+def new_starrocks_connector(connection=None, database=None):
+    """Factory function to create a new StarRocksConnector.
+
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .starrocks import StarRocksConnector
+
+    return StarRocksConnector(database=database, **connection)
+
+
+def new_hive_connector(connection=None, database=None, **kwargs):
+    """Factory function to create a new HiveConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .hive_impala import HiveConnector
+
+    conf = connection.copy()
+    return HiveConnector(database=database, **conf)
+
+
+def new_impala_connector(connection=None, database=None, **kwargs):
+    """Factory function to create a new ImpalaConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .hive_impala import ImpalaConnector
+
+    conf = connection.copy()
+    return ImpalaConnector(database=database, **conf)
+
+
+def new_webhdfs_connector(conf=None, **kwargs):
+    from .hdfs import HDFSConnector
+
+    conf = conf.copy()
+    return HDFSConnector(**conf)
+
+
+def new_redshift_connector(connection=None, database=None):
+    """Factory function to create a new RedshiftConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .redshift import RedshiftConnector
+
+    conf = connection.copy()
+    return RedshiftConnector(database=database, **conf)
+
+
+def new_postgresql_connector(connection=None, database=None):
+    """Factory function to create a new PostgresConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .postgresql import PostgresConnector
+
+    conf = connection.copy()
+    if database is not None:
+        conf["database"] = database
+    return PostgresConnector(**conf)
+
+
+def new_cassandra_connector(connection, database=None):
+    """Factory function to create a new CassandraConnector.
+
+    :param connection: the connection properties
+    :type connection: dict
+    :param database: the optional database name
+    :type database: str
+    """
+    from .cass import CassandraConnector
+
+    return CassandraConnector(database=database, **connection)
+
+
+def new_s3_connector(conf=None):
+    from .awss3 import S3Connector
+
+    conf = conf.copy()
+    return S3Connector(**conf)
+
+
+def new_elasticsearch_connector(conf=None):
+    from .es import ElasticSearchConnector
+
+    return ElasticSearchConnector(**conf)
+
+
+def new_ftp_connector(conf=None):
+    from .ftp import FtpConnector
+
+    conf = (conf or {}).copy()
+    return FtpConnector(**conf)
+
+
+def new_sftp_connector(conf):
+    from .sftp import SFtpConnector
+
+    return SFtpConnector(**conf)
+
+
+def new_mssql_connector(connection=None, database=None, is_azure=False):
+    from .mssql import AzureSQLServerConnector, MSSQLConnector
+
+    conf = connection.copy()
+    if database:
+        conf["database"] = database
+    if is_azure:
+        connector_cls = AzureSQLServerConnector
+    else:
+        connector_cls = MSSQLConnector
+    return connector_cls(**conf)
+
+
+def new_clickhouse_connector(connection=None, database=None, native=True):
+    conf = connection.copy()
+    if not native:
+        from .clickhouse import ClickHouseConnector
+    else:
+        from .clickhouse_native import ClickHouseConnector
+    return ClickHouseConnector(database=database, **conf)
+
+
+def new_phoenix_connector(connection=None, **kwargs):
+    from .hbase_phoenix import PhoenixConnector
+
+    conf = connection.copy()
+    return PhoenixConnector(**conf)
+
+
+def new_mongodb_connector(connection=None, **kwargs):
+    from .mongodb import MongoDBConnector
+
+    conf = connection.copy()
+    return MongoDBConnector(**conf)
+
+
+def new_google_bigquery_connector(*args, **kwargs):
+    from .google_bigquery import GoogleBigqueryConnector
+
+    return GoogleBigqueryConnector(*args, **kwargs)
+
+
+def new_feishu_connector(app_id=None, app_secret=None):
+    from .feishu import FeishuBot
+
+    conf = {}
+    if app_id:
+        conf["app_id"] = app_id
+        conf["app_secret"] = app_secret
+    return FeishuBot(**conf)
+
+
+def new_owncloud_connector(url: str = None, user: str = None, password: str = None, **kwargs):
+    from .owncloud import OwncloudConnector
+
+    conf = {}
+    if url and user and password:
+        conf["url"] = url
+        conf["user"] = user
+        conf["password"] = password
+        conf.update(kwargs)
+    else:
+        raise ValueError("You must provide owncloud URL, user and password.")
+    return OwncloudConnector(**conf)
+
+
+def new_sqlite_connector(in_memory: bool, max_memory_gb: int = 2, **kwargs):
+    if not in_memory:
+        raise ValueError("Currently only supports in-memory database.")
+    from .sqlite import SQLiteMemoryDbConnector
+
+    conf = {}
+    conf.update(kwargs)
+    return SQLiteMemoryDbConnector(max_memory_gb=max_memory_gb, **conf)
+
+
+def new_doris_connector(connection=None, database=None):
+    from .doris import DorisConnector
+
+    conf = connection.copy()
+    return DorisConnector(database=database, **conf)
+
+
+_factory_registry = {
+    "mysql": new_mysql_connector,
+    "tidb": new_tidb_connector,
+    "hive": new_hive_connector,
+    "impala": new_impala_connector,
+    "redshift": new_redshift_connector,
+    "cassandra": new_cassandra_connector,
+    "s3": new_s3_connector,
+    "elasticsearch": new_elasticsearch_connector,
+    "es": new_elasticsearch_connector,
+    "ftp": new_ftp_connector,
+    "azure_synapse": new_azure_synapse_connector,
+    "azure_blob": new_azure_blob_connector,
+    "mssql": new_mssql_connector,
+    "clickhouse": new_clickhouse_connector,
+    "clickhouse_native": partial(new_clickhouse_connector, native=True),
+    "phoenix": new_phoenix_connector,
+    "mongodb": new_mongodb_connector,
+    "gbq": new_google_bigquery_connector,
+    "google_bigquery": new_google_bigquery_connector,
+    "sqlite": new_sqlite_connector,
+    "postgres": new_postgresql_connector,
+    "doris": new_doris_connector,
+    "starrocks": new_starrocks_connector,
+}
+
+
+def get_connector(db_type, *args, **kwargs):
+    return _factory_registry[db_type](*args, **kwargs)
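
The module above is a thin factory layer: each new_*_connector function defers the heavy import to call time, and get_connector dispatches on the db_type key of _factory_registry. A minimal usage sketch follows; the connection keys (host, port, user, password) are illustrative assumptions, since the actual MySQLConnector signature lives in mysql.py and is not shown in this hunk.

from recurvedata.pigeon.connector import get_connector, new_mysql_connector

# Hypothetical connection properties; the real keys depend on MySQLConnector.
connection = {"host": "127.0.0.1", "port": 3306, "user": "etl", "password": "secret"}

# Dispatch through the registry by type name...
conn = get_connector("mysql", connection=connection, database="analytics")

# ...or call the factory directly; both paths build the same connector.
same_conn = new_mysql_connector(connection=connection, database="analytics")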
recurvedata/pigeon/connector/_registry.py
@@ -0,0 +1,17 @@
+from recurvedata.pigeon.utils import ensure_str_list
+
+_registry = {}
+
+
+class register_connector_class(object):
+    def __init__(self, ctype):
+        self.ctype = ensure_str_list(ctype)
+
+    def __call__(self, connector):
+        for t in self.ctype:
+            _registry[t] = connector
+        return connector
+
+
+def get_connector_class(ctype):
+    return _registry[ctype]
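
register_connector_class is a small decorator-based registry: the type name is normalized through ensure_str_list (which, judging by its name, may also accept a list of names, though that is not shown here) and each name is mapped to the decorated class for later lookup via get_connector_class. A sketch with a hypothetical connector class:

from recurvedata.pigeon.connector._registry import get_connector_class, register_connector_class

# Hypothetical class used only to illustrate registration and lookup.
@register_connector_class("dummy")
class DummyConnector:
    def __init__(self, **conf):
        self.conf = conf

assert get_connector_class("dummy") is DummyConnector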
recurvedata/pigeon/connector/aliyun_oss.py
@@ -0,0 +1,80 @@
+import logging
+import os
+
+import oss2
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.utils.timing import ProgressCallback
+
+
+@register_connector_class("oss")
+class OSSBucketConnector(object):
+    def __init__(self, access_key_id, access_key_secret, endpoint, bucket_name, **kwargs):
+        self.access_key_id = access_key_id
+        self.access_key_secret = access_key_secret
+        self.endpoint = endpoint
+        self.bucket_name = bucket_name
+
+        if not all((self.access_key_id, self.access_key_secret)):
+            logging.info("access_key_id or access_key_secret is missing, fallback to ")
+            self._auth = oss2.AnonymousAuth()
+        else:
+            self._auth = oss2.make_auth(self.access_key_id, self.access_key_secret)
+
+        self.bucket = oss2.Bucket(self._auth, self.endpoint, self.bucket_name)
+        proxies = kwargs.get("proxies")
+        if proxies:
+            # pass proxies to the underlying requests.Session
+            logging.info("use %s as proxies", proxies)
+            self.bucket.session.session.proxies = proxies
+
+    def has_object(self, key):
+        return self.bucket.object_exists(key)
+
+    def delete_key(self, key):
+        self.bucket.delete_object(key)
+
+    def delete_keys_by_prefix(self, prefix):
+        keys = []
+        batch_size = 100
+        for obj in oss2.ObjectIteratorV2(bucket=self.bucket, prefix=prefix):
+            keys.append(obj.key)
+            if len(keys) >= batch_size:
+                self.bucket.batch_delete_objects(keys)
+                keys = []
+        if keys:
+            self.bucket.batch_delete_objects(keys)
+
+    def get_keys(self, prefix="", delimiter=""):
+        keys = [x.key for x in oss2.ObjectIteratorV2(bucket=self.bucket, prefix=prefix, delimiter=delimiter)]
+        if delimiter:
+            keys = [x for x in keys if not x.endswith(delimiter)]
+
+        return keys
+
+    def upload(self, filename, key=None, folder=None, overwrite=True, num_threads=4, **kwargs):
+        if not key:
+            key = os.path.basename(filename)
+        if folder:
+            key = os.path.join(folder, key)
+
+        if not overwrite:
+            if self.has_object(key=key):
+                return key
+
+        oss2.resumable_upload(self.bucket, key, filename, progress_callback=ProgressCallback(), num_threads=num_threads)
+        return key
+
+    def download(self, key, folder=None, filename=None, overwrite=True, num_threads=4, **kwargs):
+        if not filename:
+            filename = os.path.basename(key)
+        if folder:
+            filename = os.path.join(folder, filename)
+
+        if not overwrite and os.path.exists(filename):
+            return filename
+
+        oss2.resumable_download(
+            self.bucket, key, filename, progress_callback=ProgressCallback(), num_threads=num_threads
+        )
+        return filename
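
A short usage sketch for OSSBucketConnector, assuming valid Aliyun credentials and an existing bucket; the credentials, endpoint, bucket, and paths below are placeholders, not values from this diff:

from recurvedata.pigeon.connector.aliyun_oss import OSSBucketConnector

oss = OSSBucketConnector(
    access_key_id="AK...",                              # placeholder credentials
    access_key_secret="SK...",
    endpoint="https://oss-cn-hangzhou.aliyuncs.com",    # placeholder endpoint
    bucket_name="my-bucket",
)

key = oss.upload("/tmp/report.csv", folder="exports")   # key becomes "exports/report.csv"
print(oss.get_keys(prefix="exports/"))
oss.download(key, folder="/tmp/downloads")
oss.delete_keys_by_prefix("exports/")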
recurvedata/pigeon/connector/awss3.py
@@ -0,0 +1,123 @@
+import os
+import urllib.parse
+
+import boto3
+import botocore.exceptions
+from botocore.config import Config
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.utils.timing import DisplayProgress
+
+
+@register_connector_class("s3")
+class S3Connector(object):
+    def __init__(self, aws_access_key_id, aws_secret_access_key, region="cn-north-1", proxies=None, **kwargs):
+        self.aws_access_key_id = aws_access_key_id
+        self.aws_secret_access_key = aws_secret_access_key
+        self.region = region
+
+        self.s3 = boto3.resource(
+            "s3",
+            region_name=self.region,
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            config=Config(proxies=proxies),
+        )
+
+    def create_bucket(self, bucket_name):
+        return self.s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": self.region})
+
+    def has_bucket(self, bucket_name):
+        exists = True
+        try:
+            self.s3.meta.client.head_bucket(Bucket=bucket_name)
+        except botocore.exceptions.ClientError as e:
+            error_code = int(e.response["Error"]["Code"])
+            if error_code == 404:
+                exists = False
+        return exists
+
+    def get_bucket(self, bucket_name):
+        if self.has_bucket(bucket_name):
+            return self.s3.Bucket(bucket_name)
+        return self.create_bucket(bucket_name)
+
+    def delete_bucket(self, bucket_name):
+        bucket = self.get_bucket(bucket_name)
+        for key in bucket.objects.all():
+            key.delete()
+        bucket.delete()
+
+    def has_object(self, bucket_name, key):
+        exists = True
+        try:
+            self.s3.meta.client.head_object(Bucket=bucket_name, Key=key)
+        except botocore.exceptions.ClientError as e:
+            error_code = int(e.response["Error"]["Code"])
+            if error_code == 404:
+                exists = False
+        return exists
+
+    @staticmethod
+    def parse_s3_url(s3url):
+        parsed_url = urllib.parse.urlparse(s3url)
+        if not parsed_url.netloc:
+            raise ValueError("Please provide a bucket_name")
+
+        bucket_name = parsed_url.netloc
+        key = parsed_url.path.strip("/")
+        return bucket_name, key
+
+    def delete_key(self, key, bucket_name=None):
+        if bucket_name is None:
+            bucket_name, key = self.parse_s3_url(key)
+        bucket = self.get_bucket(bucket_name)
+        bucket.Object(key).delete()
+
+    def delete_keys_by_prefix(self, bucket_name, prefix):
+        bucket = self.get_bucket(bucket_name)
+        for key in bucket.objects.filter(Prefix=prefix):
+            key.delete()
+
+    def get_keys(self, bucket_name, prefix=None):
+        bucket = self.get_bucket(bucket_name)
+        if prefix is not None:
+            all_keys = bucket.objects.filter(Prefix=prefix)
+        else:
+            all_keys = bucket.objects.all()
+
+        return [x.key for x in all_keys]
+
+    def upload(self, bucket_name, filename, key=None, folder=None, overwrite=True, **kwargs):
+        if not key:
+            key = os.path.basename(filename)
+        if folder:
+            key = os.path.join(folder, key)
+
+        if not overwrite:
+            if self.has_object(bucket_name=bucket_name, key=key):
+                return key
+
+        size = os.path.getsize(filename)
+        bucket = self.get_bucket(bucket_name)
+        with open(filename, "rb") as data:
+            bucket.upload_fileobj(data, key, Callback=DisplayProgress(size), **kwargs)
+        return key
+
+    def download(self, bucket_name, key, folder=None, filename=None, overwrite=True, **kwargs):
+        if not self.has_object(bucket_name, key):
+            raise ValueError(f"{key} not exists in {bucket_name}")
+
+        if not filename:
+            filename = os.path.basename(key)
+        if folder:
+            filename = os.path.join(folder, filename)
+
+        if not overwrite and os.path.exists(filename):
+            return filename
+
+        size = float(self.s3.meta.client.head_object(Bucket=bucket_name, Key=key)["ContentLength"])
+        bucket = self.get_bucket(bucket_name)
+        with open(filename, "wb") as data:
+            bucket.download_fileobj(key, data, Callback=DisplayProgress(size), **kwargs)
+        return filename
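
Usage sketch for S3Connector, again with placeholder credentials, bucket, and paths; note that get_bucket falls back to create_bucket when the bucket does not exist, so upload can target a fresh bucket name:

from recurvedata.pigeon.connector.awss3 import S3Connector

s3 = S3Connector(
    aws_access_key_id="AKIA...",          # placeholder credentials
    aws_secret_access_key="...",
    region="cn-north-1",
)

# parse_s3_url splits an s3:// URL into (bucket, key)
bucket, key = S3Connector.parse_s3_url("s3://my-bucket/exports/report.csv")

s3.upload("my-bucket", "/tmp/report.csv", folder="exports")
print(s3.get_keys("my-bucket", prefix="exports/"))
s3.download("my-bucket", "exports/report.csv", folder="/tmp/downloads")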
recurvedata/pigeon/connector/azure_blob.py
@@ -0,0 +1,176 @@
+import logging
+import os
+from typing import List, Optional
+
+from azure.core.exceptions import ResourceExistsError
+from azure.identity import AzureAuthorityHosts, ClientSecretCredential
+from azure.storage.blob import BlobServiceClient, StorageStreamDownloader
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.utils.timing import DisplayProgress
+
+logger = logging.getLogger(__name__)
+
+
+@register_connector_class("azure_blob")
+class AzureBlobConnector:
+    """Connector for Azure Blob Storage.
+
+    Four ways to config:
+    - using connection_string
+    - using account_url + sas_token
+    - using endpoint_suffix + account_name + sas_token
+    - using endpoint_suffix + account_name + tenant_id + client_id + client_secret
+
+    kwargs:
+        spn_authority_host: authority host for spn, default is AzureAuthorityHosts.AZURE_CHINA
+    """
+
+    def __init__(
+        self,
+        connection_string: str = None,
+        account_url: str = None,
+        endpoint_suffix: str = "core.chinacloudapi.cn",
+        account_name: str = None,
+        sas_token: str = None,
+        tenant_id: str = None,
+        client_id: str = None,
+        client_secret: str = None,
+        **kwargs,
+    ):
+        self.conn_string = connection_string
+        self.account_url = account_url
+        self.endpoint_suffix = endpoint_suffix
+        self.account_name = account_name
+        self.sas_token = sas_token
+        self.kwargs = kwargs
+        self.spn_authority_host = self.kwargs.get("spn_authority_host") or AzureAuthorityHosts.AZURE_CHINA
+
+        authorize_by_conn_string = False
+        authorize_by_sas_token = False
+        authorize_by_spn_secret = False
+        if connection_string:
+            authorize_by_conn_string = True
+        if sas_token and (account_url or all((account_name, endpoint_suffix))):
+            authorize_by_sas_token = True
+        if tenant_id and client_id and client_secret and all((account_name, endpoint_suffix)):
+            authorize_by_spn_secret = True
+
+        if not any((authorize_by_conn_string, authorize_by_sas_token, authorize_by_spn_secret)):
+            raise ValueError(
+                """
+                invalid authorization info
+                Four ways to config:
+                - using connection_string
+                - using account_url + sas_token
+                - using endpoint_suffix + account_name + sas_token
+                - using endpoint_suffix + account_name + tenant_id + client_id + client_secret
+                """
+            )
+
+        if authorize_by_conn_string:
+            self.blob_service = BlobServiceClient.from_connection_string(connection_string, **kwargs)
+        elif authorize_by_sas_token:
+            if not account_url:
+                account_url = f"https://{account_name}.blob.{endpoint_suffix}"
+            self.blob_service = BlobServiceClient(account_url, credential=sas_token, **kwargs)
+        else:
+            credential = ClientSecretCredential(tenant_id, client_id, client_secret, authority=self.spn_authority_host)
+            account_url = f"https://{account_name}.blob.{endpoint_suffix}"
+            self.blob_service = BlobServiceClient(account_url=account_url, credential=credential)
+
+    @property
+    def account_key(self) -> Optional[str]:
+        if not self.conn_string:
+            return None
+        kvs = self.parse_conn_string(self.conn_string)
+        return kvs["accountkey"]
+
+    @staticmethod
+    def parse_conn_string(conn_string: str):
+        parts = conn_string.strip(";").split(";")
+        kvs = {}
+        for p in parts:
+            k, v = p.split("=", 1)
+            kvs[k.lower()] = v
+        return kvs
+
+    def get_url(self, container: str, blob: str) -> str:
+        return f"https://{self.blob_service.primary_hostname}/{container}/{blob}"
+
+    def create_container(self, container_name: str, exist_ok=True):
+        """create container"""
+        try:
+            return self.blob_service.create_container(container_name)
+        except ResourceExistsError as e:
+            if exist_ok:
+                logger.info(f"container {container_name} already exists, skip")
+            else:
+                raise e
+
+    def delete_container(self, container_name: str, **kwargs):
+        """if container not exists, error will be suppressed with the fail_not_exist parameter"""
+        self.blob_service.delete_container(container_name, **kwargs)
+
+    def exists(self, container_name: str, blob_name: str = None, **kwargs) -> bool:
+        """
+        if blob name is none, check whether container exists or not
+        if blob name specified, check blob exists or not in the container
+        """
+        if blob_name is None:
+            client = self.blob_service.get_container_client(container_name)
+        else:
+            client = self.blob_service.get_blob_client(container_name, blob_name)
+        return client.exists(**kwargs)
+
+    def delete_blob(self, container_name, blob_name, **kwargs):
+        container = self.blob_service.get_container_client(container_name)
+        container.delete_blob(blob_name, **kwargs)
+
+    def list_blobs(self, container_name, name_starts_with=None, include=None, **kwargs) -> List[str]:
+        container = self.blob_service.get_container_client(container_name)
+        generator = container.list_blobs(name_starts_with=name_starts_with, include=include, **kwargs)
+        return [blob.name for blob in generator]
+
+    def upload(self, container_name, local_file_path, blob_name=None, overwrite=True, is_progress_hook=True, **kwargs):
+        """
+        Upload local file to container with specified blob name.
+        The specified container will also be created if not exists.
+        """
+        if not blob_name:
+            blob_name = os.path.basename(local_file_path)
+
+        # container_blob = f'{container_name}/{blob_name}'
+        blob = self.blob_service.get_blob_client(container_name, blob_name)
+        if not overwrite and blob.exists():
+            logger.info("Blob exists, skip!")
+            return blob_name
+
+        size = os.path.getsize(local_file_path)
+        options = {"overwrite": True, "max_concurrency": 4}
+        if is_progress_hook:
+            options["progress_hook"] = DisplayProgress(size, stream=False)
+
+        options.update(kwargs)
+        with open(local_file_path, "rb") as data:
+            blob.upload_blob(data, **options)
+        return blob_name
+
+    def download(self, container_name, blob_name, local_file_path, **kwargs):
+        """download blob to local"""
+        blob = self.blob_service.get_blob_client(container_name, blob_name)
+        size = blob.get_blob_properties().size
+        if size == 0:
+            logging.warning("blob %s has no content, create an empty file and exit", blob_name)
+            with open(local_file_path, "w"):
+                return
+
+        options = {
+            "max_concurrency": 4,
+            "progress_hook": DisplayProgress(size, stream=False),
+        }
+        options.update(kwargs)
+        with open(local_file_path, "wb") as f:
+            data: StorageStreamDownloader = blob.download_blob(**options)
+            data.readinto(f)
+        return local_file_path
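
Usage sketch for AzureBlobConnector using the account_name + sas_token route described in its docstring; the account, container, token, and paths are placeholders rather than values from this diff:

from recurvedata.pigeon.connector.azure_blob import AzureBlobConnector

blob_conn = AzureBlobConnector(
    account_name="myaccount",                 # placeholder storage account
    sas_token="?sv=...",                      # placeholder SAS token
    endpoint_suffix="core.chinacloudapi.cn",
)

blob_conn.create_container("exports", exist_ok=True)
name = blob_conn.upload("exports", "/tmp/report.csv")    # blob name defaults to the file basename
print(blob_conn.list_blobs("exports", name_starts_with="report"))
blob_conn.download("exports", name, "/tmp/downloads/report.csv")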