recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of recurvedata-lib has been flagged as possibly problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/redshift.py
@@ -0,0 +1,123 @@
+ import hashlib
+ import os
+
+ import cytoolz as toolz
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.connector.awss3 import S3Connector
+ from recurvedata.pigeon.connector.postgresql import PostgresConnector, canonical_type_to_pg_type
+ from recurvedata.pigeon.utils import fs
+
+
+ @register_connector_class("redshift")
+ class RedshiftConnector(PostgresConnector):
+     _max_text = "VARCHAR(MAX)"
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.s3_bucket_name = self.kwargs.get("s3_options", {}).get("bucket")
+
+     def is_redshift(self):
+         return True
+
+     @toolz.memoize
+     def create_s3_connector(self):
+         s3_options = self.kwargs.get("s3_options")
+         if not s3_options:
+             return None
+         return S3Connector(**s3_options)
+
+     def load_csv(
+         self,
+         table,
+         filename,
+         schema="public",
+         columns=None,
+         delimiter=",",
+         quotechar='"',
+         lineterminator="\r\n",
+         escapechar=None,
+         skiprows=0,
+         using_insert=False,
+         **kwargs,
+     ):
+         table = self._format_table_name(table, schema)
+         s3 = self.create_s3_connector()
+         if using_insert or s3 is None:
+             self.load_csv_by_inserting(
+                 table, filename, columns, delimiter, quotechar, lineterminator, escapechar, skiprows=skiprows, **kwargs
+             )
+         else:
+             self.load_csv_by_s3(table, filename, columns, skiprows, **kwargs)
+
+     def load_csv_by_s3(self, table, filename, columns=None, skiprows=0, **kwargs):
+         s3 = self.create_s3_connector()
+         bucket = self.generate_s3_bucket_name()
+         if filename.endswith(".gz"):
+             file_to_upload = filename
+         else:
+             self.logger.info("compressing %s", filename)
+             file_to_upload = fs.gzip_compress(filename, using_cmd=True)
+         s, t = self._get_schema_table(table, schema=None)
+         key_name = f"{self.database}/{s}/{t}/{os.path.basename(file_to_upload)}"
+         key_uri = self.format_s3_key_uri(bucket, key_name)
+         self.logger.info("upload %s to %s", file_to_upload, key_uri)
+         s3.upload(bucket, file_to_upload, key_name)
+
+         if columns:
+             field_names = "({})".format(", ".join([self.quote_identifier(x) for x in columns]))
+         else:
+             field_names = ""
+
+         # TODO: null
+         if skiprows:
+             ignore_header = f"IGNOREHEADER AS {int(skiprows)}"
+         else:
+             ignore_header = ""
+         stmt = f"""
+             COPY {table} {field_names} FROM '{key_uri}'
+             credentials 'aws_access_key_id={s3.aws_access_key_id};aws_secret_access_key={s3.aws_secret_access_key}'
+             region '{s3.region}'
+             CSV GZIP ACCEPTINVCHARS EMPTYASNULL {ignore_header}
+         """
+
+         try:
+             self.logger.info("running COPY command")
+             self.execute(stmt, autocommit=False, commit_on_close=True)
+             self.logger.info("COPY finished")
+         except Exception as e:
+             self.logger.exception("failed to copy data to Redshift")
+             raise e
+         finally:
+             if file_to_upload != filename:
+                 self.logger.info("delete %s", file_to_upload)
+                 fs.remove_files_safely(file_to_upload)
+
+             self.logger.info("delete S3 file: %s", key_uri)
+             try:
+                 s3.delete_key(key_name, bucket)
+             except Exception as e:
+                 self.logger.error(f"operation on s3 bucket fails: {e}")
+
+     @staticmethod
+     def from_canonical_type(canonical_type, size):
+         rv = canonical_type_to_pg_type.get(canonical_type, "VARCHAR(MAX)")
+         if rv == "TEXT":
+             rv = "VARCHAR(MAX)"
+         return rv
+
+     @staticmethod
+     def get_key_name(filename):
+         return os.path.basename(filename)
+
+     @staticmethod
+     def format_s3_key_uri(bucket, key_name):
+         return f"s3://{bucket}/{key_name}"
+
+     def generate_s3_bucket_name(self):
+         if self.s3_bucket_name:
+             return self.s3_bucket_name
+         cluster_name = self.host.split(".", 1)[0]
+         digest = hashlib.md5(self.host.encode()).hexdigest()
+         return f"pigeon-{cluster_name}-{digest[:15]}"
recurvedata/pigeon/connector/sftp.py
@@ -0,0 +1,73 @@
+ import os
+ import shutil
+
+ import paramiko
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.utils import LoggingMixin
+ from recurvedata.pigeon.utils.timing import DisplayProgress
+
+
+ @register_connector_class('sftp')
+ class SFtpConnector(LoggingMixin):
+     def __init__(self, host, port, username, password, rsa_private_key_file: str = None):
+         self.host = host
+         self.port = port
+         self.username = username
+         self.password = password
+         client = paramiko.Transport((self.host, self.port))
+         if rsa_private_key_file and password:
+             private_key = paramiko.RSAKey.from_private_key_file(rsa_private_key_file)
+             client.start_client(event=None, timeout=15)
+             client.get_remote_server_key()
+             client.auth_publickey(self.username, private_key, event=None)
+             client.auth_password(self.username, self.password, event=None)
+         elif rsa_private_key_file:
+             private_key = paramiko.RSAKey.from_private_key_file(rsa_private_key_file)
+             client.connect(username=self.username, pkey=private_key)
+         else:
+             client.connect(username=self.username, password=self.password)
+         self.sftp = paramiko.SFTPClient.from_transport(client)
+
+     def close(self):
+         self.sftp.close()
+
+     def rename(self, from_name, to_name):
+         self.sftp.rename(from_name, to_name)
+
+     def makedir(self, path):
+         self.sftp.mkdir(path)
+
+     def rmdir(self, path):
+         self.sftp.rmdir(path)
+
+     def rm(self, name):
+         self.sftp.remove(name)
+
+     def pwd(self):
+         return self.sftp.getcwd()
+
+     def size(self, name):
+         return self.sftp.stat(name).st_size
+
+     def download_file(self, src_file, dst_file):
+         exists = True
+         local_dir = os.path.dirname(dst_file)
+         if not os.path.exists(local_dir):
+             exists = False
+             os.makedirs(local_dir)
+         try:
+             size = self.size(src_file)
+             self.sftp.get(src_file, dst_file, callback=DisplayProgress(size, stream=False))
+             self.logger.info(f'successfully downloaded {src_file} to {dst_file}')
+         except Exception as e:
+             os.unlink(dst_file)
+             if not exists:
+                 shutil.rmtree(local_dir)
+
+             self.logger.exception(f'failed to download {src_file}, reason:{e}')
+             raise e
+
+     def upload_file(self, src_file, dst_file):
+         self.sftp.put(src_file, dst_file, callback=DisplayProgress(stream=False))
+         self.logger.info(f'successfully uploaded {src_file} to {dst_file}')
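
A short usage sketch with placeholder host and credentials. Passing rsa_private_key_file switches to key-based authentication (or key-plus-password two-step auth when both are given):

    from recurvedata.pigeon.connector.sftp import SFtpConnector

    # Placeholder connection details for illustration only.
    conn = SFtpConnector(host="sftp.example.com", port=22, username="demo", password="secret")
    conn.upload_file("/tmp/report.csv", "/upload/report.csv")
    conn.download_file("/upload/report.csv", "/tmp/report_copy.csv")
    conn.close()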
recurvedata/pigeon/connector/sqlite.py
@@ -0,0 +1,42 @@
+ import sqlite3
+
+ import pandas as pd
+
+
+ class SQLiteMemoryDbConnector:
+     """
+     SQLite in-memory database connector for running SQL queries against in-memory pandas DataFrames; supports standard SQL syntax.
+     For now only the in-memory database is supported; it does not connect to physical tables or databases.
+     All tables and data are lost once the Python process exits.
+     e.g.
+         sqlite_conn = SQLiteMemoryDbConnector(max_memory_gb=1)
+         df_1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+         df_2 = pd.DataFrame({'a': [4, 5, 6], 'b': [7, 8, 9]})
+         sqlite_conn.create_temp_table(df_1, 'df_1')
+         sqlite_conn.create_temp_table(df_2, 'df_2')
+         result = sqlite_conn.get_pandas_df('SELECT *, row_number() over(partition by a order by b) as rn FROM df_1')
+     """
+
+     def __init__(self, max_memory_gb, **kwargs):
+         self.conn = sqlite3.connect(database=':memory:', **kwargs)  # create an in-memory SQLite database
+         self.conn.execute(f"PRAGMA max_memory = {max_memory_gb * 1024 * 1024}")
+         self.cursor = self.conn.cursor()
+         self.loaded_tables = set()
+
+     def create_temp_table(self, df, table_name):
+         """Write a table into the in-memory database."""
+         df.to_sql(table_name, self.conn, index=False, if_exists='replace')
+
+     def drop_temp_table(self, table_name):
+         """Drop a table in the in-memory database."""
+         self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
+
+     def get_pandas_df(self, query: str) -> pd.DataFrame:
+         """
+         :param query: SQL query statement
+         :return: pandas DataFrame
+         """
+         return pd.read_sql_query(query, self.conn)
+
+     def close(self):
+         self.conn.close()
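
Building on the class docstring, a sketch that joins the two registered DataFrames (the max_memory_gb value is arbitrary):

    import pandas as pd

    from recurvedata.pigeon.connector.sqlite import SQLiteMemoryDbConnector

    conn = SQLiteMemoryDbConnector(max_memory_gb=1)
    conn.create_temp_table(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), "df_1")
    conn.create_temp_table(pd.DataFrame({"a": [1, 2, 7], "c": [7, 8, 9]}), "df_2")
    # Standard SQL, including joins and window functions, runs against the
    # registered frames; rows only match where column "a" overlaps.
    joined = conn.get_pandas_df("SELECT a, b, c FROM df_1 JOIN df_2 USING (a)")
    conn.close()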
recurvedata/pigeon/connector/starrocks.py
@@ -0,0 +1,144 @@
+ import json
+ import subprocess
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.connector.mysql import MySQLConnector
+ from recurvedata.pigeon.schema import types
+
+ _canonical_type_to_starrocks_type = {
+     types.BOOLEAN: "TINYINT",
+     types.INT8: "TINYINT",
+     types.INT16: "SMALLINT",
+     types.INT32: "INT",
+     types.INT64: "BIGINT",
+     types.FLOAT32: "FLOAT",
+     types.FLOAT64: "DOUBLE",
+     types.DATE: "DATE",
+     types.DATETIME: "DATETIME",
+     types.STRING: "STRING",
+     types.JSON: "STRING",
+ }
+
+
+ @register_connector_class(["starrocks"])
+ class StarRocksConnector(MySQLConnector):
+     _default_port = 9030
+     _default_fe_http_port = 8030
+
+     def __init__(self, host, port=None, http_port=None, database=None, user=None, password=None, *args, **kwargs):
+         self.http_port = http_port or self._default_fe_http_port
+         self.user = user
+         self.password = password
+         super().__init__(host=host, port=port, database=database, user=user, password=password, *args, **kwargs)
+
+     @property
+     def load_strict_mode(self) -> bool:
+         if not hasattr(self, "_load_strict_mode"):
+             return False
+         return self._load_strict_mode
+
+     @load_strict_mode.setter
+     def load_strict_mode(self, mode: bool):
+         self._load_strict_mode = mode
+
+     def _load_csv_mysql(
+         self,
+         table,
+         filename,
+         columns=None,
+         delimiter=",",
+         quotechar='"',
+         lineterminator="\r\n",
+         escapechar=None,
+         skiprows=0,
+         **kwargs,
+     ):
+         """
+         stream load data from csv file into table
+         """
+
+         def _split_database_table(table_name: str):
+             tmp_lst = table_name.split(".")
+             if len(tmp_lst) == 1:
+                 return self.database, table_name
+             return tmp_lst
+
+         db_name, table_name = _split_database_table(table)
+         shell_cmd = self._format_load_shell(filename, db_name, table_name)
+
+         # Set only authentication environment variables
+         _env = {}
+         if self.user is not None:
+             _env["STARROCKS_USER"] = self.user
+         if self.password is not None:
+             _env["STARROCKS_PASSWORD"] = self.password
+
+         output = subprocess.check_output(shell_cmd, env=_env, shell=True)
+         res_txt = output.decode()
+         if res_txt:
+             res = json.loads(res_txt)
+             self._log(res_txt)
+
+             if res["Status"] != "Success":
+                 if "ErrorURL" not in res:
+                     err_output = res["Message"]
+                 else:
+                     err_url = res["ErrorURL"]
+                     err_output = subprocess.check_output(["curl", err_url])
+                 self._log(f"error: {err_output}")
+                 raise Exception("load csv failed")
+
+     def _format_load_shell(self, filename: str, db_name: str, table_name: str) -> str:
+         def __format_column_mapping(db_name: str, table_name: str) -> str:
+             """
+             Stream load has a bug: the closing quote of the last column is not stripped.
+             Work around it in the column mapping.
+             """
+             columns = self.get_columns(table_name, db_name)
+             # Extract column names from the column metadata
+             column_names = [col["name"] for col in columns]
+             # Escape backticks to prevent shell interpretation
+             cols = [f"\\`{col}\\`" for col in column_names]
+             cols_txt = ",".join(cols)
+             cols2 = [
+                 f"\\`{col}\\`=trim(\\`{col}\\`,'\\\"')" if col == column_names[-1] else f"\\`{col}\\`=\\`{col}\\`"
+                 for col in column_names
+             ]
+             cols_txt2 = ", ".join(cols2)
+             return f"columns: {cols_txt}, {cols_txt2}"
+
+         def __format_stream_load_url(db_name: str, table_name: str) -> str:
+             return f"http://{self.host}:{self.http_port}/api/{db_name}/{table_name}/_stream_load"
+
+         db_name, table_name = db_name.strip("`"), table_name.strip("`")
+         url = __format_stream_load_url(db_name, table_name)
+         strict_mode = "true" if self.load_strict_mode else "false"
+         column_mapping = __format_column_mapping(db_name, table_name)
+
+         # Handle authentication based on whether password is provided
+         if self.password is not None:
+             auth_part = "-u $STARROCKS_USER:$STARROCKS_PASSWORD"
+         else:
+             auth_part = "-u $STARROCKS_USER:"
+
+         # Construct the full curl command with properly escaped quotes
+         return (
+             f"curl --location-trusted {auth_part} "
+             f'-H "Expect:100-continue" '
+             f'-H "column_separator:," '
+             f'-H "enclose:\\"" '
+             f'-H "trim_double_quotes:true" '
+             f'-H "strict_mode:{strict_mode}" '
+             f'-H "escape:\'" '
+             f'-H "{column_mapping}" '
+             f"-T {filename} -XPUT "
+             f"{url}"
+         ).strip()
+
+     @staticmethod
+     def from_canonical_type(canonical_type, size):
+         if canonical_type == types.STRING:
+             starrocks_type = "STRING"
+         else:
+             starrocks_type = _canonical_type_to_starrocks_type.get(canonical_type, "STRING")
+         return starrocks_type
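
A usage sketch for the Stream Load path above. The endpoint is a placeholder, and it is assumed that load_csv on the MySQLConnector base routes CSV loads through _load_csv_mysql; the private hook is called directly here only for illustration:

    from recurvedata.pigeon.connector.starrocks import StarRocksConnector

    # Placeholder endpoint; 9030 is the default MySQL-protocol port,
    # 8030 the FE HTTP port used for Stream Load.
    conn = StarRocksConnector(host="starrocks.example.com", database="demo", user="root", password="...")
    conn.load_strict_mode = True  # reject rows that fail type conversion
    # Builds and runs a curl PUT against
    # http://starrocks.example.com:8030/api/demo/events/_stream_load
    conn._load_csv_mysql("events", "/tmp/events.csv")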
recurvedata/pigeon/connector/tableau.py
@@ -0,0 +1,162 @@
+ """
+ official API doc: https://tableau.github.io/server-client-python/docs/api-ref#views
+ tableau-api-lib: https://github.com/divinorum-webb/tableau-api-lib
+ """
+ import logging
+ import time
+ from typing import Optional
+
+ import pandas as pd
+ import tableauserverclient as TSC
+ from tableau_api_lib import TableauServerConnection
+ from tableau_api_lib.utils.querying import (
+     get_datasource_connections_dataframe,
+     get_datasources_dataframe,
+     get_embedded_datasources_dataframe,
+     get_projects_dataframe,
+     get_sites_dataframe,
+     get_views_dataframe,
+     get_workbooks_dataframe,
+ )
+ from tableauserverclient import Server
+
+
+ class TableauConnector:
+     def __init__(self, user: str = None, password: str = None, server_url: str = None, site: str = None):
+         self.user = user
+         self.password = password
+         self.server_url = server_url
+         self.site = site
+         self.server: Optional[Server] = None
+         self.library: Optional[TableauServerConnection] = None
+         self._sign_in()
+
+     def _sign_in(self):
+         tableau_auth = TSC.TableauAuth(self.user, self.password, self.site)
+         self.server = TSC.Server(
+             self.server_url, use_server_version=True
+         )  # https://community.tableau.com/s/question/0D54T00000ti0eOSAQ/api-version-upgrade
+         self.server.auth.sign_in(tableau_auth)
+
+         self.library = TableauServerConnection(
+             {
+                 "my_env": {
+                     "api_version": self.server.version,
+                     "server": self.server_url,
+                     "username": self.user,
+                     "password": self.password,
+                     "site_name": self.site,
+                     "site_url": self.site,
+                 }
+             },
+             env="my_env",
+         )
+         self.library.sign_in()
+
+     @staticmethod
+     def check_columns(df: pd.DataFrame, cols: list) -> pd.DataFrame:
+         if not cols:
+             return df
+         if not set(cols).issubset(df.columns):
+             raise Exception(
+                 f"Contains wrong columns: target dataframe has {df.columns.to_list()} columns, while input has {cols}"
+             )
+         return df[cols]
+
+     def get_sites_df(self, cols: list = None):
+         df = get_sites_dataframe(self.library).rename(columns={"id": "site_id", "name": "site_name"})
+         return self.check_columns(df, cols)
+
+     def get_projects_df(self, cols: list = None):
+         df = get_projects_dataframe(self.library).rename(columns={"id": "project_id", "name": "project_name"})
+         return self.check_columns(df, cols)
+
+     def get_workbooks_df(self, cols: list = None):
+         df = get_workbooks_dataframe(self.library).rename(columns={"id": "workbook_id", "name": "workbook_name"})
+         df["project_id"], df["project_name"] = zip(*df["project"].apply(lambda x: (x["id"], x["name"])))
+         # df.drop(columns=["project"], inplace=True)
+         return self.check_columns(df, cols)
+
+     def get_views_df(self, cols: list = None):
+         df = get_views_dataframe(self.library).rename(columns={"id": "view_id", "name": "view_name"})
+         df["workbook_id"], df["workbook_name"] = zip(*df["workbook"].apply(lambda x: (x["id"], x["name"])))
+         project_df = self.get_projects_df(cols=["project_id", "project_name"])
+         df["project_id"] = df["project"].apply(lambda x: x["id"])
+         df = df.merge(project_df, on="project_id", how="left")
+         return self.check_columns(df, cols)
+
+     def get_datasources_df(self, cols: list = None):
+         df = get_datasources_dataframe(self.library).rename(columns={"id": "datasource_id", "name": "datasource_name"})
+         df["project_id"], df["project_name"] = zip(*df["project"].apply(lambda x: (x["id"], x["name"])))
+         return self.check_columns(df, cols)
+
+     def get_embedded_datasources_df(self, workbook_df: pd.DataFrame, cols: list = None):
+         """
+         Filter down to the target workbooks first, otherwise this is very slow.
+         """
+         df = get_embedded_datasources_dataframe(
+             self.library, workbook_df, id_col="workbook_id", name_col="workbook_name"
+         )
+         return self.check_columns(df, cols)
+
+     def get_datasource_connections_df(self, datasources_df: pd.DataFrame = None, cols: list = None):
+         if datasources_df is None:
+             datasources_df = self.get_datasources_df(cols=["datasource_id", "datasource_name"])
+         else:
+             datasources_df = self.check_columns(datasources_df, cols=["datasource_id", "datasource_name"])
+         connections = []
+         for index, row in datasources_df.iterrows():
+             ds_conn = get_datasource_connections_dataframe(self.library, row["datasource_id"])
+             ds_conn["datasource_id"] = row["datasource_id"]
+             connections.append(ds_conn)
+         connections_df = pd.concat(connections, ignore_index=True)
+         connections_df = connections_df.merge(datasources_df, on="datasource_id", how="left")
+         return self.check_columns(connections_df, cols)
+
+     def get_job_status(self, job_id: str):
+         return self.server.jobs.get_by_id(job_id)
+
+     def wait_to_finish(self, job_id, timeout, retry_interval):
+         abort_time = time.time() + timeout
+         job_info = self.get_job_status(job_id)
+         while job_info.completed_at is None:
+             logging.info(
+                 f"finish_code: {job_info.finish_code}, progress: {job_info.progress} %. Sleep for {retry_interval} s."
+             )
+             time.sleep(retry_interval)
+             if time.time() > abort_time:
+                 logging.warning(f"Timeout {timeout} s. Job_info: {job_info}")
+                 break
+             job_info = self.get_job_status(job_id)
+         if job_info.finish_code != 0:
+             logging.warning(f"Job {job_id} is not success")
+         return job_info
+
+     def refresh_workbook(self, workbook_id: str, timeout=600, retry_interval=5):
+         logging.info(f"Start refreshing workbook: {workbook_id}")
+         res = self.server.workbooks.refresh(workbook_id)
+         job_info = self.wait_to_finish(res.id, timeout, retry_interval)
+         logging.info(f"Finish refreshing: {job_info}")
+
+     def refresh_datasource(self, datasource_id: str, timeout=600, retry_interval=5):
+         logging.info(f"Start refreshing datasource: {datasource_id}")
+         res = self.server.datasources.refresh(datasource_id)
+         job_info = self.wait_to_finish(res.id, timeout, retry_interval)
+         logging.info(f"Finish refreshing: {job_info}")
+
+     def screenshot(self, workbook_id: str, view_id: str, save_path: str, maxage: int = 1):
+         """
+         Screenshots may lag behind the latest data.
+         """
+         logging.info(f"Start taking screenshot with workbook_id {workbook_id}, view_id {view_id}")
+         workbook = self.server.workbooks.get_by_id(workbook_id)
+         self.server.workbooks.populate_views(workbook)
+
+         view = self.server.views.get_by_id(view_id)
+         image_req_option = TSC.ImageRequestOptions(
+             imageresolution=TSC.ImageRequestOptions.Resolution.High, maxage=maxage
+         )
+         self.server.views.populate_image(view, image_req_option)
+         with open(save_path, "wb") as f:
+             f.write(view.image)
+         logging.info(f"Finish saving screenshot to {save_path}")
recurvedata/pigeon/const.py
@@ -0,0 +1,21 @@
+ HIVE_FIELD_DELIMITER = chr(1)
+ HIVE_ARRAY_DELIMITER = chr(2)
+ HIVE_MAP_ITEM_DELIMITER = chr(2)
+ HIVE_MAP_KV_DELIMITER = chr(3)
+ HIVE_NULL = r"\N"
+
+ LOAD_RENAME_OVERWRITE = "RENAME_OVERWRITE"
+ LOAD_OVERWRITE = "OVERWRITE"
+ LOAD_MERGE = "MERGE"
+ LOAD_APPEND = "APPEND"
+
+ HIVE_FILE_FORMATS = {
+     "text": "TEXTFILE",
+     "sequence": "SEQUENCEFILE",
+     "parquet": "PARQUET",  # http://parquet.apache.org/documentation/latest/
+     "orc": "ORC",  # optimized row columnar file
+     "rc": "RCFILE",  # record columnar file
+     "avro": "AVRO",  # Apache Avro™ (http://avro.apache.org/docs/current/)
+ }
+
+ CLICKHOUSE_MAX_ROW_BUFFER = 10000
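
For context, these control characters are Hive's default text-format delimiters. A sketch of the kind of DDL they correspond to (the table and columns are illustrative, not from this package):

    # chr(1) is '\x01', conventionally written with the octal escape '\001'
    # in Hive DDL; chr(2) and chr(3) likewise. HIVE_NULL is the literal \N marker.
    ddl = """
    CREATE TABLE demo_events (id BIGINT, tags ARRAY<STRING>, attrs MAP<STRING, STRING>)
    ROW FORMAT DELIMITED
      FIELDS TERMINATED BY '\\001'
      COLLECTION ITEMS TERMINATED BY '\\002'
      MAP KEYS TERMINATED BY '\\003'
      NULL DEFINED AS '\\N'
    STORED AS TEXTFILE
    """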