recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/hdfs.py
@@ -0,0 +1,204 @@
+ import concurrent.futures
+ import logging
+ import os
+ import shutil
+
+ from pywebhdfs.webhdfs import PyWebHdfsClient
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.utils import extract_dict, mp
+
+
+ @register_connector_class('hdfs')
+ class HDFSConnector(object):
+     def __init__(self, host, port, username=None, user_name=None, **kwargs):
+         self.host = host
+         self.port = port
+         self.user_name = username or user_name
+         extra_opts = extract_dict(kwargs, ['path_to_hosts', 'timeout', 'base_uri_pattern', 'request_extra_opts'])
+         self.hdfs = PyWebHdfsClient(host=self.host, port=self.port, user_name=self.user_name, **extra_opts)
+
+     def list_dir(self, path):
+         return self.hdfs.list_dir(path)
+
+     def make_dir(self, path, **kwargs):
+         return self.hdfs.make_dir(path, **kwargs)
+
+     def delete_file(self, path, recursive=False):
+         return self.hdfs.delete_file_dir(path, recursive=recursive)
+
+     def upload_file(self, local_path, hdfs_path=None, overwrite=True):
+         if not hdfs_path:
+             hdfs_path = os.path.basename(local_path)
+
+         if not os.path.dirname(hdfs_path):
+             hdfs_path = os.path.join('/tmp', hdfs_path)
+
+         self.delete_file(hdfs_path)
+
+         with open(local_path, 'rb') as data:
+             self.hdfs.create_file(hdfs_path, data, overwrite=overwrite)
+         return hdfs_path
+
+     def upload_files(self, local_paths, hdfs_folder, num_threads=2):
+         """``num_threads`` is currently not used; files are uploaded sequentially."""
+         for lf in local_paths:
+             hdfs_filename = os.path.join(hdfs_folder, os.path.basename(lf))
+             self.upload_file(lf, hdfs_filename, overwrite=True)
+             logging.info(f'uploaded {lf} to {hdfs_filename}')
+
+
+ class HDFSCliConnector(HDFSConnector):
+     def __init__(self, hdfs_cli=None, **kwargs):
+         if not hdfs_cli:
+             hdfs_cli = shutil.which('hdfs')
+         if not hdfs_cli:
+             raise ValueError('could not locate hdfs command line')
+         self.hdfs_cli = hdfs_cli
+
+     def list_dir(self, path):
+         raise NotImplementedError
+
+     def make_dir(self, path, **kwargs):
+         self._run_cmd(f'-mkdir {path}')
+
+     def delete_file(self, path, recursive=False):
+         self._run_cmd(f'-rm {"-r" if recursive else ""} -f {path}')
+
+     def upload_file(self, local_path, hdfs_path=None, overwrite=True):
+         if not hdfs_path:
+             hdfs_path = os.path.basename(local_path)
+
+         if not os.path.dirname(hdfs_path):
+             hdfs_path = os.path.join('/tmp', hdfs_path)
+         self._run_cmd(f'-put {"-f" if overwrite else ""} {local_path} {hdfs_path}')
+         return hdfs_path
+
+     def upload_files(self, local_paths, hdfs_folder, num_threads=2):
+         local_path_groups = partition_files_equally(local_paths, num_threads)
+         sub_cmds = [f'-put {" ".join(x)} {hdfs_folder}' for x in local_path_groups]
+         pool_size = min(num_threads, len(local_path_groups))
+         with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
+             for _ in executor.map(self._run_cmd, sub_cmds):
+                 # exhaust the iterator returned by executor.map
+                 # if any task raises an exception, other tasks will be canceled by executor
+                 pass
+
+     def _run_cmd(self, sub_cmd):
+         cmd = f'{self.hdfs_cli} dfs {sub_cmd}'
+         logging.info(cmd)
+         output = mp.run_subprocess(cmd, return_output=True, shell=True)
+         if 'NotReplicatedYetException' in output:
+             raise IOError('Incomplete copying from /data/oneflow to /tmp/oneflow/ !')
+         return output
+
+
+ def partition_files_equally(local_paths, num_groups: int):
+     groups = _do_partition_files_equally([(f, os.stat(f).st_size) for f in local_paths], num_groups)
+     return [[x[0] for x in g] for g in groups if g]
+
+
+ def _do_partition_files_equally(filename_size_pairs, num_groups: int):
+     """Partition the files into groups whose total sizes are roughly equal.
+
+     Algorithm adapted from https://cloud.tencent.com/developer/article/1659134; the
+     description below is translated from that article.
+
+     This is a typical dynamic-programming style problem and finding the true optimum is not
+     feasible here; since the goal is to solve a real production problem rather than to pass
+     an online judge, a reasonably balanced partition is good enough.
+
+     Input: an array of ints and the number of groups divisionNum
+     1. Sort the array in descending order.
+     2. Compute the average avg of the array.
+     3. Iterate over the array.
+        * If the first number is >= avg, put it into a group by itself, because adding the next
+          number would not bring the sum any closer to avg; then recompute the average over the
+          remaining numbers so that they can be split more evenly (this avoids the influence of
+          extreme values), and start the next round.
+        * If the first number num is < avg, add it to the group, then look for one (or several)
+          numbers whose sum gets closer to delta = avg - num:
+          - while iterating, if some number k == delta, add k to the group and end this round;
+          - if a > delta > b, compare further: if (delta - b) > (a - delta), add b to the group,
+            set delta = delta - b and keep iterating; if (delta - b) < (a - delta), remember
+            distance = delta - b, add a to the group and keep iterating to see whether a choice
+            with a smaller distance can be found; if so, take the smaller one, otherwise add b
+            to the group.
+
+     :param filename_size_pairs: file paths and sizes, formatted as [(name1, size1), (name2, size2), ...]
+     :param num_groups: number of groups
+     """
+     filename_size_pairs = sorted(filename_size_pairs, key=lambda x: x[1], reverse=True)
+     total_size = sum(x[1] for x in filename_size_pairs)
+     avg = total_size / num_groups
+     groups = []
+     for idx in range(num_groups):
+         if idx == num_groups - 1:
+             # last group: put everything that remains together
+             groups.append(filename_size_pairs)
+             break
+
+         if filename_size_pairs and filename_size_pairs[0][1] >= avg:
+             sub_group = [filename_size_pairs[0]]
+             total_size -= filename_size_pairs[0][1]
+             avg = total_size / (num_groups - len(groups))
+         else:
+             sub_group, _ = __get_list(filename_size_pairs, avg, abs(avg))
+         groups.append(sub_group)
+         for item in sub_group:
+             filename_size_pairs.remove(item)
+     return groups
+
+
+ def __get_list(filename_size_pairs, delta: float, distance: float):
+     result = []
+     if not filename_size_pairs:
+         return result, -1
+
+     for idx, (filename, size) in enumerate(filename_size_pairs):
+         if delta < size:
+             continue
+         if delta == size:
+             result.append((filename, size))
+             return result, 0
+         else:
+             if idx == 0:
+                 result.append((filename, size))
+                 delta -= size
+                 distance = abs(delta)
+                 tmp, d = __get_list(filename_size_pairs[idx + 1:], delta, distance)
+                 result.extend(tmp)
+                 return result, d
+             else:
+                 dis1 = abs(filename_size_pairs[idx - 1][1] - delta)
+                 dis2 = abs(delta - size)
+                 if dis1 > dis2:
+                     result.append((filename, size))
+                     delta -= size
+                     tmp, d = __get_list(filename_size_pairs[idx + 1:], delta, dis2)
+                     result.extend(tmp)
+                     return result, d
+                 else:
+                     tmp, d = __get_list(filename_size_pairs[idx:], delta, dis2)
+                     if dis1 > d:
+                         result.extend(tmp)
+                         return result, d
+                     result.append(filename_size_pairs[idx - 1])
+                     return result, dis1
+
+     dis = abs(delta - filename_size_pairs[-1][1])
+     if dis < distance:
+         return filename_size_pairs[-1:], dis
+     return [], -1
+
+
+ if __name__ == '__main__':
+     data = [('233dafd9b1d0b03e6e784987fe748be5.5', 400275118),
+             ('233dafd9b1d0b03e6e784987fe748be5.2', 1147688439),
+             ('233dafd9b1d0b03e6e784987fe748be5.4', 1232810556),
+             ('233dafd9b1d0b03e6e784987fe748be5.3', 1318304652),
+             ('233dafd9b1d0b03e6e784987fe748be5.0', 1392554705),
+             ('233dafd9b1d0b03e6e784987fe748be5.8', 1440314997),
+             ('233dafd9b1d0b03e6e784987fe748be5.7', 1453587946),
+             ('233dafd9b1d0b03e6e784987fe748be5.6', 1470806585),
+             ('233dafd9b1d0b03e6e784987fe748be5.1', 1509157699),
+             ('233dafd9b1d0b03e6e784987fe748be5.9', 1546082238)]
+     groups = _do_partition_files_equally(data, 5)
+     for g in groups:
+         print(g, sum(x[1] for x in g))
+     # [('233dafd9b1d0b03e6e784987fe748be5.9', 1546082238), ('233dafd9b1d0b03e6e784987fe748be5.5', 400275118)] 1946357356
+     # [('233dafd9b1d0b03e6e784987fe748be5.1', 1509157699), ('233dafd9b1d0b03e6e784987fe748be5.2', 1147688439)] 2656846138
+     # [('233dafd9b1d0b03e6e784987fe748be5.6', 1470806585), ('233dafd9b1d0b03e6e784987fe748be5.4', 1232810556)] 2703617141
+     # [('233dafd9b1d0b03e6e784987fe748be5.7', 1453587946), ('233dafd9b1d0b03e6e784987fe748be5.3', 1318304652)] 2771892598
+     # [('233dafd9b1d0b03e6e784987fe748be5.8', 1440314997), ('233dafd9b1d0b03e6e784987fe748be5.0', 1392554705)] 2832869702
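For orientation only (this note and the snippet below are editorial and not part of the published diff): a minimal sketch of how the HDFS connectors and the partitioning helper above might be used. The namenode address, user name, and file paths are hypothetical, and partition_files_equally stats the local files, so they must exist.

    from recurvedata.pigeon.connector.hdfs import HDFSConnector, partition_files_equally

    # Hypothetical local CSV parts produced by an earlier dump step.
    local_parts = ['/data/export/part-0.csv', '/data/export/part-1.csv']

    # WebHDFS-backed connector; uploads the files into the target folder one by one.
    hdfs = HDFSConnector(host='namenode.example.internal', port='50070', user_name='etl')
    hdfs.make_dir('/tmp/export_stage')
    hdfs.upload_files(local_parts, '/tmp/export_stage')

    # HDFSCliConnector.upload_files splits uploads into size-balanced batches;
    # the same helper can be called directly to inspect the grouping it would use.
    for group in partition_files_equally(local_parts, num_groups=2):
        print(group)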
recurvedata/pigeon/connector/hive_impala.py
@@ -0,0 +1,383 @@
+ # flake8: noqa: E402
+
+ # pylint: disable=wrong-import-position
+
+ import os
+ import re
+ import shutil
+ from typing import List, Optional, Union
+
+ import pyhive.hive
+
+ _ = 0  # prevent PyCharm from auto-formatting the import order
+ import cytoolz as toolz
+
+ # impyla breaks TCLIService, which leads to ImportError while importing pyhive
+ # see https://github.com/cloudera/impyla/issues/277
+ import impala.dbapi
+ import sqlalchemy
+ from impala.error import HiveServer2Error
+ from pyhive.exc import OperationalError
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.connector.dbapi import DBAPIConnector, _ShowTableLikeMixin
+ from recurvedata.pigeon.connector.hdfs import HDFSCliConnector, HDFSConnector
+ from recurvedata.pigeon.const import HIVE_FILE_FORMATS
+ from recurvedata.pigeon.schema import types
+ from recurvedata.pigeon.utils import ensure_list, trim_suffix
+ from recurvedata.pigeon.utils.sql import sqlformat
+
+ _hive_type_to_canonical_type = {
+     "BOOLEAN": types.BOOLEAN,
+     "TINYINT": types.INT8,
+     "SMALLINT": types.INT16,
+     "INT": types.INT32,
+     "BIGINT": types.INT64,
+     "FLOAT": types.FLOAT32,
+     "DOUBLE": types.FLOAT64,
+     "DECIMAL": types.FLOAT64,
+     "REAL": types.FLOAT64,
+     "TIMESTAMP": types.DATETIME,
+     "DATE": types.DATE,
+     "CHAR": types.STRING,
+     "VARCHAR": types.STRING,
+     "STRING": types.STRING,
+ }
+
+ _canonical_type_to_hive_type = {
+     types.BOOLEAN: "BOOLEAN",
+     types.INT8: "TINYINT",
+     types.INT16: "SMALLINT",
+     types.INT32: "INT",
+     types.INT64: "BIGINT",
+     types.FLOAT32: "DOUBLE",
+     types.FLOAT64: "DOUBLE",
+     # treat date, datetime as string
+     types.DATE: "STRING",
+     types.DATETIME: "STRING",
+     types.STRING: "STRING",
+     types.JSON: "STRING",
+ }
+
+
+ class _HiveSQLMixin:
+     def create_partition_table_like(self, table, like_table, partitions):
+         """Create a partitioned table LIKE an existing non-partitioned table and add the partition keys."""
+         if not self.has_table(like_table):
+             raise ValueError(f"like table {like_table!r} not exists")
+         partitions = [f"`{pname}` {ptype}" for pname, ptype in partitions.items()]
+         partitions = ", ".join(partitions)
+         with self.cursor() as cursor:
+             cursor.execute(f"SELECT * FROM {like_table} LIMIT 0")
+             columns = [(x[0], x[1]) for x in cursor.description]
+             columns = ",\n".join("{} {}".format(*x) for x in columns)
+         sql = f"""
+             CREATE TABLE {table} (
+                 {columns}
+             ) PARTITIONED BY ({partitions})
+         """
+         self.execute(sqlformat(sql))
+
+     def is_table_partitioned(self, database, table):
+         with self.cursor() as cursor:
+             try:
+                 # inspect the table's partitions; if this does not raise, the table is partitioned
+                 cursor.execute(f"SHOW PARTITIONS {database}.{table}")
+                 return True
+             except (OperationalError, HiveServer2Error) as e:
+                 msg = str(e).lower()
+                 if "table not found" in msg or "table does not exist:" in msg:
+                     return False
+                 elif "is not a partitioned table" in msg or "table is not partitioned" in msg:
+                     return False
+                 else:
+                     raise e
+
+     @staticmethod
+     def to_canonical_type(type_code, size):
+         type_code = trim_suffix(type_code, "_TYPE")
+         return _hive_type_to_canonical_type.get(type_code, types.STRING)
+
+     @staticmethod
+     def from_canonical_type(canonical_type, size):
+         return _canonical_type_to_hive_type.get(canonical_type, "STRING")
+
+     def generate_create_table_ddl(self, name, schema, **kwargs):
+         cols = []
+         for f in schema:
+             col_name = self.quote_identifier(f.name)
+             if f.comment:
+                 cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)} COMMENT {f.comment!r}")
+             else:
+                 cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)}")
+
+         file_format = kwargs.get("file_format", "PARQUET")
+         col_types = ", \n".join(cols)
+         name = self.quote_identifier(name)
+         ddl = f"CREATE TABLE {name} (\n{col_types}\n) STORED AS {file_format}"
+         return ddl
+
+
+ @register_connector_class("hive")
+ class HiveConnector(_ShowTableLikeMixin, _HiveSQLMixin, DBAPIConnector):
+     _sqla_driver = "hive"
+     _log_query = False
+     _default_port = 10000
+
+     _complex_types = ("array", "map", "struct")
+
+     def connect_impl(self, autocommit=False, *args, **kwargs):
+         params = {
+             "host": self.host,
+             "port": self.port,
+             "username": self.user,
+             "database": self.database or "default",
+         }
+         if self.password:
+             params.update({"password": self.password, "auth": self.kwargs["auth"]})
+         hive_conf = self.hive_conf
+         hive_conf.update(kwargs.get("hive_conf", {}))
+         if hive_conf:
+             params["configuration"] = hive_conf
+         return pyhive.hive.connect(**params)
+
+     def create_engine(self, engine_kwargs=None, url_queries=None):
+         return sqlalchemy.create_engine("hive://", creator=self.connect)
+
+     def is_hive(self):
+         return True
+
+     @toolz.memoize
+     def create_hdfs_connector(self) -> Optional[HDFSConnector]:
+         hdfs_options = self.kwargs.get("hdfs_options")
+         if not hdfs_options:
+             return None
+         return HDFSConnector(**hdfs_options)
+
+     def has_complex_type_fields(self, table):
+         table = self.quote_identifier(table)
+         with self.cursor() as cursor:
+             cursor.execute("DESCRIBE {}".format(table))
+             for r in cursor.fetchall():
+                 if r[0] == "":
+                     break
+                 has_complex = any(x in r[1].lower() for x in self._complex_types)
+                 if has_complex:
+                     return True
+         return False
+
+     def get_columns(self, table, database=None, exclude=None):
+         if database is None:
+             database = self.database
+         with self.cursor() as cursor:
+             if not self.has_table(table, database, cursor=cursor):
+                 raise ValueError("Table {!r} not exists in {!r}".format(table, database))
+             # Hive bug https://issues.apache.org/jira/browse/HIVE-12184
+             cursor.execute("USE {}".format(self.quote_identifier(database)))
+             cursor.execute("DESCRIBE {}".format(self.quote_identifier(table)))
+             cols = []
+             for r in cursor.fetchall():
+                 # the following is partition information
+                 if r[0] == "":
+                     break
+                 cols.append(r[0])
+         if exclude:
+             cols = [x for x in cols if x not in exclude]
+         return cols
+
+     def load_local_file(self, table, filepath, overwrite=True):
+         hdfs_clients = []
+         hdfs_cli = shutil.which("hdfs")
+         if hdfs_cli:
+             hdfs = HDFSCliConnector(hdfs_cli)
+             hdfs_clients.append(hdfs)
+         webhdfs = self.create_hdfs_connector()
+         if webhdfs:
+             hdfs_clients.append(webhdfs)
+
+         exc = None
+         for hdfs in hdfs_clients:
+             self.logger.info(f"try to load file using {hdfs}")
+             try:
+                 self._load_local_file_to_hive_impl(table, filepath, hdfs, overwrite)
+                 self.logger.info("finished load files")
+             except Exception as e:
+                 exc = e
+                 self.logger.exception(f"failed to load file using {hdfs}")
+             else:
+                 exc = None
+                 break
+
+         if exc:
+             raise exc
+
+     def _load_local_file_to_hive_impl(
+         self, table: str, filepath: Union[str, List[str]], hdfs: HDFSConnector, overwrite=True
+     ):
+         staging_folder = self.kwargs.get("hdfs_options", {}).get("staging_folder", "/tmp")
+         hdfs_folder = os.path.join(staging_folder, f"{self.database}_{table}_")
+         hdfs.delete_file(hdfs_folder, recursive=True)
+         hdfs.make_dir(hdfs_folder)
+         hdfs.upload_files(ensure_list(filepath), hdfs_folder)
+         query = f"LOAD DATA INPATH '{hdfs_folder}/*' {'OVERWRITE' if overwrite else ''} INTO TABLE {table}"
+         self.execute(query)
+         hdfs.delete_file(hdfs_folder, recursive=True)
+
+     def generate_ddl(self, table, database=None, if_exists=True, file_format="text"):
+         file_format = file_format.lower()
+         if file_format not in HIVE_FILE_FORMATS:
+             raise ValueError(f"Format {file_format!r} is not supported")
+         if database is None:
+             database = self.database
+         if not self.has_table(table, database):
+             raise ValueError(f"Table {table!r} not exists in {database!r}")
+
+         with self.cursor() as cursor:
+             cursor.execute(f"USE {self.quote_identifier(database)}")
+             cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
+             result = cursor.fetchall()
+
+         body = ""
+         for r in result[1:]:
+             if "ROW FORMAT" in r[0]:
+                 break
+             body += r[0]
+         if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
+         file_format_stmt = f" STORED AS {HIVE_FILE_FORMATS[file_format]}"
+         return f"CREATE TABLE{if_exists_stmt}{self.quote_identifier(table)} ({body}{file_format_stmt}"
+
+     def _add_leading_comment_impl(self, query, comment):
+         comment = self._safe_comment(comment)
+         return "-- {}\n{}".format(comment, query.strip("\n"))
+
+     @property
+     def hive_conf(self):
+         """
+         Parameters applied to Hive queries; essentially the same as running ``set xxx=xxx``
+         in Hive (numbers must be passed in string form). A dict, for example {
+             'spark.yarn.queue': 'etl',
+             'spark.app.name': 'pigeon',
+             'spark.executor.instances': '3'
+         }
+         Note that numeric values in the dict must be written as strings.
+         """
+         if "hive_conf" in self.kwargs:
+             # hive_conf is a flat key/value mapping, so a shallow copy is enough (no deepcopy needed)
+             return self.kwargs["hive_conf"].copy()
+         return {}
+
+     def generate_load_staging_table_ddl(self, staging_table, table, database=None, exclude_columns=None):
+         if database is None:
+             database = self.database
+         if exclude_columns:
+             exclude_columns = [col.lower().replace("`", "") for col in exclude_columns]
+
+         with self.cursor() as cursor:
+             cursor.execute(f"USE {self.quote_identifier(database)}")
+             cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
+             result = cursor.fetchall()
+
+         body = pre_row = ""
+         for r in result[1:]:
+             row = r[0].lower().strip()
+             if row.startswith("partitioned by ("):
+                 continue
+             if row.startswith("comment"):
+                 continue
+             if exclude_columns:
+                 col_name = row.split(" ")[0].strip("`")
+                 if col_name in exclude_columns:
+                     continue
+             if row.endswith(")"):
+                 row = ",".join(row.rsplit(")", 1))
+             if row.startswith("row format"):
+                 pre_row = ")".join(pre_row.rsplit(",", 1))
+                 body += pre_row
+                 break
+             body += pre_row
+             pre_row = row
+
+         return f"CREATE TABLE {self.quote_identifier(staging_table)} ({body}"
+
+
+ @register_connector_class("impala")
+ class ImpalaConnector(_ShowTableLikeMixin, _HiveSQLMixin, DBAPIConnector):
+     _sqla_driver = "impala"
+     _default_port = 21050
+
+     def connect_impl(self, autocommit=False, *args, **kwargs):
+         params = {
+             "host": self.host,
+             "port": self.port,
+             "database": self.database or "default",
+             "user": self.user,
+             "password": self.password,
+         }
+         if "auth_mechanism" in self.kwargs:
+             params["auth_mechanism"] = self.kwargs["auth_mechanism"]
+         return impala.dbapi.connect(**params)
+
+     def create_engine(self, engine_kwargs=None, url_queries=None):
+         return sqlalchemy.create_engine("impala://", creator=self.connect)
+
+     def is_impala(self):
+         return True
+
+     def get_columns(self, table, database=None, exclude=None):
+         if database is None:
+             database = self.database
+         with self.cursor() as cursor:
+             if not self.has_table(table, database, cursor=cursor):
+                 raise ValueError("Table {!r} not exists in {!r}".format(table, database))
+             cursor.execute("DESCRIBE {}.{}".format(self.quote_identifier(database), self.quote_identifier(table)))
+             cols = [x[0] for x in cursor.fetchall()]
+         if exclude:
+             cols = [x for x in cols if x not in exclude]
+         return cols
+
+     def invalidate_metadata(self, table=None):
+         if table:
+             table = self.quote_identifier(table)
+         else:
+             table = ""
+         query = f"INVALIDATE METADATA {table}"
+         self.execute(query)
+
+     def refresh(self, table, compute_stats=True):
+         table = self.quote_identifier(table)
+         queries = "REFRESH {}".format(table)
+         try:
+             self.execute(queries)
+         except Exception as e:
+             self.logger.error(f"failed to refresh, err: {e}, use INVALIDATE")
+             queries = "INVALIDATE METADATA {}".format(table)
+             self.execute(queries)
+         if compute_stats:
+             queries = "COMPUTE INCREMENTAL STATS {}".format(table)
+             self.execute(queries)
+
+     def generate_ddl(self, table, database=None, if_exists=True, file_format="text"):
+         file_format = file_format.lower()
+         # ORC is not supported in Impala
+         # https://www.cloudera.com/documentation/enterprise/5-12-x/topics/impala_file_formats.html
+         if file_format == "orc" or file_format not in HIVE_FILE_FORMATS:
+             raise ValueError(f"Format {file_format!r} is not supported")
+         if database is None:
+             database = self.database
+         if not self.has_table(table, database):
+             raise ValueError(f"Table {table!r} not exists in {database!r}")
+
+         with self.cursor() as cursor:
+             cursor.execute(f"USE {self.quote_identifier(database)}")
+             cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
+             body = re.search(r"\.(.*)\nSTORED", cursor.fetchall()[0][0], flags=re.S).group(1)
+         if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
+         file_format_stmt = f" STORED AS {HIVE_FILE_FORMATS[file_format]}"
+         return f"CREATE TABLE{if_exists_stmt}{body}{file_format_stmt}"
+
+     @toolz.memoize
+     def create_hdfs_connector(self):
+         hdfs_options = self.kwargs.get("hdfs_options")
+         if not hdfs_options:
+             return None
+         return HDFSConnector(**hdfs_options)
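Again for orientation only, not part of the diff: a sketch of loading a local file into Hive with the HiveConnector above. The constructor arguments are assumptions, since they are handled by the DBAPIConnector base class, which is not shown in this hunk; the host, table, and file names are hypothetical.

    from recurvedata.pigeon.connector.hive_impala import HiveConnector

    # Assumed DBAPIConnector-style constructor (host/port/user/database plus extra kwargs);
    # the exact signature lives in recurvedata/pigeon/connector/dbapi.py, not in this file.
    hive = HiveConnector(host='hive.example.internal', port=10000, user='etl',
                         database='staging', hive_conf={'spark.yarn.queue': 'etl'})

    print(hive.get_columns('events'))                      # column names, partition columns excluded
    print(hive.generate_ddl('events', file_format='parquet'))

    # Stages the file to HDFS (hdfs CLI first, WebHDFS as a fallback) and runs LOAD DATA INPATH.
    hive.load_local_file('events', '/data/export/events.csv', overwrite=True)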
recurvedata/pigeon/connector/microsoft_fabric.py
@@ -0,0 +1,95 @@
+ from collections import OrderedDict
+ from typing import Any, Optional
+
+ import pyodbc
+
+ from recurvedata.pigeon.connector._registry import register_connector_class
+ from recurvedata.pigeon.connector.mssql import BaseAzureSQLConnector
+ from recurvedata.pigeon.schema import types
+ from recurvedata.pigeon.utils import safe_int
+
+
+ @register_connector_class("microsoft_fabric")  # type: ignore
+ class MsFabricConnector(BaseAzureSQLConnector):
+     """Connector for Microsoft Fabric.
+
+     This connector extends BaseAzureSQLConnector to support Microsoft Fabric-specific features:
+     - Azure AD authentication
+     - Workspace-level access control
+     - Special ODBC driver configuration
+     """
+
+     def __init__(
+         self,
+         host: str | None = None,
+         port: int | None = None,
+         database: str | None = None,
+         schema: str | None = None,
+         tenant_id: str | None = None,
+         client_id: str | None = None,
+         client_secret: str | None = None,
+         authentication: str = "ServicePrincipal",
+         odbc_driver: str = "ODBC Driver 18 for SQL Server",
+         encrypt: bool = True,
+         trust_server_certificate: bool = False,
+         *args: Any,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(host, port, database, schema=schema, *args, **kwargs)
+         self.tenant_id = tenant_id
+         self.client_id = client_id
+         self.client_secret = client_secret
+         self.authentication = authentication
+         self.odbc_driver = odbc_driver
+         self.driver = "mssql+pyodbc"
+         self.encrypt = encrypt
+         self.trust_server_certificate = trust_server_certificate
+
+     def _get_sqlalchemy_uri(self) -> str:
+         """Generate SQLAlchemy URI for Microsoft Fabric."""
+         return (
+             f"{self.driver}://{self.client_id}:{self.client_secret}@{self.host}:{self.port}/"
+             f"{self.database}?driver={self.odbc_driver}&encrypt={self.encrypt}&trust_server_certificate={self.trust_server_certificate}"
+         )
+
+     def is_fabric(self) -> bool:
+         """Check if this is a Microsoft Fabric connector."""
+         return True
+
+     @staticmethod
+     def to_canonical_type(type_code: Any, size: Optional[int] = None) -> str:
+         """Convert Microsoft Fabric type to canonical type."""
+         return BaseAzureSQLConnector.to_canonical_type(type_code, size)
+
+     @staticmethod
+     def from_canonical_type(canonical_type: str, size: Optional[int] = None) -> str:
+         """Convert canonical type to Microsoft Fabric type."""
+         if canonical_type == types.STRING:
+             if size is None or size == 0:
+                 return "VARCHAR(max)"
+             safe_size = safe_int(size * 4)
+             if safe_size > 4000:
+                 return "VARCHAR(max)"
+             return f"VARCHAR({safe_size})"
+         return BaseAzureSQLConnector.from_canonical_type(canonical_type, size)
+
+     @property
+     def conn_string(self) -> str:
+         """Generate connection string for Microsoft Fabric with Azure AD authentication."""
+         options = OrderedDict(
+             {
+                 "Driver": f"{{{self.odbc_driver}}}",
+                 "Server": f"{self.host}",
+                 "Database": str(self.database),
+                 "Authentication": "ActiveDirectoryServicePrincipal",
+                 "Encrypt": "yes" if self.encrypt else "no",
+                 "TrustServerCertificate": "yes" if self.trust_server_certificate else "no",
+                 "Uid": self.client_id,
+                 "Pwd": self.client_secret,
+                 "Connection Timeout": 30,
+             }
+         )
+         return ";".join([f"{k}={v}" for k, v in options.items()])
+
+     def connect_impl(self, autocommit=False, *args, **kwargs):
+         return pyodbc.connect(self.conn_string, autocommit=autocommit)
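A final illustrative sketch, also not part of the diff: constructing the MsFabricConnector above with service-principal credentials. All endpoint and credential values are placeholders, and connect_impl additionally requires pyodbc and the ODBC driver named by odbc_driver to be installed.

    from recurvedata.pigeon.connector.microsoft_fabric import MsFabricConnector

    fabric = MsFabricConnector(
        host='myworkspace.datawarehouse.fabric.microsoft.com',  # placeholder SQL endpoint
        port=1433,
        database='analytics_wh',
        tenant_id='<tenant-id>',
        client_id='<client-id>',
        client_secret='<client-secret>',
    )

    # ODBC connection string assembled from the options above (Azure AD service principal auth).
    print(fabric.conn_string)

    # Opens the pyodbc connection used by the rest of the connector.
    conn = fabric.connect_impl()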