recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,51 @@
1
+ import datetime
2
+ from typing import TYPE_CHECKING
3
+
4
+ import dateutil.parser
5
+
6
+ if TYPE_CHECKING:
7
+ import pandas as pd
8
+
9
+
10
def parse_to_date(s: str) -> datetime.date:
    """Convert ``s`` to a ``datetime.date``.

    :param s: a date/time string understood by ``dateutil.parser.parse``, or an
        already-parsed ``datetime.datetime`` / ``datetime.date`` object.
        ``pandas.Timestamp`` is a ``datetime.datetime`` subclass and is therefore
        accepted as well.
    :return: the calendar date of ``s``.
    :raises: whatever ``dateutil.parser.parse`` raises for an unparsable string.
    """
    # pandas is only imported under TYPE_CHECKING in this module, so ``pd`` must not
    # be referenced at runtime; the stdlib base class covers pd.Timestamp too.
    if isinstance(s, datetime.datetime):
        return s.date()
    # datetime.datetime is handled above, so this branch is a plain date.
    if isinstance(s, datetime.date):
        return s
    return dateutil.parser.parse(s).date()
14
+
15
+
16
def infer_schema_from_dataframe(df: "pd.DataFrame"):
    """Build a pigeon ``Schema`` with one field per column of ``df``.

    Each column's numpy scalar type (``df.dtypes[col].type``) is looked up in a
    fixed numpy-to-canonical-type table; any dtype missing from the table is
    declared as ``types.STRING``.

    :param df: the DataFrame whose columns should be described.
    :return: a ``Schema`` with the fields added in column order.
    """
    import numpy as np

    from recurvedata.pigeon.schema import Schema, types

    # ``np.bool`` is deliberately not listed: the alias does not exist in
    # NumPy 1.24-1.26 (AttributeError), and boolean columns report ``np.bool_``.
    mapping = {
        np.bool_: types.BOOLEAN,
        np.int8: types.INT8,
        np.int16: types.INT16,
        np.int32: types.INT32,
        np.int64: types.INT64,
        np.float16: types.FLOAT32,  # half precision is widened to FLOAT32
        np.float32: types.FLOAT32,
        np.float64: types.FLOAT64,
        np.datetime64: types.DATETIME,
        np.object_: types.STRING,
        np.str_: types.STRING,
    }

    schema = Schema()
    for col in df.columns:
        canonical_type = mapping.get(df.dtypes[col].type, types.STRING)
        schema.add_field_by_attrs(col, canonical_type)
    return schema
41
+
42
+
43
def once(func):
    """Decorate ``func`` so that it is executed at most once.

    The first call runs ``func`` and stores its return value; every later call
    returns that stored value without running ``func`` again, whatever arguments
    are passed. If the first call raises, nothing is stored and the next call
    tries again.

    The wrapper exposes ``called`` (bool) and, after the first successful call,
    ``result``.
    """
    import functools  # local import so the module's import block stays untouched

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        if not wrapper.called:
            wrapper.result = func(*args, **kwargs)
            wrapper.called = True
        return wrapper.result

    wrapper.called = False
    return wrapper
@@ -0,0 +1,150 @@
1
+ import collections
2
+ import csv
3
+ import json
4
+ import os
5
+ import shutil
6
+ import time
7
+
8
+ from recurvedata.pigeon.utils import fs
9
+
10
# Default CSV dialect produced by the converters in this module:
# comma separated, every field quoted, CRLF line endings.
_csv_dialect_options = dict(
    delimiter=",",
    quoting=csv.QUOTE_ALL,
    lineterminator="\r\n",
)
15
+
16
+
17
def gzip_decompress(src_file, dst_file=None, inplace=True):
    """Decompress the gzip file ``src_file`` via ``fs.gzip_decompress``.

    :param src_file: path of the gzip-compressed file.
    :param dst_file: where to write the decompressed data; when falsy, a path
        obtained from ``fs.new_tempfile`` in the directory of ``src_file`` is used.
    :param inplace: when true, the decompressed file is renamed over ``src_file``.
    :return: ``src_file`` when ``inplace`` is true, otherwise the output path.
    """
    target = dst_file or fs.new_tempfile(dir=os.path.dirname(src_file))
    fs.gzip_decompress(src_file, target)

    if not inplace:
        return target

    # Replace the compressed original with the decompressed content.
    os.rename(target, src_file)
    return src_file
26
+
27
+
28
def zip_decompress(src_file, dst_file=None, inplace=True):
    """Extract the zip archive ``src_file`` into a directory via ``fs.zip_decompress``.

    :param src_file: path of the zip archive.
    :param dst_file: extraction directory; when falsy, a directory named
        ``tmp_zip_<basename>_<unix-seconds>`` is created next to ``src_file``.
    :param inplace: when true, the archive is deleted after extraction. If the
        extraction directory then holds exactly one entry, that entry is moved to
        the archive's path and the directory is removed.
    :return: ``src_file`` when a single entry replaced the archive; otherwise the
        extraction directory.
    """
    if not dst_file:
        parent_dir = os.path.dirname(src_file)
        archive_name = os.path.basename(src_file)
        dst_file = os.path.join(parent_dir, f"tmp_zip_{archive_name}_{int(time.time())}")
        os.makedirs(dst_file, exist_ok=True)

    # A caller-supplied directory may not exist yet.
    if not os.path.exists(dst_file):
        os.makedirs(dst_file, exist_ok=True)

    fs.zip_decompress(src_file, dst_file)

    if not inplace:
        return dst_file

    entries = os.listdir(dst_file)
    # The archive is removed in either in-place outcome.
    os.remove(src_file)

    if len(entries) != 1:
        # Zero or several entries: leave them in the directory and hand that back.
        return dst_file

    # Exactly one entry: it takes over the archive's path.
    os.rename(os.path.join(dst_file, entries[0]), src_file)
    os.rmdir(dst_file)  # directory is empty now
    return src_file
58
+
59
+
60
def convert_excel_to_csv(src_file, dst_file=None, skiprows=0, inplace=True, lineterminator="\r\n"):
    """Convert an Excel workbook to CSV with pandas.

    The sheet is loaded with ``pd.read_excel`` and written back with
    ``DataFrame.to_csv(header=False, index=False)``, so neither the column header
    nor the index appears in the output.

    :param src_file: path of the Excel file.
    :param dst_file: output path; when falsy, a path obtained from
        ``fs.new_tempfile`` in the directory of ``src_file`` is used.
    :param skiprows: forwarded to ``pd.read_excel``.
    :param inplace: when true, the CSV is renamed over ``src_file``.
    :param lineterminator: forwarded to ``DataFrame.to_csv``.
    :return: ``src_file`` when ``inplace`` is true, otherwise the output path.
    """
    import pandas as pd

    output_path = dst_file if dst_file else fs.new_tempfile(dir=os.path.dirname(src_file))

    frame = pd.read_excel(src_file, skiprows=skiprows)
    frame.to_csv(output_path, lineterminator=lineterminator, header=False, index=False)

    if not inplace:
        return output_path

    os.rename(output_path, src_file)
    return src_file
72
+
73
+
74
def convert_jsonlines_to_csv(src_file, dst_file=None, skiprows=0, src_encoding="utf8", inplace=True):
    """Convert a JSON Lines file to CSV; every line of the input is one JSON object.

    The CSV columns are the keys of the first JSON object, in their original
    order. No header row is written. The output uses the module's default CSV
    dialect and is encoded as utf8.

    :param src_file: path of the JSON Lines file.
    :param dst_file: output path; when falsy, a path obtained from
        ``fs.new_tempfile`` in the directory of ``src_file`` is used.
    :param skiprows: number of leading lines to discard before parsing.
    :param src_encoding: text encoding of ``src_file``.
    :param inplace: when true, the CSV is renamed over ``src_file``.
    :return: ``src_file`` when ``inplace`` is true, otherwise the output path.
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON.
    :raises ValueError: from ``csv.DictWriter`` if a later object has keys the
        first object did not have.
    """
    if not dst_file:
        dst_file = fs.new_tempfile(dir=os.path.dirname(src_file))

    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    # newline="" is what the csv module requires for its file objects; the explicit
    # encoding keeps non-ASCII values independent of the process locale.
    with open(src_file, "r", encoding=src_encoding) as f_in, open(
        dst_file, "w", encoding="utf8", newline=""
    ) as f_out:
        _skip_header_rows(f_in, skiprows)

        writer = None
        for line in f_in:
            # An empty file or blank (e.g. trailing) lines yield no rows instead of
            # a JSONDecodeError.
            if not line.strip():
                continue
            row = decoder.decode(line)
            if writer is None:
                # The first object fixes the column set and order.
                writer = csv.DictWriter(f_out, fieldnames=list(row.keys()), **_csv_dialect_options)
            writer.writerow(row)

    if inplace:
        os.rename(dst_file, src_file)
        return src_file
    return dst_file
95
+
96
+
97
def convert_encoding(filename, src_encoding, dst_encoding="utf8", skiprows=0, inplace=True):
    """Re-encode the text file ``filename`` from ``src_encoding`` to ``dst_encoding``.

    When the two encoding names are equal (compared as plain strings) the file is
    returned untouched, which also means ``skiprows`` is not applied in that case.

    :param filename: path of the text file.
    :param src_encoding: encoding used to read the file.
    :param dst_encoding: encoding used to write the copy.
    :param skiprows: number of leading lines dropped from the copy.
    :param inplace: when true, the copy is renamed over ``filename``.
    :return: ``filename`` when nothing was done or ``inplace`` is true, otherwise
        the path of the re-encoded copy.
    """
    if src_encoding == dst_encoding:
        return filename

    converted = fs.new_tempfile(dir=os.path.dirname(filename))
    with open(filename, "r", encoding=src_encoding) as reader:
        with open(converted, "w", encoding=dst_encoding) as writer:
            _skip_header_rows(reader, skiprows)
            shutil.copyfileobj(reader, writer)

    if not inplace:
        return converted

    os.rename(converted, filename)
    return filename
111
+
112
+
113
def convert_csv_dialect(
    filename, src_dialect_options, dst_dialect_options=None, skiprows=0, src_encoding="utf8", inplace=True
):
    """Rewrite the CSV file ``filename`` from one csv dialect to another.

    If the source and destination dialect options are identical, no CSV parsing
    happens: the file is only re-encoded to utf8 when ``src_encoding`` is not
    ``"utf8"`` (always in place), otherwise it is returned untouched. In that
    untouched case ``skiprows`` is not applied.

    :param filename: path of the CSV file.
    :param src_dialect_options: keyword arguments for ``csv.reader``.
    :param dst_dialect_options: keyword arguments for ``csv.writer``; defaults to
        a copy of the module's default dialect.
    :param skiprows: number of leading lines dropped before parsing.
    :param src_encoding: text encoding of ``filename``.
    :param inplace: when true, the rewritten file is renamed over ``filename``.
    :return: ``filename`` when nothing was rewritten or ``inplace`` is true,
        otherwise the path of the rewritten copy.
    """
    if dst_dialect_options is None:
        dst_dialect_options = _csv_dialect_options.copy()

    if _same_dict(src_dialect_options, dst_dialect_options):
        if src_encoding != "utf8":
            convert_encoding(filename, src_encoding=src_encoding, skiprows=skiprows, inplace=True)
        return filename

    dst_file = fs.new_tempfile(dir=os.path.dirname(filename))
    # The csv module requires newline="" on both sides so that line breaks embedded
    # in quoted fields survive and the writer's lineterminator is not translated.
    # The output is utf8, matching what the same-dialect branch above produces.
    with open(filename, "r", encoding=src_encoding, newline="") as f_in, open(
        dst_file, "w", encoding="utf8", newline=""
    ) as f_out:
        _skip_header_rows(f_in, skiprows)

        reader = csv.reader(f_in, **src_dialect_options)
        writer = csv.writer(f_out, **dst_dialect_options)
        writer.writerows(reader)

    if inplace:
        os.rename(dst_file, filename)
        return filename
    return dst_file
137
+
138
+
139
def _skip_header_rows(f, n=0):
    """Advance the open text file ``f`` past its next ``n`` lines.

    Lines are consumed with ``f.readline()``; reaching end of file early is not
    an error. Nothing is returned.
    """
    for _discarded in range(n):
        f.readline()
142
+
143
+
144
def _same_dict(a: dict, b: dict):
    """Return True when ``a`` and ``b`` hold the same keys with equal values."""
    if len(a) != len(b):
        return False
    # Same size, so it is enough to check every key of ``a`` against ``b``.
    return all(key in b and not (a[key] != b[key]) for key in a)
@@ -0,0 +1,10 @@
1
+ import os.path
2
+
3
+
4
def get_exist_path(candidate_paths: list[str]) -> "str | None":
    """Return the first candidate path that exists on disk.

    Falsy candidates (``None``, empty string) are skipped, and ``~`` is expanded
    before the existence check.

    :param candidate_paths: paths to probe, in priority order.
    :return: the expanded form of the first existing path, or ``None`` when no
        candidate exists.
    """
    for path in candidate_paths:
        if not path:
            continue
        path = os.path.expanduser(path)
        if os.path.exists(path):
            return path
    # Explicit so the "nothing found" outcome is visible to callers.
    return None
@@ -0,0 +1,265 @@
1
+ import datetime
2
+ import logging
3
+ import re
4
+ from collections import namedtuple
5
+ from typing import Union
6
+
7
+ try:
8
+ import sqlparse
9
+ from sql_metadata.keywords_lists import QueryType, TokenType
10
+ from sql_metadata.parser import Parser
11
+ from sql_metadata.utils import UniqueList
12
+ except ImportError:
13
+ Parser = object
14
# Module-level logger.
logger = logging.getLogger(__name__)
# A table reference: the datasource name, the database, and the table name.
Table = namedtuple("Table", "data_source database table")
# Written under the "version" key of every serialized lineage record.
VERSION = 1
17
+
18
+
19
class LineageParser(Parser):
    """
    sql-metadata ``Parser`` subclass that extracts table-level lineage from one statement.

    Issues found in sql-metadata 2.5.1, all handled here:
    1. ``_preprocess_query`` has to be overridden, otherwise many ``"`` in hive / impala
       SQL get replaced by backticks, which easily breaks later parsing
    2. quite a few statements are not supported; they are listed in NOT_SUPPORT_PREFIXES
    3. ``with xxx insert into`` is mistaken for a select; handled in ``query_type``
    4. for ``insert overwrite table`` the table being written is not recognised
    5. the ``dt`` in ``insert into xxx partition (dt)`` is recognised as a table
    6. the ``dt`` in ``create table xxx(xxx) partitioned by (dt string)`` is recognised as a table
    7. some comments seem to cause parse errors (to be confirmed); comments are stripped
       before parsing
    """

    # Statements starting with one of these prefixes (compared upper-cased) are skipped.
    NOT_SUPPORT_PREFIXES = (
        "SET",
        "COMPUTE",
        "REFRESH",
        "DROP STATS",
        "DROP INCREMENTAL STATS",
        "INVALIDATE METADATA",
        "SHOW TABLE",
        "DESCRIBE ",
        "TRUNCATE ",
        "MSCK REPAIR TABLE ",
        "USE ",
        "CREATE DATABASE",
        "CREATE EXTERNAL TABLE",
        "CREATE VIEW",
        "DROP VIEW",  # todo: decide whether views should be parsed
        "DROP FUNCTION",
        "CREATE FUNCTION",
        "SHOW FUNCTIONS",
        "COMMENT ON",
        "GRANT ",
        "IF NOT EXISTS",
        "UNLOAD",
        "VACUUM",  # redshift
    )

    # Normalized token values that must never be reported as table names.
    NOT_TABLE_KEYS = ("PARTITION", "TABLE", "WHERE")

    def __init__(self, sql: str, default_db: str, ds_name: str, ds_type: str) -> None:
        """
        :param sql: a single SQL statement
        :param default_db: database assigned to table names that carry no ``db.`` prefix
        :param ds_name: datasource name stored on every ``Table`` produced
        :param ds_type: datasource type; stored as ``dialect`` but not used yet
        """
        super().__init__(sql)
        self.default_db = default_db
        self.ds_name = ds_name
        self.dialect = ds_type  # todo: current not used

    def _preprocess_query(self):
        """
        sql-metadata special-cases ``"``, which makes later parsing fail.
        The preprocessing is overridden here for now; different dialects may need
        separate handling later, e.g. hive/impala do not need ``"`` replaced by a backtick.

        The only rewrite applied is ``as(`` -> ``AS (`` (case-insensitive).

        :return: the query text handed to the tokenizer
        """
        query = self._raw_query
        query = re.sub(r"as\(", "AS (", query, flags=re.I)
        return query

    def __repr__(self):
        return f"parser: query_type {self.query_type};tables {self.tables}"

    @classmethod
    def not_supported_query(cls, ds_type, query):
        """Return True when ``query`` starts with one of NOT_SUPPORT_PREFIXES.

        ``ds_type`` is accepted for interface compatibility and is not consulted.
        """
        # str.startswith accepts a tuple of prefixes.
        return query.strip().upper().startswith(cls.NOT_SUPPORT_PREFIXES)

    @property
    def query_type(self) -> "QueryType":
        """Query type as detected by sql-metadata, corrected to INSERT when a SELECT
        turns out to contain an insert target."""
        if self._query_type:
            return self._query_type
        query_type = super().query_type
        if query_type == QueryType.SELECT:  # `with xxx insert into` is mistaken for a select
            insert_table = self.get_insert_table_name()
            if insert_table:
                self._query_type = query_type = QueryType.INSERT
        return query_type

    @property
    def tables(self):
        """
        Lower-cased table names referenced by the statement, excluding WITH names.

        1. keeps ``partition (dt)`` from being mistaken for a table
        2. keeps the ``table`` keyword of ``insert into table`` from being mistaken for a table
        """
        if self._tables is not None:
            return self._tables

        tables = UniqueList()
        with_names = self.with_names

        for token in self._not_parsed_tokens:
            if not token.is_potential_table_name:
                continue
            if (
                token.is_alias_of_table_or_alias_of_subquery
                or token.is_with_statement_nested_in_subquery
                or token.is_constraint_definition_inside_create_table_clause(query_type=self.query_type)
                or token.is_columns_alias_of_with_query_or_column_in_insert_query(with_names=with_names)
            ):
                continue

            if token.normalized in self.NOT_TABLE_KEYS:
                continue

            # keep the dt in `insert into xxx partition (dt)` from being recognised as a table
            # keep the dt in `create table xxx(xxx) partitioned by (dt string)` from being
            # recognised as a table
            left_parenthesis = token.find_nearest_token(
                value=True, value_attribute="is_left_parenthesis", direction="left"
            )
            right_parenthesis = token.find_nearest_token(
                value=True, value_attribute="is_right_parenthesis", direction="left"
            )
            # True when the nearest parenthesis to the left is an opening one, i.e. the
            # token sits inside a still-open "(...)".
            if (left_parenthesis and right_parenthesis and left_parenthesis.position > right_parenthesis.position) or (
                left_parenthesis and not right_parenthesis
            ):
                if left_parenthesis.previous_token and left_parenthesis.previous_token.normalized in (
                    "PARTITION",
                    "BY",
                ):
                    continue

            table_name = str(token.value.strip("`"))
            token.token_type = TokenType.TABLE
            # lower() added so that with_names and tables cannot differ only by case
            tables.append(table_name.lower())

        self._tables = []
        for table in tables - UniqueList([name.lower() for name in with_names]):
            self._tables.append(table)
        return self._tables

    def get_insert_table_name(self):
        """Return the lower-cased target of ``insert into`` / ``insert overwrite``,
        or None when the statement has no such target."""
        sql = self._query.lower()
        if "insert into" not in sql and "insert overwrite" not in sql:
            return

        # Walk the INSERT tokens left to right (at most 99 of them) until one is
        # followed by INTO or OVERWRITE.
        insert_token = None
        for _ in range(99):
            if insert_token is None:
                if self.tokens[0].normalized == "INSERT":
                    insert_token = self.tokens[0]
                else:
                    insert_token = self.tokens[0].find_nearest_token(
                        "INSERT", value_attribute="normalized", direction="right"
                    )
            else:
                insert_token = insert_token.find_nearest_token(
                    "INSERT", value_attribute="normalized", direction="right"
                )
            if insert_token.position < 0:
                # a negative position is treated as "no further INSERT token"
                return
            if insert_token.next_token.normalized in ("INTO", "OVERWRITE"):
                break
        else:
            return

        table_token = insert_token.next_token.next_token
        if table_token.normalized == "TABLE":
            table_token = table_token.next_token
        # Strip surrounding backticks exactly like ``tables`` does, otherwise a quoted
        # target never matches its entry in ``self.tables``.
        insert_table_name = table_token.value.strip("`").lower()
        if insert_table_name not in self.tables:
            logger.warning(
                f"get_insert_table_name error: table_token {insert_table_name} _tables {self._tables}, please check"
            )
        return insert_table_name

    def get_create_table_name(self):
        """Return the first table of a CREATE statement, or None for other statements."""
        if self.query_type != QueryType.CREATE:
            return
        return (self.tables and self.tables[0]) or None

    def get_lineage(self):
        """Build the ``LineageResult`` of this statement.

        :return: for DROP, a result with no upstream tables and the dropped table as
            downstream; for CREATE / INSERT, the remaining referenced tables as upstream
            and the written table as downstream. None for ALTER / DELETE, for statements
            without a detectable write target (e.g. a plain SELECT), and when no upstream
            table is left.
        """
        if self.query_type == QueryType.DROP:
            if not self.tables:
                # nothing recognisable was dropped
                return None
            return LineageResult([], self._format_table(self.tables[0]), self.query_type, self._raw_query)
        if self.query_type in (QueryType.ALTER, QueryType.DELETE):
            return None
        tables = self.tables[:]
        downstream_table = (
            self.get_create_table_name() or self.get_insert_table_name()
        )  # todo: update/upsert table not supported
        if not downstream_table:
            # Without a write target there is no lineage edge to report.
            return None
        if downstream_table in tables:
            tables.remove(downstream_table)
        if not tables:
            return None
        return LineageResult(
            self._format_table(tables), self._format_table(downstream_table), self.query_type, self._raw_query
        )

    def _split_table_name(self, name: str):
        """Split ``name`` at its last dot into ``(database, table)``; names without a
        dot get ``self.default_db``."""
        # rpartition tolerates names with more than one dot.  todo: redshift
        db, dot, table = name.rpartition(".")
        if not dot:
            return self.default_db, name
        return db, table

    def _format_table(self, table_or_tables: Union[list[str], str]):
        """Turn one table name, or a list of them, into ``Table`` tuple(s) for this
        datasource."""
        if isinstance(table_or_tables, list):
            res_lst = []
            for name in table_or_tables:
                db, table = self._split_table_name(name)
                res_lst.append(Table(data_source=self.ds_name, database=db, table=table))
            return res_lst
        db, table = self._split_table_name(table_or_tables)
        return Table(data_source=self.ds_name, database=db, table=table)
224
+
225
+
226
class LineageResult:
    """Lineage of one SQL statement: the tables it reads and the table it writes."""

    def __init__(self, upstream_tables: list[Table], downstream_table: Table, query_type: "QueryType", sql: str):
        """
        :param upstream_tables: tables the statement reads from
        :param downstream_table: table the statement writes to
        :param query_type: type of the statement; its ``.value`` is what gets serialized
        :param sql: the statement text
        """
        self.upstream_tables = upstream_tables
        self.downstream_table = downstream_table
        self.query_type = query_type
        self.sql = sql

    def to_dict(self):
        """Return the result as a dict, stamped with VERSION and the current local time."""
        payload = {
            "upstream": [dict(source._asdict()) for source in self.upstream_tables],
            "downstream": dict(self.downstream_table._asdict()),
        }
        payload["query_type"] = self.query_type.value
        payload["sql"] = self.sql
        payload["version"] = VERSION
        payload["created_at"] = datetime.datetime.now()
        return payload
242
+
243
+
244
def parse_lineage(sql, default_db, recurve_ds_name, recurve_ds_type):
    """Extract table lineage from a SQL script.

    Comments are stripped, the script is split into statements with ``sqlparse``,
    and each statement is parsed on its own.

    :param sql: SQL text, possibly holding several ``;``-separated statements
    :param default_db: database assumed for table names without a ``db.`` prefix
    :param recurve_ds_name: datasource name recorded on every table
    :param recurve_ds_type: datasource type, passed through to ``LineageParser``
    :return: list of lineage dicts (see ``LineageResult.to_dict``), one per statement
        that produced lineage; statements that are empty, unsupported or yield no
        lineage are skipped
    """
    lineage_lst = []
    remove_comment_sql = sqlparse.format(sql, strip_comments=True)
    for statement in sqlparse.split(remove_comment_sql):
        statement = statement.strip(";\n\r\t ")
        if not statement:
            continue
        if LineageParser.not_supported_query(recurve_ds_type, statement):
            # The statement kind is what is unsupported here, not the datasource type.
            logger.debug(
                "lineage does not support this kind of statement (ds_type %s), skipped", recurve_ds_type
            )
            continue
        parser = LineageParser(statement, default_db, recurve_ds_name, recurve_ds_type)
        lineage_result = parser.get_lineage()
        if not lineage_result:
            continue
        lineage_lst.append(lineage_result.to_dict())

    return lineage_lst
262
+
263
+
264
def supported_recurve_ds_type(ds_type):
    """Return True when ``ds_type`` is ``"hive"`` or ``"impala"``."""
    for supported in ("hive", "impala"):
        if ds_type is supported or ds_type == supported:
            return True
    return False
@@ -0,0 +1,15 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger(__name__)
4
+
5
+
6
+ # todo: move to common
7
def init_operator_web(op_cls, router, operator_params: dict):
    """Call ``op_cls.init_web`` if the operator class defines it.

    ``init_web`` receives ``router`` and the entry of ``operator_params`` stored
    under ``op_cls.name()`` (an empty dict when there is none). This is
    best-effort: a failure inside ``init_web`` is logged and not re-raised.

    :param op_cls: operator class; only touched when it has an ``init_web`` attribute
    :param router: passed through to ``init_web`` unchanged
    :param operator_params: mapping from operator name to that operator's parameters
    """
    if not hasattr(op_cls, "init_web"):
        return
    logger.info(f"operator_params: {operator_params} {op_cls.name()}")
    init_func = getattr(op_cls, "init_web")
    try:
        init_func(router, operator_params.get(op_cls.name(), {}))
    except Exception as e:
        # logger.exception logs at ERROR level like before, and adds the traceback.
        logger.exception(f"{op_cls} init_web fail, {str(e)}")