recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333):
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,283 @@
1
+ import logging
2
+ import os.path
3
+ from urllib.parse import urlparse
4
+
5
+ from slugify import slugify
6
+
7
+ from recurvedata.pigeon.connector import get_connector, get_connector_class
8
+ from recurvedata.pigeon.const import LOAD_APPEND, LOAD_MERGE
9
+ from recurvedata.pigeon.handler.csv_handler import create_csv_file_handler_factory
10
+ from recurvedata.pigeon.loader import CSVToHiveLoader, CSVToMySQLLoader, CSVToRedshiftLoader
11
+ from recurvedata.pigeon.utils import ensure_list, fs
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def _parse_db_table(db_table):
    """Split a dotted table reference into database, schema and table.

    Accepts ``{database}.{table}`` (schema is ``None``) or
    ``{database}.{schema}.{table}``.

    :param db_table: the dotted reference to split
    :returns: a dict with the keys ``database``, ``schema`` and ``table``
    :raises ValueError: when the reference does not have two or three parts
    """
    parts = db_table.split(".")
    if len(parts) not in (2, 3):
        raise ValueError(f"Invalid database and table {db_table!r}")
    # With two parts ``middle`` is empty; with three it holds the schema.
    database, *middle, table = parts
    return dict(
        database=database,
        schema=middle[0] if middle else None,
        table=table,
    )
30
+
31
+
32
class Location:
    """Describe one endpoint (source or destination) of a sync.

    Syntax:
        Full URL form:         {protocol}://{user}:{password}@{host}:{port}/{database}.{table}
        Short form:            {dbconf}:{database}.{table}
        Local file:            file://{path}
        Local file, shortened: file:{path}
    Example:
        mysql://dev:pass@172.16.24.93:3306/testdb.test
        tidb:testdb.test
        file:///tmp/result.csv
        file:/tmp/result.csv
    """

    # Every attribute an instance carries; attributes that were not supplied are None.
    ATTRS = ["protocol", "user", "password", "host", "port", "database", "schema", "table", "dbconf", "path"]

    def __init__(self, spec=None, **kwargs):
        """Build a location from a spec string or from explicit attributes.

        :param spec: a spec string in one of the syntaxes listed on the class;
            when keyword arguments are given as well, it is used as ``dbconf``
        :param kwargs: explicit attribute values, restricted to ``ATTRS``
        :raises ValueError: when neither a spec nor keyword arguments are given,
            or when the resulting attributes are invalid
        """
        if not kwargs:
            if not spec:
                raise ValueError("Location spec is required")
            if spec.startswith("file:"):
                self._from_file(spec)
            elif "://" in spec:
                self._from_url(spec)
            else:
                self._from_simple(spec)
        else:
            if spec is not None:
                kwargs["dbconf"] = spec
            self._from_dict(kwargs)

    def __str__(self):
        if self.protocol == "file":
            return self._format_file()
        if self.dbconf:
            return self._format_simple()
        else:
            return self._format_url()

    def _format_file(self):
        """Render the shortened local-file form ``{protocol}:{path}``."""
        return f"{self.protocol}:{self.path}"

    def _format_url(self, hide_password=True):
        """Render the full URL form.

        :param hide_password: when true the password is rendered as ``***``
        """
        ret = [f"{self.protocol}://"]
        if self.user:
            ret.append(str(self.user))
            if self.password:
                # Fixed: this used to read the misspelled ``self.passsword``,
                # raising AttributeError whenever hide_password was False.
                password = "***" if hide_password else self.password
                ret.append(f":{password}")
            ret.append("@")
        if self.host:
            if ":" in self.host:
                # A host containing ":" is wrapped in brackets so the port stays unambiguous.
                ret.append(f"[{self.host}]")
            else:
                ret.append(str(self.host))
            if self.port:
                ret.append(f":{self.port}")
        if self.database:
            ret.append(f"/{self.database}")
        if self.schema:
            ret.append(f".{self.schema}")
        if self.table:
            ret.append(f".{self.table}")
        return "".join(ret)

    def _format_simple(self):
        """Render the short form ``{dbconf}:{database}.{schema}.{table}``."""
        ret = [f"{self.dbconf}:"]
        if self.database:
            ret.append(f"{self.database}")
        if self.schema:
            ret.append(f".{self.schema}")
        if self.table:
            ret.append(f".{self.table}")
        return "".join(ret)

    def __repr__(self):
        return f"<{type(self).__name__} {str(self)}>"

    def _from_file(self, url):
        """
        Syntax:
            file://{path}
            file:{path}
        """
        protocol, path = url.split(":", maxsplit=1)
        if path.startswith("//"):
            # Drop only the "//" authority marker, so "file:///tmp/x" yields "/tmp/x".
            path = path[2:]
        params = {"protocol": protocol, "path": path}
        self._from_dict(params)

    def _from_url(self, url):
        """
        Syntax:
            {protocol}://{user}:{password}@{host}:{port}/{database}.{table}
        """
        parsed = urlparse(url)
        params = {
            "protocol": parsed.scheme,
            "user": parsed.username,
            "password": parsed.password,
            "host": parsed.hostname,
            "port": parsed.port,
        }
        params.update(_parse_db_table(parsed.path.strip("/")))
        self._from_dict(params)

    def _from_simple(self, spec):
        """
        Syntax:
            {dbconf}:{database}.{table}
        """
        dbconf, db_table = spec.strip().split(":")
        params = {"dbconf": dbconf}
        params.update(_parse_db_table(db_table))
        self._from_dict(params)

    def _from_dict(self, params):
        """Validate ``params`` and copy them onto the instance.

        :raises ValueError: on keys outside ``ATTRS``, when both ``protocol``
            and ``dbconf`` are missing, or when a file location has no path
        """
        unknown_params = set(params) - set(self.ATTRS)
        if unknown_params:
            raise ValueError(f"Unknown params {unknown_params}")
        if not params.get("protocol") and not params.get("dbconf"):
            raise ValueError("protocol or dbconf is required")
        if params.get("protocol") == "file" and not params.get("path"):
            raise ValueError("path is required")
        for k in self.ATTRS:
            setattr(self, k, params.get(k))

    def to_dict(self):
        """Return the attributes that are neither ``None`` nor an empty string."""
        ret = {}
        for k in self.ATTRS:
            v = getattr(self, k, None)
            if v is not None and v != "":
                ret[k] = v
        return ret

    @property
    def is_local(self):
        """True when either ``dbconf`` or ``protocol`` equals ``"file"``."""
        return self.dbconf == "file" or self.protocol == "file"
170
+
171
+
172
def _get_dumper_cls(dtype):
    """Pick the dumper class for a source type.

    ``"cassandra"`` maps to ``CassandraDumper``; every other value falls back
    to ``DBAPIDumper``. The imports are done lazily inside the function.
    """
    # XXX: so ugly here
    if dtype == "cassandra":
        from recurvedata.pigeon.dumper.cass import CassandraDumper as dumper_cls
    else:
        from recurvedata.pigeon.dumper.dbapi import DBAPIDumper as dumper_cls

    return dumper_cls
182
+
183
+
184
def _get_connector(location):
    """Create the connector for ``location``.

    A location that carries a ``protocol`` (URL form) gets a connector built
    directly from its host, port, user, password and database. Otherwise the
    connector is looked up by ``dbconf``.

    :param location: the ``Location`` to connect to
    :returns: the connector instance
    """
    if location.protocol:
        # only support dbapi
        connector_cls = get_connector_class(location.protocol)
        return connector_cls(
            host=location.host,
            port=location.port,
            user=location.user,
            # Fixed: this used to read the misspelled ``location.passsword``, which
            # is never set on a Location and raised AttributeError.
            password=location.password,
            database=location.database,
        )
    return get_connector(location.dbconf, database=location.database)
198
+
199
+
200
def _dump(src, handler_factory):
    """Dump the table described by ``src`` through ``handler_factory``.

    The table name is qualified with the schema when the location has one.
    """
    connector = _get_connector(src)
    table = f"{src.schema}.{src.table}" if src.schema else src.table
    dumper_cls = _get_dumper_cls(src.protocol or src.dbconf)
    dumper = dumper_cls(connector, table=table, handler_factories=[handler_factory])
    logger.info("Dump start".center(40, "="))
    dumper.execute()
213
+
214
+
215
# Loader lookup table, keyed by destination type. For each entry:
#   "cls"       - the CSV loader class to instantiate
#   "connector" - the keyword-argument name under which that loader
#                 receives its connector
_loader_config = {
    "hive": {
        "cls": CSVToHiveLoader,
        "connector": "hive_connector",
    },
    "redshift": {
        "cls": CSVToRedshiftLoader,
        "connector": "redshift_connector",
    },
    "mysql": {
        "cls": CSVToMySQLLoader,
        "connector": "connector",
    },
}
# "tidb" shares the very same config dict object as "mysql".
_loader_config["tidb"] = _loader_config["mysql"]
230
+
231
+
232
def _load(dst, filename, mode, merge_keys=()):
    """Load ``filename`` into the table described by ``dst``.

    :param dst: destination ``Location``
    :param filename: path of the staged file to load
    :param mode: load mode, compared case-insensitively against
        ``LOAD_MERGE`` and ``LOAD_APPEND``; any other value leaves the
        loader's own default mode in place
    :param merge_keys: primary key(s), only used in merge mode
    :raises KeyError: when no loader is configured for the destination type
    """
    connector = _get_connector(dst)

    loader_spec = _loader_config[dst.protocol or dst.dbconf]
    loader_cls, connector_kwarg = loader_spec["cls"], loader_spec["connector"]

    loader_kwargs = dict(database=dst.database, table=dst.table, filename=filename)
    # Each loader class expects its connector under a different keyword name.
    loader_kwargs[connector_kwarg] = connector

    requested_mode = mode.upper()
    if requested_mode == LOAD_MERGE:
        logger.info(f"Primary keys: {merge_keys} in {dst}")
        loader_kwargs["mode"] = LOAD_MERGE
        loader_kwargs["primary_keys"] = ensure_list(merge_keys)
    elif requested_mode == LOAD_APPEND:
        loader_kwargs["mode"] = LOAD_APPEND

    loader = loader_cls(**loader_kwargs)
    logger.info("Load start".center(40, "="))
    loader.execute()
251
+
252
+
253
def _get_stage_filename(src, dst):
    """Decide which file the data is staged in between dump and load.

    - A local source is used as-is (``src.path``).
    - A local destination is used as the stage file, made absolute if needed.
    - Otherwise a new stage file is created under a
      ``{source type}_to_{destination type}`` directory.
    """
    if src.is_local:
        return src.path

    if not dst.is_local:
        src_kind = src.protocol or src.dbconf
        dst_kind = dst.protocol or dst.dbconf
        make_stagefile = fs.new_stagefile_factory(f"{src_kind}_to_{dst_kind}")
        return make_stagefile(slugify(f"{src}_to_{dst}") + ".txt")

    target = dst.path
    return target if os.path.isabs(target) else os.path.abspath(target)
263
+
264
+
265
def sync(src, dst, mode, merge_keys=()):
    """Synchronise a single table.

    :param src: source, either a ``Location`` or a spec string for one
    :param dst: destination, either a ``Location`` or a spec string for one
    :param mode: load mode forwarded to the loader
    :param merge_keys: primary key(s) forwarded to the loader
    :raises NotImplementedError: when the destination is given in URL form
        (any protocol other than ``file``)
    """
    src = src if isinstance(src, Location) else Location(src)
    dst = dst if isinstance(dst, Location) else Location(dst)

    if dst.protocol and dst.protocol != "file":
        # Message reads: "URL-form destinations are not supported yet".
        raise NotImplementedError("暂不支持URL形式的目标")

    filename = _get_stage_filename(src, dst)
    logger.info(f"Dump to file: {filename}")

    # A local source is already the stage file, so there is nothing to dump.
    if not src.is_local:
        dst_kind = dst.protocol or dst.dbconf
        handler_factory = create_csv_file_handler_factory(
            filename=filename,
            hive=dst_kind in ["impala", "hive"],
        )
        _dump(src, handler_factory)

    # A local destination is the stage file itself, so there is nothing to load.
    if not dst.is_local:
        _load(dst, filename, mode=mode, merge_keys=merge_keys)
@@ -0,0 +1,146 @@
1
+ import struct
2
+ import zlib
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ try:
6
+ # use ujson for better performance
7
+ import ujson as json
8
+ except ImportError:
9
+ import json
10
+
11
+ from recurvedata.pigeon import const
12
+ from recurvedata.pigeon.schema import Schema
13
+
14
+ _Row = Union[Tuple, Dict[str, Any]]
15
+
16
+
17
class Transformer:
    """Base class for row transformers.

    The default implementation returns each row unchanged. Subclasses override
    ``transform_impl`` and, when they change the shape of the rows,
    ``output_schema``.
    """

    _input_schema: Optional[Schema] = None
    # When true, ``output_schema`` simply reports the input schema.
    _use_input_schema_as_output: bool = False

    @property
    def input_schema(self) -> Optional[Schema]:
        """Returns the schema of input data"""
        return self._input_schema

    @input_schema.setter
    def input_schema(self, schema: Schema):
        """Should be called by the handler"""
        assert isinstance(schema, Schema)
        self._input_schema = schema

    @property
    def output_schema(self) -> Optional[Schema]:
        """Subclasses that change the rows schema should provide the output schema.

        These operations will change the output schema:
            - Add or remove fields
            - Change the name of fields
            - Change the type of fields

        An example of valid schema:

            from recurvedata.pigeon.schema import Schema, Field, types

            Schema([
                Field(name='id', type=types.INT32),
                Field(name='name', type=types.STRING, size=64),
                Field(name='snapshot_time', type=types.DATETIME, comment='snapshot_time in UTC'),
                Field(name='is_active', type=types.BOOLEAN)
            ])

        Allowed types:

            - INT8 = 'INT8'  # 1-byte (8-bit) signed integers
            - INT16 = 'INT16'  # 2-byte (16-bit) signed integers
            - INT32 = 'INT32'  # 4-byte (32-bit) signed integers
            - INT64 = 'INT64'  # 8-byte (64-bit) signed integers
            - FLOAT32 = 'FLOAT32'  # 4-byte (32-bit) single-precision floating
            - FLOAT64 = 'FLOAT64'  # 8-byte (64-bit) double-precision floating
            - BOOLEAN = 'BOOLEAN'
            - DATETIME = 'DATETIME'
            - DATE = 'DATE'
            - STRING = 'STRING'
        """
        if self._use_input_schema_as_output:
            return self._input_schema
        return None

    def transform(self, row: _Row, *args, **kwargs) -> Union[_Row, List[_Row]]:
        """This is the method called by Handler.

        It internally calls `transform_impl` to do the real transform logic.
        Subclasses should implement `transform_impl` but not this method.

        :param row: a Row (namedtuple) object contains a row record fetched from database
        :returns: returns one (tuple) or multiple (list of tuple) rows
        """
        return self.transform_impl(row, *args, **kwargs)

    def transform_impl(self, row: _Row, *args, **kwargs) -> Union[_Row, List[_Row]]:
        """Hook for subclasses; the base implementation returns ``row`` unchanged."""
        return row

    @staticmethod
    def convert_json_to_hive_map(data: Union[str, bytes]) -> str:
        """Convert a JSON object into Hive's delimited map text.

        Keys and values are stringified and stripped of surrounding whitespace.

        :param data: JSON text of an object; falsy input yields ``const.HIVE_NULL``
        """
        if not data:
            return const.HIVE_NULL

        mapping = json.loads(data)
        items = [
            f"{str(key).strip()}{const.HIVE_MAP_KV_DELIMITER}{str(value).strip()}"
            for key, value in mapping.items()
        ]
        return const.HIVE_MAP_ITEM_DELIMITER.join(items)

    @staticmethod
    def convert_json_to_hive_array(data: Union[str, bytes]) -> str:
        """Convert a JSON array into Hive's delimited array text.

        :param data: JSON text of an array; falsy input yields ``const.HIVE_NULL``
        """
        if not data:
            return const.HIVE_NULL

        items = json.loads(data)
        # Stringify each item, as convert_json_to_hive_map does for its values;
        # str.join would otherwise raise TypeError on numbers, booleans or nulls.
        return const.HIVE_ARRAY_DELIMITER.join(str(item) for item in items)

    @staticmethod
    def mysql_uncompress(value: bytes, return_str: bool = False) -> Union[bytes, str]:
        """A Python implementation of UNCOMPRESS function of MySQL.

        Used to decompress result of COMPRESS function.

        https://dev.mysql.com/doc/refman/5.7/en/encryption-functions.html#function_compress

        :param value: the compressed data in bytes
        :param return_str: the return value should be unicode
        :type return_str: bool
        :rtype: bytes | str
        """

        # Empty strings are stored as empty strings.
        # Nonempty strings are stored as a 4-byte length of the uncompressed string
        if not value or len(value) < 4:
            # Too short to carry the length header: handed back unchanged.
            return value

        # Skip the 4-byte length header; the rest is the zlib stream.
        rv = zlib.decompress(value[4:])

        if return_str:
            rv = rv.decode()
        return rv

    @staticmethod
    def mysql_compress(value: Optional[str]) -> Optional[bytes]:
        """A Python implementation of COMPRESS function of MySQL.

        The result is a 4-byte length of the uncompressed data (low byte
        first) followed by the zlib-compressed data.

        :param value: the text to compress; ``bytes`` are accepted as well
        :returns: ``None`` for ``None``, ``b''`` for empty input, otherwise
            the length header plus the compressed payload
        """
        if value is None:
            return None
        raw = value.encode() if isinstance(value, str) else value
        if not raw:
            return b''
        # Fixed: the header must hold the length of the uncompressed *bytes*
        # (not the character count, which differs for non-ASCII text) and must
        # be little-endian regardless of the host's native byte order.
        size = struct.pack('<I', len(raw))
        return size + zlib.compress(raw)

    @staticmethod
    def json_loads(*args, **kwargs) -> Any:
        """Thin wrapper around ``json.loads`` of whichever json module was imported."""
        return json.loads(*args, **kwargs)

    @staticmethod
    def json_dumps(*args, **kwargs) -> str:
        """Thin wrapper around ``json.dumps`` of whichever json module was imported."""
        return json.dumps(*args, **kwargs)
@@ -0,0 +1,134 @@
1
+ import hashlib
2
+ import logging
3
+ import time
4
+ from contextlib import contextmanager
5
+ from typing import Dict, List, Set, Tuple, TypeVar, Union
6
+ from uuid import uuid4
7
+
8
+ import cytoolz as toolz
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ T = TypeVar("T")
13
+
14
+
15
def safe_int(v: Union[str, int, float], default: int = 0) -> int:
    """Convert ``v`` with ``int()``, returning ``default`` when that fails."""
    try:
        converted = int(v)
    except Exception:
        converted = default
    return converted
20
+
21
+
22
def trim_prefix(s: str, sub: str) -> str:
    """Return ``s`` without a leading ``sub``; ``s`` is unchanged if it does not start with it."""
    return s[len(sub):] if s.startswith(sub) else s
26
+
27
+
28
def trim_suffix(s: str, sub: str) -> str:
    """Return ``s`` without a trailing ``sub``.

    ``s`` is returned unchanged when it does not end with ``sub`` or when
    ``sub`` is empty.
    """
    # An empty suffix must be a no-op: every string "ends with" "", and the
    # slice s[:-0] is s[:0], which would wrongly drop the whole string.
    if not sub or not s.endswith(sub):
        return s
    return s[: len(s) - len(sub)]
32
+
33
+
34
class LoggingMixin(object):
    """Mixin that gives instances a lazily created, cached ``logger``."""

    @property
    def logger(self) -> logging.Logger:
        """Logger named ``{module}.{class name}``, created on first access."""
        if not hasattr(self, "_logger"):
            cls = self.__class__
            self._logger = logging.root.getChild(cls.__module__ + "." + cls.__name__)
        return self._logger
42
+
43
+
44
def init_logging(
    level_name="info",
    fmt="%(asctime)s - %(levelname)s - %(name)s - %(filename)s:%(lineno)d - [%(process)d:%(threadName)s] - %(message)s",
    silent_cassandra=True,
):
    """Configure logging through ``logging.basicConfig``.

    :param level_name: one of ``"info"``, ``"warning"``, ``"error"`` or
        ``"debug"``; any other value falls back to INFO
    :param fmt: the log record format string
    :param silent_cassandra: when true, raise the ``cassandra.cluster``
        logger to WARNING
    """
    known_levels = (
        ("info", logging.INFO),
        ("warning", logging.WARNING),
        ("error", logging.ERROR),
        ("debug", logging.DEBUG),
    )
    level = next(
        (value for name, value in known_levels if level_name == name),
        logging.INFO,
    )
    logging.basicConfig(level=level, format=fmt)

    if silent_cassandra:
        # cassandra is too noisy
        logging.getLogger("cassandra.cluster").setLevel(logging.WARNING)
63
+
64
+
65
def ensure_list(v: Union[T, Tuple[T], List[T], Set[T]]) -> List[T]:
    """Return a tuple, set or list as a new list; wrap anything else in a one-item list."""
    return list(v) if isinstance(v, (tuple, set, list)) else [v]
69
+
70
+
71
def ensure_str_list(v: str, sep: str = ",", strip: bool = True) -> List[str]:
    """Turn a separated string (or an existing collection) into a list.

    :param v: ``None``, a string, or a tuple/set/list
    :param sep: the separator used to split a string value
    :param strip: strip surrounding whitespace from every split item
    :returns: ``[]`` for ``None`` or an empty string, the split items for a
        string, and ``list(v)`` for a tuple, set or list
    :raises TypeError: for any other type
    """
    if v is None:
        return []

    if isinstance(v, (tuple, set, list)):
        return list(v)

    if not isinstance(v, str):
        raise TypeError(f'unsupported type "{type(v)}"')

    if not v:
        return []
    pieces = v.split(sep)
    return [piece.strip() for piece in pieces] if strip else pieces
86
+
87
+
88
def ensure_query_list(v: Union[str, List[str]]) -> List[str]:
    """Split a ``;``-separated string into its non-empty, stripped parts.

    A falsy value yields ``[]``; a list is returned as-is (same object).
    """
    if not v:
        return []
    if isinstance(v, list):
        return v
    stripped = (chunk.strip() for chunk in v.split(";"))
    return [query for query in stripped if query]
94
+
95
+
96
def extract_dict(d: Dict, keys: List) -> Dict:
    """Return a new dict holding only the items of ``d`` whose key is in ``keys``."""
    selected = {}
    for key, value in d.items():
        if key in keys:
            selected[key] = value
    return selected
98
+
99
+
100
@contextmanager
def silent(*_excs, excs=None):
    """Context manager that logs and swallows the given exception types.

    The types to swallow come from ``excs`` if given, else from the
    positional arguments, else ``(Exception,)``. Anything else propagates.
    """
    swallowed = excs or _excs or (Exception,)
    try:
        yield
    except swallowed as err:
        logging.exception("silent %s", type(err).__name__)
107
+
108
+
109
def replace_null_values(row: Union[List, Tuple, Dict], null_values: List, replacer=None):
    """Replace every value of ``row`` found in ``null_values`` with ``replacer``.

    :param row: a list, tuple or dict; for a dict only the values are examined
    :param null_values: the values treated as null
    :param replacer: the substitute value
    :returns: a new container of the same kind as ``row``
    :raises TypeError: when ``row`` is not a list, tuple or dict
    """
    def _swap(value):
        return replacer if value in null_values else value

    if isinstance(row, list):
        return [_swap(value) for value in row]
    if isinstance(row, tuple):
        return tuple(_swap(value) for value in row)
    if isinstance(row, dict):
        return toolz.valmap(_swap, row)
    raise TypeError(f"only list, tuple or dict type is supported, got {repr(type(row))}")
122
+
123
+
124
def md5hash(v: Union[str, bytes]) -> str:
    """Return the hex MD5 digest of ``v``.

    ``str`` is encoded first; any value that is neither ``str`` nor ``bytes``
    is converted with ``str()`` and then encoded.
    """
    if isinstance(v, str):
        payload = v.encode()
    elif isinstance(v, bytes):
        payload = v
    else:
        payload = str(v).encode()
    return hashlib.md5(payload).hexdigest()
130
+
131
+
132
def randomized_suffix() -> str:
    """Return a 7-character suffix: the last digit of the current timestamp
    followed by the first six hex characters of a fresh UUID4.
    """
    # NOTE(review): only the final timestamp digit is used - confirm that a
    # longer time component was not intended.
    last_time_digit = str(time.time()).replace(".", "")[-1]
    return f"{last_time_digit}{uuid4().hex[:6]}"