recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,172 @@
+ import contextlib
+ import csv
+ import logging
+ import sys
+
+ import cytoolz as toolz
+
+ from recurvedata.utils.imports import MockModule
+
+ try:
+     import numpy as np
+     import pandas as pd
+ except ImportError:
+     np = MockModule("numpy")
+     pd = MockModule("pandas")
+
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.schema import Schema, types
+
+ csv.field_size_limit(sys.maxsize)
+
+ dialect_terms = (
+     "delimiter",
+     "doublequote",
+     "escapechar",
+     "lineterminator",
+     "quotechar",
+     "quoting",
+     "skipinitialspace",
+     "strict",
+ )
+
+
+ class ExtendedSniffer(csv.Sniffer):
+     def __init__(self):
+         super().__init__()
+         self.preferred = [",", "\t", ";", " ", ":", "|", const.HIVE_FIELD_DELIMITER]
+
+
+ def copy_dialect(name, source_dialect):
+     return dict_to_dialect(dialect_to_dict(source_dialect), name)
+
+
+ def dialect_to_dict(dialect):
+     return {name: getattr(dialect, name) for name in dialect_terms if hasattr(dialect, name)}
+
+
+ def dict_to_dialect(d, name=""):
+     class dialect(csv.Dialect):
+         _name = name
+
+     for name in dialect_terms:
+         if name in d:
+             setattr(dialect, name, d[name])
+     return dialect
+
+
+ def infer_header(path, nbytes=10000, encoding="utf-8"):
+     with open(path, "rb") as f:
+         sample = f.read(nbytes).decode(encoding, "replace")
+     sniffer = ExtendedSniffer()
+     try:
+         return sniffer.has_header(sample)
+     except csv.Error:
+         return None
+
+
+ def sniff_dialect(path, nbytes=10000, encoding="utf-8"):
+     with open(path, "rb") as f:
+         sample = f.read(nbytes).decode(encoding, "replace")
+     sniffer = ExtendedSniffer()
+     try:
+         dialect = sniffer.sniff(sample, delimiters=sniffer.preferred)
+     except csv.Error as e:
+         logging.warning("failed to sniff dialect, copy from csv.excel. error: %s", e)
+         dialect = copy_dialect(name="excel_copy", source_dialect=csv.excel)
+
+     crnl, nl = "\r\n", "\n"
+     dialect.lineterminator = crnl if crnl in sample else nl
+     return dialect
+
+
+ class CSV(object):
+     """
+     Proxy for a CSV file.
+     """
+
+     def __init__(self, path, has_header=None, encoding="utf-8", **dialect_kwargs):
+         self.path = path
+         self._has_header = has_header
+         self.encoding = encoding or "utf-8"
+         self._dialect_kwargs = dialect_kwargs
+
+     @toolz.memoize
+     def _sniff_dialect(self):
+         dialect = sniff_dialect(self.path, encoding=self.encoding)
+         for k, v in self._dialect_kwargs.items():
+             if k in dialect_terms:
+                 setattr(dialect, k, v)
+         return dialect
+
+     @property
+     def dialect(self):
+         return self._sniff_dialect()
+
+     @property
+     def dialect_options(self):
+         return dialect_to_dict(self.dialect)
+
+     @property
+     def has_header(self):
+         if self._has_header is None:
+             self._has_header = infer_header(self.path, encoding=self.encoding)
+
+         return self._has_header
+
+     @property
+     def header(self):
+         if not self.has_header:
+             return None
+
+         with open(self.path, encoding=self.encoding, newline="") as f:
+             reader = csv.reader(f, **self.dialect_options)
+             header = next(reader)
+         return tuple(header)
+
+     def to_df(self):
+         return pd.read_csv(self.path, encoding=self.encoding, dialect=self.dialect)
+
+     @contextlib.contextmanager
+     def reader(self, as_dict=False):
+         if as_dict and not self.header:
+             raise ValueError("missing header")
+
+         with open(self.path, encoding=self.encoding, newline="") as fd:
+             if as_dict:
+                 reader = csv.DictReader(fd, **self.dialect_options)
+             else:
+                 if self.has_header:
+                     fd.readline()  # skip header
+                 reader = csv.reader(fd, **self.dialect_options)
+             yield reader
+
+     @toolz.memoize
+     def infer_schema(self):
+         if not self.has_header:
+             return None
+
+         mapping = {
+             np.int8: types.INT8,
+             np.int16: types.INT16,
+             np.int32: types.INT32,
+             np.int64: types.INT64,
+             np.float16: types.FLOAT32,
+             np.float32: types.FLOAT32,
+             np.float64: types.FLOAT64,
+             np.datetime64: types.DATETIME,
+             np.object_: types.STRING,
+             np.str_: types.STRING,
+         }
+         # np.bool removed since numpy 1.20 https://github.com/numpy/numpy/releases/tag/v1.20.0
+         if np.__version__ < "1.20.0":
+             mapping[np.bool] = types.BOOLEAN
+         else:
+             mapping[np.bool_] = types.BOOLEAN
+
+         df = pd.read_csv(self.path, encoding=self.encoding, dialect=self.dialect, nrows=500)
+         schema = Schema()
+         for col in df.columns:
+             canonical_type = mapping.get(df.dtypes[col].type, types.STRING)
+             schema.add_field_by_attrs(col, canonical_type)
+         return schema
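
The hunk above adds `recurvedata/pigeon/csv.py`. Below is a short usage sketch based only on the API shown in that hunk; the sample file path is a placeholder, and the optional numpy/pandas dependencies must be installed for `infer_schema()` and `to_df()` to work.

```python
from recurvedata.pigeon.csv import CSV

c = CSV("/tmp/sample.csv", encoding="utf-8")  # dialect and header are sniffed lazily and memoized
print(c.has_header, c.dialect_options)
print(c.header)  # tuple of column names, or None when no header is detected

# Stream rows without loading the whole file; as_dict=True requires a header.
with c.reader(as_dict=True) as rows:
    for row in rows:
        print(row)

schema = c.infer_schema()  # pandas-based type inference over the first 500 rows
```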
@@ -0,0 +1,82 @@
+ {
+   "__meta__": {
+     "description": "THIS FILE IS AUTO GENERATED, PLEASE DO NOT EDIT",
+     "version": "2023-03-21T16:56:28.143088"
+   },
+   "clickhouse_default": {
+     "host": "clickhouse.ym",
+     "password": "gAAAAABkGXE805UXC1yT86W2NJ-L5s6VThImXfhqZNAWq8ejW_kr40hL6HDBkfNynAwZnK7xJ-hKdDp_kmB9pvoa0vooRCwTVg==",
+     "port": 19000,
+     "user": "ymetl"
+   },
+   "emr_hdfs_default": {
+     "host": "emr-header-2",
+     "port": 50070,
+     "user_name": "ymetl"
+   },
+   "emr_hive_default": {
+     "auth": "LDAP",
+     "hdfs_options": {
+       "host": "emr-header-2",
+       "port": 50070,
+       "user_name": "ymetl"
+     },
+     "hive_conf": {
+       "tez.queue.name": "etl"
+     },
+     "host": "ha.hive.emr.ym",
+     "password": "gAAAAABkGXE8L8IcXEm6k2vnkNd8vtapiIFe7vbqTu5ywGJrSkoZy1We4o_hWRElIJ3SCQvHTNsXdEW59qTbybasR2kSqIJ_Ys0lX3_xLIeuZ307qJGFKIE=",
+     "port": 10001,
+     "user": "ymetl"
+   },
+   "emr_impala_default": {
+     "auth_mechanism": "PLAIN",
+     "host": "ha.impala.emr.ym",
+     "password": "gAAAAABkGXE8WZQCSss2zXKeBswFEG1Qvdv6QqfFDwfszwu1bgP6ZSd1wRiIXv7tXL8cBWGBeZP1eYqexcwo5Cehor_9lUXjp7YVrAQEXzTjcWa9zlpkK50=",
+     "port": 21051,
+     "user": "ymetl"
+   },
+   "hdfs_default": {
+     "host": "hdfsnn.ym",
+     "port": 50070,
+     "user_name": "ymetl"
+   },
+   "hive_default": {
+     "auth": "LDAP",
+     "hdfs_options": {
+       "host": "hdfsnn.ym",
+       "port": 50070,
+       "user_name": "ymetl"
+     },
+     "hive_conf": {
+       "spark.yarn.queue": "etl"
+     },
+     "host": "hive.ym",
+     "password": "gAAAAABkGXE8zVq-ZOWQyzTMzP-ogS-TqV8K_gxklD61LmsEZeN54pOBIDpKJD9n5913vD4mZRTEEKzxKunLde9dpVW4u2lbZyepP-YT-tEbqjIfrW-gRUY=",
+     "port": 10000,
+     "user": "ymetl"
+   },
+   "impala_default": {
+     "auth_mechanism": "PLAIN",
+     "host": "ha.impala.ym",
+     "password": "gAAAAABkGXE8ciN_A0sQmzh1VOKeTprtjOtp_JPR7yCgZQeZiUD0lQ4dIshzWxLfb_YIqEcxL7uXYyxk0jFVwpFGJQUl8gNaCESFtg_Cei7tLwD4cm5KA9o=",
+     "port": 21051,
+     "user": "ymetl"
+   },
+   "mysql_default": {
+     "host": "mysql.ym",
+     "password": "gAAAAABkGXE8d1AG1mqenBsAOgGx_blaQQUceK0D_R1Vbo-wLe2ZHdHEmW9dSJ4fgYMkwy95-6uSjdXP3RfnankyXgd-BBLyiA==",
+     "port": 3306,
+     "user": "dev"
+   },
+   "phoenix_default": {
+     "host": "phoenix-etl.ym",
+     "port": 8765
+   },
+   "tidb_default": {
+     "host": "tidb-etl.ym",
+     "password": "gAAAAABkGXE8-zHHCsScuOqgSB6bVTmA1Mxdl_jp2Z-DprOC5Qh6cHrr33VOREGPyAJH_1Wh5SjHzGC1KmyRO49dbS38-bvQuJo8Z_ReCX1yL1DwlkgwMmv69xAlasFEEBwMP8CirioX",
+     "port": 4000,
+     "user": "dev"
+   }
+ }
@@ -0,0 +1,111 @@
+ A Lightweight and General Data Synchronization Solution
+ =======================
+
+ Data synchronization between different database systems is a common requirement in the big data field. A typical scenario: the business system uses MySQL for transactions and random queries, the data warehouse uses Hive, and the results of ETL are then pushed into systems such as MySQL or AWS Redshift for BI and reporting tools.
+
+ First, let's clarify the requirements and goals:
+
+ - Timeliness: non-real-time, offline synchronization, generally T+1, or at finest hourly granularity
+ - Scalability: must support multiple heterogeneous data sources, such as MySQL, Hive, ElasticSearch, etc.
+ - Performance: as an offline system there is no strict performance requirement, but it should be as fast as possible and leave room for optimization
+ - Complexity: low complexity, few dependencies, easy to use and easy to operate
+ - Functionality: must support both full and incremental synchronization
+
+ ## Solution
+
+ Data synchronization is not a special problem; it boils down to two operations, read and write. Database backup and restore is similar, and many database systems ship such tools, for example MySQL's `mysqldump` and `mysqlimport`, or MongoDB's `mongodump` and `mongorestore`. These tools generally use special encoding formats for performance and do not aim for generality, but a general data synchronization system can be built with the same approach.
+
+ ![pigeon_design.png](./images/pigeon_design.png)
+
+ The diagram above describes the solution in this article: split reads from writes, and hand data over through intermediate CSV files.
+
+ ## Scalability
+
+ The core of this design is to abstract data synchronization into two fully decoupled processes, export (read) and import (write), which gives it good scalability. Each data source only needs to implement these two operations. Taking common data sources as examples, let's look at how to import data from CSV (exporting to CSV is easy and can be implemented in any programming language).
+
+ | Data source   | Import CSV                                                   |
+ | ------------- | ------------------------------------------------------------ |
+ | MySQL         | Use `LOAD DATA LOCAL INFILE` for batch loading, or read the file and run `INSERT` statements |
+ | AWS Redshift  | Stage the file on AWS S3, then use the `COPY` command for batch loading |
+ | Hive          | Set the SerDe to `org.apache.hadoop.hive.serde2.OpenCSVSerde` when creating the table, or convert the CSV to the default `TEXTFILE` format before importing; then use `LOAD DATA [LOCAL] INPATH` for batch loading |
+ | ElasticSearch | Read the file and insert in batches                          |
+ | FTP, AWS S3   | Upload directly                                              |
+
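As a concrete illustration of the MySQL row in the table above, here is a minimal sketch (not part of the package) that bulk-loads a CSV file with `LOAD DATA LOCAL INFILE` through pymysql; the table name, file path, and connection settings are placeholders.

```python
import pymysql

# Placeholder credentials; local_infile must be enabled on both client and server.
conn = pymysql.connect(host="127.0.0.1", user="etl", password="...", database="demo", local_infile=True)
try:
    with conn.cursor() as cur:
        # Load /tmp/orders.csv into the hypothetical table `orders`, skipping its header row.
        cur.execute(
            """
            LOAD DATA LOCAL INFILE '/tmp/orders.csv'
            INTO TABLE orders
            FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
            IGNORE 1 LINES
            """
        )
    conn.commit()
finally:
    conn.close()
```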
+ ## Performance Issues
+
+ Another benefit of decoupling is that it simplifies performance optimization: export and import can be tuned independently without worrying about each other.
+
+ ### Export Performance
+
+ Export performance is usually improved through parallelization: split the data set and process the pieces in parallel.
+
+ Taking MySQL as an example, if the table has an auto-increment primary key, first query its lower and upper bounds, split the range into N chunks, and start M threads to consume them (Sqoop takes the same approach, controlled by adjusting the number of mappers). Each thread can write its own file, to be merged afterwards, or a separate thread can aggregate and write; the first approach generally performs better.
+
+ The premise of this optimization is that the data can be split reasonably evenly. With data skew, the improvement may be small, and the job may even degrade to a single thread. For databases, the split column also needs an index, so an auto-increment primary key or an indexed timestamp is usually chosen. Parallelism cannot be too high, otherwise it may put too much pressure on the upstream system. Another implementation detail: rows should be streamed to the file rather than pulled into memory all at once, otherwise memory usage may spike, possibly to the point of OOM.
+
+ In addition, since the export process may be interrupted, a checkpoint mechanism can be used to resume from the point of failure.
+
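A minimal sketch of the range-splitting idea described above, not taken from the package: it assumes a MySQL source reachable through pymysql with placeholder credentials, a table `events` with an auto-increment `id` column, and writes one CSV part file per chunk.

```python
import csv
from concurrent.futures import ThreadPoolExecutor

import pymysql

N_CHUNKS = 8   # number of id ranges
M_THREADS = 4  # worker threads; keep modest to avoid pressuring the source DB


def connect():
    # Placeholder credentials for the upstream MySQL instance.
    return pymysql.connect(host="127.0.0.1", user="etl", password="...", database="demo")


def dump_range(lo, hi, part):
    # Stream one id range into its own part file; the parts are merged afterwards.
    conn = connect()
    cur = conn.cursor(pymysql.cursors.SSCursor)  # unbuffered cursor: rows are streamed, not fetched at once
    cur.execute("SELECT * FROM events WHERE id >= %s AND id < %s", (lo, hi))
    with open(f"/tmp/events.part{part}.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for row in cur:
            writer.writerow(row)
    conn.close()


def parallel_dump():
    conn = connect()
    with conn.cursor() as cur:
        cur.execute("SELECT MIN(id), MAX(id) FROM events")
        lo, hi = cur.fetchone()
    conn.close()
    step = max(1, (hi - lo + 1) // N_CHUNKS)
    bounds = [(lo + i * step, lo + (i + 1) * step) for i in range(N_CHUNKS)]
    bounds[-1] = (bounds[-1][0], hi + 1)  # the last chunk must reach MAX(id)
    with ThreadPoolExecutor(max_workers=M_THREADS) as pool:
        for part, (a, b) in enumerate(bounds):
            pool.submit(dump_range, a, b, part)
```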
+ ### Import Performance
+
+ Import performance is usually improved by batching.
+
+ Some databases, such as MySQL, Hive and Redshift, support loading CSV files directly, which is generally the most efficient path. If bulk loading is not supported, a batch import API can be used instead (for example ElasticSearch's `/_bulk`, or a database `INSERT` statement inserting multiple rows at once). Some targets accept compressed files (Redshift supports GZIP among other formats), so compressing before import can cut transfer time and bandwidth.
+
+ Failed imports can also be retried from a checkpoint to "resume from the breakpoint", and a deduplication mechanism, such as a Bloom filter, can be added for checking.
+
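Where direct file loading is not available, batching the `INSERT` statements as described above is the fallback. A minimal sketch, again assuming pymysql with placeholder credentials and a hypothetical three-column `orders` table:

```python
import csv

import pymysql

BATCH_SIZE = 1000
INSERT_SQL = "INSERT INTO orders (id, amount, created_at) VALUES (%s, %s, %s)"

conn = pymysql.connect(host="127.0.0.1", user="etl", password="...", database="demo")
cur = conn.cursor()
with open("/tmp/orders.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    batch = []
    for row in reader:
        batch.append(row)
        if len(batch) >= BATCH_SIZE:
            cur.executemany(INSERT_SQL, batch)  # pymysql rewrites this into a multi-row INSERT
            batch = []
    if batch:
        cur.executemany(INSERT_SQL, batch)
conn.commit()
conn.close()
```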
+ ## Complexity
+
+ As the design diagram in the previous section shows, this solution has low complexity and a clear process, and is easy to implement. Apart from the local file system, there are essentially no external dependencies. Pay attention to logging and statistics during implementation; they make it easier to track progress, analyze problems, and locate faults.
+
+ ## Full and Incremental
+
+ In terms of complexity, full synchronization is the easiest to implement and gives the best consistency guarantees. However, as data volume grows, each full synchronization costs more resources and time. Incremental synchronization becomes necessary, and it is more complex.
+
+ ### Incremental Export
+
+ The prerequisite for incremental export is being able to identify new data. The easiest way is to use the auto-increment primary key, but this is limited by the characteristics of the database itself: some databases have no auto-increment primary key, and some do not guarantee that it is monotonic (for example [TiDB](<https://pingcap.com/docs/sql/mysql-compatibility/#auto-increment-id>): with multiple tidb-servers, a later insert may receive a smaller ID than an earlier one). Using time is more reliable, since time increases naturally and monotonically; another benefit is that for periodic incremental synchronization the window boundaries can be computed directly, with no checkpoint to persist.
+
+ A monotonically increasing integer or time column (preferably time) is a necessary condition for incremental export, and for good export performance this column should also be indexed.
+
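A minimal sketch of a time-windowed incremental export as described above, not from the package; it assumes an indexed `updated_at` column and reuses the placeholder pymysql connection from the earlier sketches.

```python
import csv
import datetime as dt

import pymysql


def dump_increment(window_end: dt.datetime, window_hours: int = 24):
    # For a periodic T+1 job the window is derived from the schedule itself,
    # so no checkpoint needs to be persisted.
    window_start = window_end - dt.timedelta(hours=window_hours)
    conn = pymysql.connect(host="127.0.0.1", user="etl", password="...", database="demo")
    cur = conn.cursor(pymysql.cursors.SSCursor)  # unbuffered: streams rows to the file
    cur.execute(
        "SELECT * FROM events WHERE updated_at >= %s AND updated_at < %s",
        (window_start, window_end),
    )
    with open("/tmp/events.inc.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for row in cur:
            writer.writerow(row)
    conn.close()
```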
+ ### Incremental Import
+
+ Incremental import has more cases to consider, such as the import mode and idempotence.
+
+ First, the import mode. It can be divided into two types: merge (`MERGE`) and append (`APPEND`). (There is also a special kind of incremental import, such as importing into a single Hive partition, which works the same way as a full import (`OVERWRITE`).)
+
+ - `MERGE`: new and updated records in the upstream system need to be synchronized to the target system, similar to `UPSERT`
+
+ - `APPEND`: the upstream system only inserts and never updates, similar to `INSERT`
+
+ `APPEND` is relatively simple to implement, but repeated imports easily produce duplicate data when there is no unique constraint (it is not idempotent). In fact `APPEND` is an extreme case of `MERGE`, so it can be implemented as a `MERGE`.
+
+ The prerequisite for implementing `MERGE` is a field that identifies a record uniquely, such as a primary key or a unique constraint (anything that distinguishes records logically is enough). Different data sources implement `MERGE` differently. Some support an `UPSERT` operation, such as Phoenix, Kudu and MongoDB; indexing a document in ElasticSearch behaves like `UPSERT` as well; some databases support `REPLACE`; MySQL also has `INSERT ... ON DUPLICATE KEY UPDATE`. For relational systems such as MySQL, Redshift and Hive there is also a general solution: use `FULL JOIN` or `LEFT JOIN + UNION ALL` (refer to [Talking about Idempotence](http://liyangliang.me/posts/2019/03/idempotence/)).
+
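A sketch of the `LEFT JOIN + UNION ALL` variant mentioned above, not from the package. It assumes `id` uniquely identifies a record, that the increment has already been loaded into a `staging` table, and that the merged result is written into an intermediate `target_merged` table before being swapped in (see the Import Process section below); all table names are placeholders.

```python
# The increment is assumed to be in `staging` already; `id` is the logical key.
MERGE_SQL = """
INSERT OVERWRITE TABLE target_merged
SELECT * FROM (
    -- keep target rows whose id is absent from the new increment
    SELECT t.* FROM target t LEFT JOIN staging s ON t.id = s.id WHERE s.id IS NULL
    UNION ALL
    -- every row of the increment wins over its old version
    SELECT s.* FROM staging s
) merged
"""


def merge(cursor):
    # `cursor` is any DB-API cursor for an engine that understands INSERT OVERWRITE (e.g. Hive).
    cursor.execute(MERGE_SQL)
```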
+ This kind of incremental import has one limitation: it cannot propagate physical deletes from the upstream system. If that is required, consider switching to soft deletes, or fall back to full synchronization.
+
+ ### Import Process
+
+ Whether full or incremental, the import process needs to guarantee at least two things: "transactionality", and not affecting (or affecting as little as possible) the use of the target data. This mainly matters for database systems; it is generally not an issue for targets such as ElasticSearch or object storage.
+
+ "Transactionality" means that the data being imported either all succeeds or all fails; a partial import must not occur.
+
+ During the import, the target data should stay available, or be affected for as short a time as possible. For example, long-lived table locks that cause queries to fail should not occur.
+
+ The process can be organized as follows: first import into a staging table, prepare the final result table, and then swap it in as the target table. Loading into the staging table can be deleted and retried as often as needed, which guarantees the new data is fully in place before moving on. For a full import, the staging table can simply be renamed to the target table, or the data copied over with an `INSERT OVERWRITE` statement. For an incremental import, an intermediate table is needed to hold the merged result; once it is complete, the target table is updated with the same rename or data-copy step.
+
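For MySQL-like targets, the swap step above can be done with an atomic multi-table rename. A minimal sketch with placeholder table names, assuming `staging` already holds the fully prepared result:

```python
def swap_in(cursor):
    # In MySQL, RENAME TABLE renames both tables in a single atomic operation,
    # so readers only ever see the old target or the fully prepared new one.
    cursor.execute("RENAME TABLE target TO target_old, staging TO target")
    cursor.execute("DROP TABLE target_old")
```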
+ ## Limitations
+
+ There are two main limitations: (1) data has to be written to disk; (2) CSV itself.
+
+ In some scenarios writing to disk adds some overhead, but for an offline system the impact should be negligible. Do pay attention to file cleanup, otherwise the disk will eventually fill up. The bigger problem is that export and import are not completely decoupled: they must run on the same machine and agree on the same file path. This bit of shared state limits horizontal scaling to some extent (note that only the synchronization of a single table has to stay on one machine; multiple tables can still be scaled out across machines).
+
+ Using CSV as the data exchange format is a compromise, with both advantages and disadvantages. The CSV format is also discussed in [this article](http://liyangliang.me/posts/2019/03/data-encoding/). To summarize the shortcomings:
+
+ - It cannot distinguish numbers from strings that happen to consist of digits. This can be solved with an extra schema, for example exporting a schema alongside the data, or deciding based on the target database's schema at import time.
+ - It does not support binary data.
+ - There may be escaping problems.
+ - It cannot distinguish empty strings from null values (`None`, `NULL`); one solution is to use a special marker for nulls, such as `\N`.
+
+ Overall, CSV should cover more than 90% of the use cases.
+
+ Using Kafka as a data exchange bus would remove these limitations, but it also increases system complexity. Choose according to the actual situation.
@@ -0,0 +1,171 @@
+ from recurvedata.pigeon.connector import get_connector
+ from recurvedata.pigeon.handler.csv_handler import CSVFileHandler, create_csv_file_handler_factory
+
+
+ def new_to_csv_dumper(
+     dbtype, connection=None, database=None, connector=None, filename=None, transformer=None, hive=False, **dumper_kwargs
+ ):
+     if connector is None:
+         connector = get_connector(dbtype, connection=connection, database=database)
+
+     handler_factory_params = ["merge_files", "encoding", "write_header"] + CSVFileHandler.ERROR_HANDLE_PARAMS
+     factory_options = dict(filename=filename, hive=hive, transformer=transformer)
+     for p in handler_factory_params:
+         if p in dumper_kwargs:
+             factory_options[p] = dumper_kwargs.pop(p)
+     factory = create_csv_file_handler_factory(**factory_options)
+     dumper_kwargs.setdefault("handler_factories", [factory])
+
+     row_factory = dumper_kwargs.pop("row_factory", None)
+
+     if dbtype == "cassandra":
+         from .cass import CassandraDumper
+
+         dumper = CassandraDumper(connector, **dumper_kwargs)
+     else:
+         from .dbapi import DBAPIDumper
+
+         dumper = DBAPIDumper(connector, **dumper_kwargs)
+
+     if row_factory is not None:
+         dumper.row_factory = row_factory
+
+     return dumper
+
+
+ def new_tidb_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("tidb", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_tidb_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("tidb", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_mysql_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("mysql", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_mysql_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("mysql", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_redshift_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("redshift", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_redshift_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("redshift", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_impala_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("impala", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_impala_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("impala", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_phoenix_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("phoenix", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_phoenix_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("phoenix", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_clickhouse_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("clickhouse", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_clickhouse_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("clickhouse", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_cassandra_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("cassandra", connection, database, filename, transformer, hive=False, **dumper_kwargs)
+
+
+ def new_cassandra_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
+     return new_to_csv_dumper("cassandra", connection, database, filename, transformer, hive=True, **dumper_kwargs)
+
+
+ def new_elasticsearch_to_csv_dumper(hosts=None, filename=None, transformer=None, **dumper_kwargs):
+     from recurvedata.pigeon.dumper.es import ElasticSearchDumper
+
+     factory = create_csv_file_handler_factory(filename=filename, transformer=transformer)
+     dumper_kwargs.setdefault("handler_factories", [factory])
+     dumper = ElasticSearchDumper(connector=get_connector("es", host=hosts), **dumper_kwargs)
+     return dumper
+
+
+ def new_elasticsearch_to_hive_dumper(hosts=None, filename=None, transformer=None, **dumper_kwargs):
+     from recurvedata.pigeon.dumper.es import ElasticSearchDumper
+
+     factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=True)
+     dumper_kwargs.setdefault("handler_factories", [factory])
+     dumper = ElasticSearchDumper(connector=get_connector("es", host=hosts), **dumper_kwargs)
+     return dumper
+
+
+ def new_ftp_dumper(conf=None, **dumper_kwargs):
+     from recurvedata.pigeon.dumper.ftp import FtpDumper
+
+     dumper = FtpDumper(connector=get_connector("ftp", conf=conf), **dumper_kwargs)
+     return dumper
+
+
+ def new_mongodb_to_csv_dumper(connection=None, filename=None, transformer=None, **dumper_kwargs):
+     from recurvedata.pigeon.dumper.mongodb import MongoDBDumper
+
+     factory = create_csv_file_handler_factory(filename=filename, transformer=transformer)
+     dumper_kwargs.setdefault("handler_factories", [factory])
+     dumper = MongoDBDumper(connector=get_connector("mongodb", connection=connection), **dumper_kwargs)
+     return dumper
+
+
+ def new_mongodb_to_hive_dumper(connection=None, filename=None, transformer=None, **dumper_kwargs):
+     from recurvedata.pigeon.dumper.mongodb import MongoDBDumper
+
+     factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=True)
+     dumper_kwargs.setdefault("handler_factories", [factory])
+     dumper = MongoDBDumper(connector=get_connector("mongodb", connection=connection), **dumper_kwargs)
+     return dumper
+
+
+ def new_google_bigquery_to_csv_dumper(
+     filename=None,
+     transformer=None,
+     key_path=None,
+     key_dict=None,
+     proxies=None,
+     location=None,
+     hive=False,
+     **dumper_kwargs,
+ ):
+     from recurvedata.pigeon.connector import new_google_bigquery_connector
+     from recurvedata.pigeon.dumper.dbapi import DBAPIDumper
+
+     connector = new_google_bigquery_connector(key_path=key_path, key_dict=key_dict, proxies=proxies, location=location)
+     factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=hive, encoding="utf-8")
+     dumper_kwargs.setdefault("handler_factories", [factory])
+     dumper = DBAPIDumper(connector, **dumper_kwargs)
+     row_factory = dumper_kwargs.pop("row_factory", None)
+     if row_factory is not None:
+         dumper.row_factory = row_factory
+     return dumper
+
+
+ def new_clickhouse_native_to_csv_dumper(
+     connection=None, database=None, filename=None, transformer=None, **dumper_kwargs
+ ):
+     return new_to_csv_dumper(
+         "clickhouse_native", connection, database, filename, transformer, hive=False, **dumper_kwargs
+     )
+
+
+ def new_clickhouse_native_to_hive_dumper(
+     connection=None, database=None, filename=None, transformer=None, **dumper_kwargs
+ ):
+     return new_to_csv_dumper(
+         "clickhouse_native", connection, database, filename, transformer, hive=True, **dumper_kwargs
+     )
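
The hunk above adds the dumper factory module. Below is a hypothetical construction sketch using only the keyword signature of `new_to_csv_dumper` shown above; the shape of the `connection` payload and the method used to actually run the dumper are not part of this hunk, so both are assumptions.

```python
from recurvedata.pigeon.dumper import new_to_csv_dumper

# Keyword arguments are used to avoid relying on positional order. The connection dict
# shape is an assumption; it is resolved by get_connector, defined elsewhere in the package.
dumper = new_to_csv_dumper(
    "mysql",
    connection={"host": "mysql.example", "port": 3306, "user": "etl", "password": "..."},
    database="analytics",
    filename="/tmp/orders.csv",
)
# Executing the dump is the job of DBAPIDumper (recurvedata/pigeon/dumper/dbapi.py),
# which is not included in this hunk.
```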