recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/loader/csv_to_starrocks.py
@@ -0,0 +1,233 @@
+ import time
+ from typing import TYPE_CHECKING, Any, List, Optional
+
+ from recurvedata.pigeon import const
+ from recurvedata.pigeon.loader.csv_to_mysql import CSVToMySQLLoader
+ from recurvedata.pigeon.utils import md5hash, randomized_suffix
+ from recurvedata.pigeon.utils.sql import bak_table_of, reconcile_table_of, staging_table_of
+
+ if TYPE_CHECKING:
+     from recurvedata.pigeon.connector.starrocks import StarRocksConnector
+
+ allowed_modes = (const.LOAD_OVERWRITE, const.LOAD_MERGE, const.LOAD_APPEND)
+
+
+ class CSVToStarRocksLoader(CSVToMySQLLoader):
+     def __init__(
+         self,
+         database: str,
+         table: str,
+         filename: str,
+         connector: Optional["StarRocksConnector"] = None,
+         create_table_ddl: Optional[str] = None,
+         mode: str = const.LOAD_OVERWRITE,
+         primary_keys: Optional[List[str]] = None,
+         skiprows: int = 0,
+         columns: Optional[List[str]] = None,
+         using_insert: bool = False,
+         insert_batch_size: int = 1000,
+         insert_concurrency: int = 1,
+         delete_file: bool = False,
+         pre_queries: Optional[List[str]] = None,
+         post_queries: Optional[List[str]] = None,
+         load_strict_mode: bool = False,
+         *args: Any,
+         **kwargs: Any,
+     ):
+         if not connector:
+             raise ValueError(f"connector is required for {self.__class__.__name__}")
+         self.load_strict_mode: bool = load_strict_mode
+         connector.load_strict_mode = load_strict_mode
+         self.logger.info(f"load_strict_mode: {load_strict_mode}")
+         # The same incoming filename yields the same intermediate table names,
+         # so intermediate tables left over from a previous failed attempt are cleaned up on retry.
+         table_suffix: str = md5hash(filename)[:6] if filename is not None else randomized_suffix()
+         self.__staging_table: str = staging_table_of(table) + "_" + table_suffix
+         self.__reconcile_table: str = reconcile_table_of(table) + "_" + table_suffix
+         self.__bak_table: str = bak_table_of(table) + "_" + table_suffix
+         if any(
+             [
+                 len(self.__staging_table) > 64,
+                 len(self.__reconcile_table) > 64,
+                 len(self.__bak_table) > 64,
+             ]
+         ):
+             self.logger.error(
+                 f"table name {self.__staging_table}'s length: {len(self.__staging_table)}\n"
+                 f"table name {self.__reconcile_table}'s length: {len(self.__reconcile_table)}\n"
+                 f"table name {self.__bak_table}'s length: {len(self.__bak_table)}\n"
+             )
+             raise ValueError("length of intermediate table name is greater than 64!")
+         super().__init__(
+             database=database,
+             table=table,
+             filename=filename,
+             connector=connector,
+             create_table_ddl=create_table_ddl,
+             mode=mode,
+             primary_keys=primary_keys,
+             skiprows=skiprows,
+             columns=columns,
+             using_insert=using_insert,
+             insert_batch_size=insert_batch_size,
+             insert_concurrency=insert_concurrency,
+             delete_file=delete_file,
+             pre_queries=pre_queries,
+             post_queries=post_queries,
+             *args,
+             **kwargs,
+         )
+
+     @property
+     def staging_table(self) -> str:
+         """
+         Overrides the parent property to return a staging table name with a per-file suffix.
+         """
+         return self.__staging_table
+
+     def _merge_into_target_table(self) -> None:
+         queries = []
+         if self.mode == const.LOAD_MERGE:
+             queries.extend(self._ingest_by_merging())
+         elif self.mode == const.LOAD_OVERWRITE:
+             # bak_table = bak_table_of(self.table)
+             bak_table = self.__bak_table
+             queries.extend(
+                 [
+                     f"DROP TABLE IF EXISTS {bak_table}",
+                     f"ALTER TABLE {self.table} RENAME {bak_table}",
+                     f"ALTER TABLE {self.staging_table} RENAME {self.table}",
+                     f"DROP TABLE IF EXISTS {bak_table}",
+                 ]
+             )
+         else:
+             # Special handling for `APPEND` mode, because an occasional error happens:
+             # ================================== ERROR MSG START ======================================
+             # pymysql.err.ProgrammingError: (1064, 'Unexpected exception: Failed to drop table {table_name}.
+             # msg: There are still some transactions in the COMMITTED state waiting to be completed.
+             # The table {table_name} cannot be dropped. If you want to forcibly drop(cannot be recovered),
+             # please use "DROP TABLE <table> FORCE".')
+             # ================================== ERROR MSG END ========================================
+             # The workaround: run the INSERT statement first and block until it finishes.
+             queries.append(f"INSERT INTO {self.table} SELECT * FROM {self.staging_table}")
+             self.connector.execute(self.pre_queries + queries, autocommit=True, commit_on_close=False)
+
+             queries.clear()
+             queries.append(f"DROP TABLE {self.staging_table}")
+             self.connector.execute(queries + self.post_queries, autocommit=True, commit_on_close=False)
+             return
+
+         queries = self.pre_queries + queries + self.post_queries
+         self.logger.info("running MySQL queries within a transaction")
+         self.connector.execute(queries, autocommit=False, commit_on_close=True)
+
+     def _ingest_by_merging(self) -> List[str]:
+         """Merge with deduplication based on primary keys using StarRocks-compatible syntax"""
+         # First, deduplicate the staging table based on primary keys using a window function
+         pk_columns = ", ".join(self.primary_keys)
+
+         # Get all columns from the staging table (excluding the rn column we'll add)
+         cols = self.connector.get_columns(self.staging_table)
+         cols_str = ", ".join(self.connector.quote_identifier(x) for x in cols)
+
+         # Create a temporary table with deduplicated data
+         tmp_table = f"{self.staging_table}_dedup"
+         dedup_sql = f"""
+         DROP TABLE IF EXISTS {tmp_table};
+         CREATE TABLE {tmp_table} LIKE {self.staging_table};
+         INSERT INTO {tmp_table}
+         SELECT {cols_str} FROM (
+             SELECT *, ROW_NUMBER() OVER(PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
+             FROM {self.staging_table}
+         ) t
+         WHERE rn = 1;
+         """
+
+         # Replace the staging table with the deduplicated data
+         replace_sql = f"""
+         DROP TABLE {self.staging_table};
+         ALTER TABLE {tmp_table} RENAME {self.staging_table};
+         """
+
+         # Simple merge: back up the target table, then merge and deduplicate in one step
+         bak = self.__bak_table
+         table = self.connector.quote_identifier(self.table)
+         staging = self.connector.quote_identifier(self.staging_table)
+         bak = self.connector.quote_identifier(bak)
+
+         # Simple and efficient merge: backup + merge + deduplicate in one operation
+         merge_sql = f"""
+         -- Back up the target table
+         DROP TABLE IF EXISTS {bak};
+         ALTER TABLE {table} RENAME {bak};
+
+         -- Create the new target table and insert the deduplicated merged data in one step
+         CREATE TABLE {table} AS
+         SELECT {cols_str} FROM (
+             SELECT *, ROW_NUMBER() OVER(PARTITION BY {pk_columns} ORDER BY {pk_columns}) AS rn
+             FROM (
+                 SELECT * FROM {bak}
+                 UNION ALL
+                 SELECT * FROM {staging}
+             ) combined
+         ) t WHERE rn = 1;
+
+         -- Clean up
+         DROP TABLE {bak};
+         DROP TABLE {staging};
+         """
+
+         return [dedup_sql, replace_sql, merge_sql]
+
+     def execute(self) -> None:
+         """
+         Overrides the parent method to add exception handling around the load.
+         """
+         self.before_execute()
+         try:
+             self.execute_impl()
+         except Exception as e:
+             self.handle_exception()
+             raise e
+         self.after_execute()
+
+     def _prepare_staging_table(self):
+         queries = """
+         DROP TABLE IF EXISTS {staging};
+         CREATE TABLE {staging} LIKE {table};
+         """.format(
+             staging=self.staging_table, table=self.table
+         )
+         self.connector.execute(queries, autocommit=True)
+         time.sleep(5)  # wait for the table to be created and become visible
+
+     def handle_exception(self) -> None:
+         """
+         Ensure all intermediate tables are cleaned up safely after an exception is caught.
+         """
+         qry_exists_sql = """
+         SELECT 1 FROM information_schema.tables
+         WHERE table_schema = '{database}' AND table_name = '{table}';
+         """
+         is_table_exists = self.connector.fetchall(qry_exists_sql.format(database=self.database, table=self.table))
+         is_bak_exists = self.connector.fetchall(qry_exists_sql.format(database=self.database, table=self.__bak_table))
+         if is_table_exists:
+             # The target table exists: drop the intermediate tables directly.
+             queries = [
+                 f"DROP TABLE IF EXISTS {self.__bak_table}",
+                 f"DROP TABLE IF EXISTS {self.__staging_table}",
+                 f"DROP TABLE IF EXISTS {self.__reconcile_table}",
+             ]
+         elif is_bak_exists:
+             # Roll back from bak_table
+             queries = [
+                 f"ALTER TABLE {self.__bak_table} RENAME {self.table}",
+                 f"DROP TABLE IF EXISTS {self.__staging_table}",
+                 f"DROP TABLE IF EXISTS {self.__reconcile_table}",
+             ]
+         else:
+             queries = [
+                 f"DROP TABLE IF EXISTS {self.__staging_table}",
+                 f"DROP TABLE IF EXISTS {self.__reconcile_table}",
+             ]
+         self.connector.execute(queries, autocommit=False, commit_on_close=True)
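For orientation, here is a minimal usage sketch of the loader added above. The import paths follow the file layout in this diff; the StarRocksConnector constructor arguments and the CSV path are illustrative assumptions, not the connector's documented signature.

    from recurvedata.pigeon import const
    from recurvedata.pigeon.connector.starrocks import StarRocksConnector
    from recurvedata.pigeon.loader.csv_to_starrocks import CSVToStarRocksLoader

    # Hypothetical connector setup; the real StarRocksConnector signature may differ.
    connector = StarRocksConnector(host="127.0.0.1", port=9030, user="root", password="", database="demo")

    loader = CSVToStarRocksLoader(
        database="demo",
        table="orders",
        filename="/tmp/orders.csv",   # intermediate table names are derived from this file's md5
        connector=connector,
        mode=const.LOAD_MERGE,        # one of LOAD_OVERWRITE / LOAD_MERGE / LOAD_APPEND
        primary_keys=["order_id"],    # used by LOAD_MERGE for ROW_NUMBER() deduplication
    )
    loader.execute()  # on failure, handle_exception() drops or rolls back the staging/bak/reconcile tables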
@@ -0,0 +1,116 @@
1
+ import datetime
2
+ import json
3
+
4
+ import cytoolz as toolz
5
+
6
+ from recurvedata.pigeon.schema import Schema
7
+
8
+
9
+ class JSONEncoder(json.JSONEncoder):
10
+ def default(self, o):
11
+ if isinstance(o, datetime.date):
12
+ return o.isoformat()
13
+ if isinstance(o, datetime.timedelta):
14
+ return str(o)
15
+ if isinstance(o, Schema):
16
+ return o.to_list()
17
+ return super().default(o)
18
+
19
+
20
+ class Meta(object):
21
+ def to_dict(self):
22
+ raise NotImplementedError()
23
+
24
+ def to_json(self, **kwargs):
25
+ params = toolz.merge({"sort_keys": True, "ensure_ascii": False, "cls": JSONEncoder}, kwargs)
26
+ return json.dumps(self.to_dict(), **params)
27
+
28
+
29
+ class HandlerMeta(Meta):
30
+ def __init__(self):
31
+ self.reset()
32
+
33
+ def reset(self):
34
+ self.num_input_rows = 0
35
+ self.num_output_rows = 0
36
+ self.num_error_rows = 0
37
+ self.error_log_size = 0 # 当前打印的报错 row 字符串的字符数
38
+
39
+ def to_dict(self):
40
+ return {
41
+ "num_input_rows": self.num_input_rows,
42
+ "num_output_rows": self.num_output_rows,
43
+ "num_error_rows": self.num_error_rows,
44
+ "error_log_size": self.error_log_size,
45
+ }
46
+
47
+
48
+ class HandlerFactoryMeta(HandlerMeta):
49
+ def __init__(self, name):
50
+ self.name = name
51
+ super().__init__()
52
+
53
+ def update(self, handler_meta):
54
+ self.num_input_rows += handler_meta.num_input_rows
55
+ self.num_output_rows += handler_meta.num_output_rows
56
+ self.num_error_rows += handler_meta.num_error_rows
57
+ self.error_log_size += handler_meta.error_log_size
58
+
59
+ def to_dict(self):
60
+ d = super().to_dict()
61
+ d["name"] = self.name
62
+ return d
63
+
64
+
65
+ class DumperWorkerMeta(Meta):
66
+ def __init__(self):
67
+ self.num_dumped_rows = 0
68
+ self.schema = None
69
+ self.handlers_meta = None
70
+
71
+ def to_dict(self):
72
+ return {
73
+ "num_dumped_rows": self.num_dumped_rows,
74
+ "schema": self.schema,
75
+ "handlers_meta": [x.to_dict() for x in self.handlers_meta],
76
+ }
77
+
78
+
79
+ class DumperMeta(Meta):
80
+ def __init__(self, context=None):
81
+ self.time_start = None
82
+ self.time_finish = None
83
+ self.num_dumped_rows = 0
84
+ self.context = context
85
+ self.schema = None
86
+ self.handlers_meta = []
87
+
88
+ def mark_start(self):
89
+ self.time_start = datetime.datetime.now()
90
+
91
+ def mark_finish(self):
92
+ self.time_finish = datetime.datetime.now()
93
+
94
+ @property
95
+ def rows_per_second(self):
96
+ return self.num_dumped_rows / (self.time_finish - self.time_start).total_seconds()
97
+
98
+ @property
99
+ def duration(self):
100
+ if not self.time_start:
101
+ return None
102
+ if not self.time_finish:
103
+ return datetime.datetime.now() - self.time_start
104
+ return self.time_finish - self.time_start
105
+
106
+ def to_dict(self):
107
+ return {
108
+ "time_start": self.time_start,
109
+ "time_finish": self.time_finish,
110
+ "time_duration": self.duration,
111
+ "num_dumped_rows": self.num_dumped_rows,
112
+ "rows_per_second": self.rows_per_second,
113
+ "context": self.context,
114
+ "schema": self.schema,
115
+ "handlers_meta": [x.to_dict() for x in self.handlers_meta],
116
+ }
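A brief sketch of how these meta objects compose, assuming nothing beyond the classes above; the counters, context payload, and handler name are made-up values, and JSONEncoder takes care of the datetime/timedelta fields.

    import time

    from recurvedata.pigeon.meta import DumperMeta, HandlerFactoryMeta, HandlerMeta

    dumper_meta = DumperMeta(context={"job": "daily_orders"})  # illustrative context
    dumper_meta.mark_start()

    handler_meta = HandlerMeta()
    handler_meta.num_input_rows = 100   # made-up counters for the sketch
    handler_meta.num_output_rows = 98
    handler_meta.num_error_rows = 2

    factory_meta = HandlerFactoryMeta(name="csv_handler")
    factory_meta.update(handler_meta)   # aggregates per-handler counters

    time.sleep(0.1)
    dumper_meta.num_dumped_rows = 100
    dumper_meta.handlers_meta = [factory_meta]
    dumper_meta.mark_finish()

    # JSONEncoder serializes the datetime/timedelta fields (and any Schema instances).
    print(dumper_meta.to_json(indent=2))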
recurvedata/pigeon/row_factory.py
@@ -0,0 +1,42 @@
+ import logging
+ from collections import OrderedDict
+
+ from recurvedata.pigeon.utils.keyed_tuple import KeyedTuple
+
+ logger = logging.getLogger(__name__)
+
+
+ def tuple_factory(colnames, row):
+     """Returns each row as a tuple"""
+     return row
+
+
+ def keyed_tuple_factory(colnames, row):
+     return KeyedTuple(row, colnames)
+
+
+ def dict_factory(colnames, row):
+     return dict(zip(colnames, row))
+
+
+ def ordered_dict_factory(colnames, row):
+     return OrderedDict(zip(colnames, row))
+
+
+ def get_row_keys(row):
+     if isinstance(row, dict):
+         # created by dict_factory or ordered_dict_factory
+         return list(row.keys())
+     if hasattr(row, "_fields"):
+         # created by keyed_tuple_factory
+         return list(row._fields)
+     else:
+         # created by tuple_factory, which does not know the keys
+         return None
+
+
+ def get_row_values(row):
+     if isinstance(row, dict):
+         # created by dict_factory or ordered_dict_factory
+         return list(row.values())
+     return list(row)
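A minimal sketch of how these cursor-style row factories might be applied; the column names and row values are invented for illustration.

    from recurvedata.pigeon.row_factory import dict_factory, get_row_keys, get_row_values, tuple_factory

    colnames = ["id", "name"]      # illustrative column names and row
    raw_row = (1, "alice")

    as_tuple = tuple_factory(colnames, raw_row)  # (1, 'alice'); column names are discarded
    as_dict = dict_factory(colnames, raw_row)    # {'id': 1, 'name': 'alice'}

    print(get_row_keys(as_tuple))   # None -- plain tuples carry no column names
    print(get_row_keys(as_dict))    # ['id', 'name']
    print(get_row_values(as_dict))  # [1, 'alice']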
recurvedata/pigeon/schema/__init__.py
@@ -0,0 +1,124 @@
+ import json
+
+ import dateutil.parser
+
+ from recurvedata.pigeon.schema import types
+
+
+ class Field(object):
+     def __init__(self, name, type, size=None, comment=None, extra=None):
+         self.name = name
+         self.type = type
+         self.size = size
+         self.comment = comment
+         self.extra = extra
+
+         self._cast_func = {
+             types.INT8: self._cast_to_int,
+             types.INT16: self._cast_to_int,
+             types.INT32: self._cast_to_int,
+             types.INT64: self._cast_to_int,
+             types.FLOAT32: self._cast_to_float,
+             types.FLOAT64: self._cast_to_float,
+             types.BOOLEAN: self._cast_to_boolean,
+             types.DATETIME: self._cast_to_datetime,
+             types.DATE: self._cast_to_date,
+             types.JSON: self._cast_to_json,
+         }.get(self.type, self._cast_pass)
+
+     def cast(self, value):
+         if value is None:
+             return None
+         if value == "NULL":
+             return None
+         return self._cast_func(value)
+
+     def _cast_pass(self, value):
+         return value
+
+     def _cast_to_int(self, value: str):
+         if value == "":
+             return 0
+         return int(value)
+
+     def _cast_to_float(self, value: str):
+         if value == "":
+             return 0.0
+         return float(value)
+
+     def _cast_to_boolean(self, value: str):
+         if value.lower() in ("", "0", "false"):
+             return False
+         return True
+
+     def _cast_to_datetime(self, value: str):
+         if value == "":
+             return None
+         return dateutil.parser.parse(value)
+
+     def _cast_to_date(self, value: str):
+         if value == "":
+             return None
+         return dateutil.parser.parse(value).date()
+
+     def _cast_to_json(self, value: str):
+         if value in ("",):
+             # An empty string should not occur normally; it was most likely read from a CSV file, so treat it as None.
+             return None
+         return json.loads(value)
+
+     def to_dict(self):
+         return {
+             "name": self.name,
+             "type": self.type,
+             "size": self.size,
+             "comment": self.comment,
+             "extra": self.extra,
+         }
+
+     def __str__(self):
+         return f'<Field ("{self.name}", "{self.type}")>'
+
+     def __repr__(self):
+         return f'<Field ("{self.name}", "{self.type}")>'
+
+
+ class Schema(object):
+     def __init__(self, fields=None):
+         self.fields = fields or []
+
+     def add_field(self, field):
+         # TODO(liyangliang): clean field names, handle special characters and duplications
+         self.fields.append(field)
+
+     def add_field_by_attrs(self, name, type, size=None, comment=None, extra=None):
+         self.add_field(Field(name, type, size, comment, extra))
+
+     def remove_field(self, name):
+         self.fields = [x for x in self.fields if x.name != name]
+
+     def keep_fields(self, names):
+         self.fields = [x for x in self.fields if x.name in names]
+
+     @property
+     def field_names(self):
+         return [x.name for x in self.fields]
+
+     def __iter__(self):
+         return iter(self.fields)
+
+     def to_list(self):
+         return [x.to_dict() for x in self.fields]
+
+     def to_json(self):
+         return json.dumps(self.to_list())
+
+     def dump(self, filename):
+         with open(filename, "w") as f:
+             json.dump(self.to_list(), f, indent=2)
+
+     @classmethod
+     def load(cls, filename):
+         with open(filename) as f:
+             data = json.load(f)
+         return cls([Field(**item) for item in data])
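The Field/Schema pair above can be exercised roughly as follows; the field names, sample values, and dump path are assumptions for illustration, and the type constants come from the types module shown next.

    from recurvedata.pigeon.schema import Field, Schema, types

    schema = Schema()
    schema.add_field_by_attrs("id", types.INT64)
    schema.add_field_by_attrs("amount", types.FLOAT64)
    schema.add_field_by_attrs("created_at", types.DATETIME)

    print(schema.field_names)                         # ['id', 'amount', 'created_at']
    print(schema.fields[0].cast("42"))                # 42 (an empty string would cast to 0)
    print(schema.fields[2].cast("2024-01-01 08:00"))  # parsed with dateutil

    schema.dump("/tmp/schema.json")                   # hypothetical path
    restored = Schema.load("/tmp/schema.json")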
recurvedata/pigeon/schema/types.py
@@ -0,0 +1,13 @@
+ INT8 = "INT8"  # 1-byte (8-bit) signed integers
+ INT16 = "INT16"  # 2-byte (16-bit) signed integers
+ INT32 = "INT32"  # 4-byte (32-bit) signed integers
+ INT64 = "INT64"  # 8-byte (64-bit) signed integers
+ FLOAT32 = "FLOAT32"  # 4-byte (32-bit) single-precision floating point
+ FLOAT64 = "FLOAT64"  # 8-byte (64-bit) double-precision floating point
+ BOOLEAN = "BOOLEAN"
+
+ DATETIME = "DATETIME"
+ DATE = "DATE"
+
+ STRING = "STRING"
+ JSON = "JSON"