recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,155 @@
1
+ import copy
2
+
3
+ try:
4
+ import arrow
5
+
6
+ from recurvedata.pigeon.dumper.cass import CassandraDumper
7
+ from recurvedata.pigeon.row_factory import ordered_dict_factory
8
+ except ImportError:
9
+ pass
10
+
11
+ from recurvedata.operators.transfer_operator import utils
12
+ from recurvedata.operators.transfer_operator.task import DumpTask
13
+ from recurvedata.utils import extract_dict
14
+
15
+
16
class CassandraDumpTask(DumpTask):
    """Dump task that exports a Cassandra table through ``CassandraDumper``.

    The task reads its options from ``self.config`` / ``self.rendered_config``
    (see :meth:`config_schema` for the accepted keys), optionally derives a list
    of partition values from the schedule time range, and hands everything to
    ``CassandraDumper``.
    """

    # Class-level metadata; how the framework consumes these is not visible in
    # this file (presumably task registration / worker dependency install).
    enabled = False
    worker_install_require = ["pigeon[cassandra]"]
    ds_name_fields = ("data_source_name",)

    def determine_partitions(self):
        """Return the partition values to dump, or ``None`` if not partitioned.

        Resolution order:

        1. An explicit ``partitions`` config value is returned unchanged.
        2. ``None`` when ``incremental_by_time`` is off or the DAG runs once.
        3. Otherwise, datetimes spanning the schedule time range at
           ``time_granularity`` steps, with the last generated point dropped.

        Returns:
            The configured ``partitions`` value, a list of ``datetime``
            objects, or ``None``.
        """
        if self.config.partitions:
            return self.config.partitions

        if not self.config.incremental_by_time:
            return None

        if self.dag.is_once:
            return None

        start_date, end_date = self.get_schedule_time_range()
        # ``Arrow.range`` yields lazily in current arrow releases (older ones
        # returned a list). Materialize it so the slice below works on both;
        # slicing a generator raises TypeError.
        points = list(arrow.Arrow.range(self.config.time_granularity, start_date, end_date))
        # Drop the final point generated by the range.
        return [point.datetime for point in points[:-1]]

    def execute_impl(self, *args, **kwargs):
        """Build a ``CassandraDumper`` from the rendered config and run it.

        Returns:
            Whatever ``CassandraDumper.execute()`` returns.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        hf = self.create_handler_factory()
        dump_options = extract_dict(
            self.rendered_config, keys=["table", "columns", "where", "partition_column", "concurrency"]
        )
        dump_options.update({"connector": ds.connector, "handler_factories": [hf]})
        partitions = self.determine_partitions()
        # Only pass ``partitions`` when there is something to pass.
        if partitions:
            dump_options.update({"partitions": partitions})
        dumper = CassandraDumper(**dump_options)
        # The row factory is applied unconditionally; the guard below was
        # left disabled by the original author.
        # if self.has_custom_transformer():
        dumper.row_factory = ordered_dict_factory
        return dumper.execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON-schema style dict describing this task's config form."""
        # dss = cls.get_connection_names_by_type('cassandra')
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": "Data Source",
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": [
                            "cassandra",
                        ],
                    },
                    # 'default': cls.first_or_default(dss, ''),
                },
                "table": {
                    "type": "string",
                    "title": "Table Name",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "columns": {
                    "type": "string",
                    "title": "Columns",
                    "description": "要导出的列,用 `,` 分隔;默认导出所有列(*)",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "where": {
                    "type": "string",
                    "title": "Where Clause",
                    "description": "Where 条件",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "partition_column": {
                    "type": "string",
                    "title": "Partition Column",
                    "description": "分区键,通常名为 date",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "partitions": {
                    "type": "string",
                    "title": "Partitions",
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "concurrency": {
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": "Concurrency",
                    "default": 1,
                    "description": "并发数,1~20",
                    "minimum": 1,
                    "maximum": 20,
                },
                # Deep copy so this schema never shares state with the
                # module-level TRANSFORM definition.
                "transform": copy.deepcopy(utils.TRANSFORM),
                "incremental_by_time": {
                    "type": "boolean",
                    "title": "Incremental By Time",
                    "default": False,
                    "description": "是否按时间进行增量同步,这个时间必须是分区键",
                    "ui:widget": "BaseCheckbox",
                    "ui:options": {
                        "label": "Incremental By Time",
                    },
                },
                "time_granularity": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": "Time Granularity",
                    "default": "day",
                    "description": "分区键的时间粒度,用于生成分区值",
                    "enum": ["day", "hour"],
                    "enumNames": ["day", "hour"],
                },
                "time_auto_round": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "boolean",
                    "title": "Round Time Resolution",
                    "default": True,
                    "description": "是否把数据时间范围 round 到合适的粒度。比如每天 01:23 同步上一个自然日的数据,"
                    "则运行时间是 01:23,数据范围是 [T-1 00:00, T 00:00);否则数据范围是 [T-1 01:23, T 01:23)。"
                    "开启后,每天运行的任务,数据范围会 round 到 0 点,即自然日;"
                    "每周运行的任务,会 round 到周一 0 点;"
                    "每月运行的任务,会 round 到每月 1 日 0 点",
                },
            },
            # NOTE: the vue-json-schema library used by the frontend has a bug:
            # enum fields must be listed in "required"...
            "required": ["data_source_name", "table", "time_granularity"],
            # Form linkage: the other two inputs only need to be shown when
            # incremental_by_time is True.
        }
@@ -0,0 +1,209 @@
1
+ import copy
2
+
3
+ import jsonschema
4
+
5
+ from recurvedata.connectors.service import list_sql_operator_types
6
+ from recurvedata.core.translation import _l
7
+ from recurvedata.operators.transfer_operator import utils
8
+ from recurvedata.operators.transfer_operator.task import DumpTask
9
+ from recurvedata.pigeon.dumper.dbapi import DBAPIDumper
10
+ from recurvedata.pigeon.row_factory import ordered_dict_factory
11
+ from recurvedata.pigeon.utils.sql import apply_where_safely
12
+ from recurvedata.utils import date_time, extract_dict
13
+
14
+
15
class DBAPIDumpTask(DumpTask):
    """Dump task that extracts rows from a SQL data source via ``DBAPIDumper``.

    Either a ``table`` or a custom ``sql`` query must be configured (enforced
    by :meth:`validate`). When ``incremental_by_time`` is enabled and the DAG
    is not a one-off run, the query is restricted to the schedule time range
    on ``time_column``.
    """

    ds_name_fields = ("data_source_name",)
    worker_install_require = ["pigeon"]

    @property
    def time_column_tz(self):
        """Timezone name of the time column; falls back to ``"UTC"``."""
        return self.config.get("time_column_tz", "UTC")

    def determine_time_range(self):
        """Return the ``(start, end)`` bounds used for the incremental filter.

        Returns:
            A pair of ``date`` objects when ``time_column_type`` is ``"date"``.
            Otherwise a pair of datetimes converted to ``time_column_tz`` with
            ``tzinfo`` removed.
        """
        start_date, end_date = self.get_schedule_time_range()
        if self.config.time_column_type == "date":
            return start_date.date(), end_date.date()

        # convert timezone
        start_date = date_time.astimezone(start_date, tz=self.time_column_tz)
        end_date = date_time.astimezone(end_date, tz=self.time_column_tz)

        # Strip tzinfo so the values render as naive timestamps in the query.
        return start_date.replace(tzinfo=None), end_date.replace(tzinfo=None)

    def derive_sql_query(self, connector, base_query: str):
        """Build the final query, adding a leading comment and optional time filter.

        Args:
            connector: Connector providing ``add_leading_comment``,
                ``quote_identifier`` and ``is_phoenix``.
            base_query: User-supplied SQL. May be empty or ``None``, in which
                case the incremental path falls back to
                ``SELECT * FROM <table>``.

        Returns:
            The annotated query. On the incremental path it is additionally
            passed through ``apply_where_safely`` with a
            ``time_column >= start AND time_column < end`` condition.
        """
        # ``rendered_config.get("sql", "")`` yields None when the key is
        # present with a null value; coalesce so ``.strip()`` cannot raise.
        base_query = (base_query or "").strip().rstrip(";")
        comment = self.get_query_comment_conf()
        if not self.config.incremental_by_time or self.dag.is_once:
            # NOTE(review): with a table-only config ``base_query`` is empty
            # here, so the comment is attached to an empty string; confirm
            # the dumper handles that as intended.
            annotated_query = connector.add_leading_comment(base_query, comment)
            return annotated_query

        if not base_query:
            base_query = f"SELECT * FROM {connector.quote_identifier(self.config.table)}"
        annotated_query = connector.add_leading_comment(base_query, comment)

        start, end = self.determine_time_range()
        col = connector.quote_identifier(self.config.time_column)

        # Phoenix gets explicit TIMESTAMP literals; other connectors get
        # plain quoted values.
        if connector.is_phoenix():
            where = f"{col} >= TIMESTAMP '{start}' AND {col} < TIMESTAMP '{end}'"
        else:
            where = f"{col} >= '{start}' AND {col} < '{end}'"
        return apply_where_safely(annotated_query, where)

    def execute_impl(self, *args, **kwargs):
        """Build a ``DBAPIDumper`` from the rendered config and run it.

        Returns:
            Whatever ``DBAPIDumper.execute()`` returns.
        """
        ds = self.must_get_connection_by_name(self.config["data_source_name"])
        hf = self.create_handler_factory()
        dump_options = extract_dict(self.rendered_config, keys=["table", "splitby", "splits", "concurrency"])
        dump_options.update(
            {
                "connector": ds.connector,
                "sql": self.derive_sql_query(ds.connector, self.rendered_config.get("sql", "")),
                "handler_factories": [hf],
            }
        )
        # Without a split column, force a single split and a single thread.
        if not dump_options.get("splitby"):
            dump_options["splits"] = dump_options["concurrency"] = 1
        dumper = DBAPIDumper(**dump_options)
        dumper.row_factory = ordered_dict_factory
        return dumper.execute()

    @classmethod
    def validate(cls, configuration):
        """Validate ``configuration`` and require at least one of table / sql.

        Raises:
            jsonschema.ValidationError: If neither ``table`` nor ``sql`` is set.
        """
        conf = super().validate(configuration)
        if not (conf.get("table") or conf.get("sql")):
            raise jsonschema.ValidationError(message="either table or sql is required", path=("table", "sql"))
        return conf

    @classmethod
    def config_schema(cls):
        """Return the JSON-schema style dict describing this task's config form."""
        return {
            "type": "object",
            "properties": {
                "data_source_name": {
                    "type": "string",
                    "title": _l("Data Source"),
                    "description": _l("Database connection to extract data from"),
                    "ui:field": "ProjectConnectionSelectorField",
                    "ui:options": {
                        "supportTypes": list_sql_operator_types(),
                    },
                },
                "table": {
                    "type": "string",
                    "title": _l("Source Table"),
                    "description": _l(
                        "Table name including schema (if required). Either specify a table name or SQL query."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "sql": {
                    "type": "string",
                    "title": _l("Custom Query"),
                    "description": _l(
                        "Custom SELECT query with Jinja template support. Takes precedence over table name if both are specified."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "code",
                        "lang": "sql",
                        "sqlLang": "sql",
                    },
                },
                "splitby": {
                    "type": "string",
                    "title": _l("Split Column"),
                    "description": _l(
                        "Column to partition data by for parallel processing. Must be indexed, sortable and non-null."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "splits": {
                    "ui:hidden": "{{ !parentFormData.splitby }}",
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Number of Splits"),
                    "default": 1,
                    "minimum": 1,
                    "maximum": 2000,
                },
                "concurrency": {
                    "ui:hidden": "{{ !parentFormData.splitby }}",
                    "type": "number",
                    "ui:options": {"controls": False},
                    "title": _l("Parallel Threads"),
                    "default": 1,
                    "description": _l("Number of concurrent extraction threads (1-20)"),
                    "minimum": 1,
                    "maximum": 20,
                },
                # Deep copy so this schema never shares state with the
                # module-level TRANSFORM definition.
                "transform": copy.deepcopy(utils.TRANSFORM),
                "incremental_by_time": {
                    "type": "boolean",
                    "title": _l("Enable Time-based Incremental Sync"),
                    "default": False,
                    "description": _l("Sync data incrementally based on a time column"),
                    "ui:widget": "BaseCheckbox",
                    "ui:options": {
                        "label": _l("Enable Time-based Incremental Sync"),
                    },
                },
                "time_column": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Name"),
                    "default": "snapshot_time",
                    "description": _l(
                        "Name of the time column used for incremental sync. Column should be indexed for better performance."
                    ),
                    "ui:field": "CodeEditorWithReferencesField",
                    "ui:options": {
                        "type": "plain",
                    },
                },
                "time_column_tz": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Time Column Timezone"),
                    "default": "UTC",
                    "enum": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                    "enumNames": [
                        "UTC",
                        "Asia/Shanghai",
                    ],
                },
                "time_column_type": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "string",
                    "title": _l("Timestamp Format"),
                    "default": "datetime",
                    "enum": ["datetime", "date"],
                    "enumNames": ["datetime", "date"],
                },
                "time_auto_round": {
                    "ui:hidden": "{{!parentFormData.incremental_by_time}}",
                    "type": "boolean",
                    # Wrapped in _l() like every other title in this schema so
                    # it is translatable too.
                    "title": _l("Auto Round Time Range"),
                    "default": True,
                    "description": _l(
                        "Automatically round time ranges to appropriate intervals. For example:\n"
                        "- Daily tasks running at 01:23 will sync previous day's data from 00:00 to 00:00\n"
                        "- Weekly tasks will round to Monday 00:00\n"
                        "- Monthly tasks will round to 1st day 00:00\n"
                        "If disabled, exact execution times will be used (e.g. 01:23 to 01:23)"
                    ),
                },
            },
            "required": [
                "data_source_name",
            ],
        }
@@ -0,0 +1,113 @@
1
+ import copy
2
+ import json
3
+
4
+ try:
5
+ from recurvedata.pigeon.dumper.es import ElasticSearchDumper
6
+ except ImportError:
7
+ pass
8
+
9
+ from recurvedata.core.translation import _l
10
+ from recurvedata.operators.transfer_operator import utils
11
+ from recurvedata.operators.transfer_operator.task import DumpTask
12
+ from recurvedata.utils import extract_dict
13
+
14
+
15
class ElasticSearchDumpTask(DumpTask):
    """Dump task that exports documents from an Elasticsearch index.

    The work itself is delegated to ``ElasticSearchDumper``; this class only
    translates the task configuration into the dumper's keyword arguments and
    describes the configuration form.
    """

    # Config keys that hold a connection name.
    ds_name_fields = ("data_source_name",)
    # NOTE(review): presumably the extra packages a worker must install to run this task - confirm.
    worker_install_require = ["pigeon[elasticsearch]"]

    def execute_impl(self, *args, **kwargs):
        """Build an ``ElasticSearchDumper`` from the task config and run it.

        Returns:
            Whatever ``ElasticSearchDumper.execute()`` returns.
        """
        connection = self.must_get_connection_by_name(self.config["data_source_name"])
        handler_factory = self.create_handler_factory()
        rendered = self.rendered_config

        options = extract_dict(rendered, keys=["index", "doc_type", "query", "fields", "meta_fields"])

        # ``search_kwargs`` is entered as JSON text; it is only decoded and
        # forwarded when the user actually supplied something.
        raw_search_kwargs = rendered.get("search_kwargs")
        if raw_search_kwargs:
            options["search_kwargs"] = json.loads(raw_search_kwargs)

        options["connector"] = connection.connector
        options["handler_factories"] = [handler_factory]
        return ElasticSearchDumper(**options).execute()

    @classmethod
    def config_schema(cls):
        """Return the JSON-schema style description of this task's config form."""

        def plain_editor():
            # Build new dicts on every call so no two properties share
            # the same mutable ``ui:options`` object.
            return {
                "ui:field": "CodeEditorWithReferencesField",
                "ui:options": {
                    "type": "plain",
                },
            }

        data_source_name = {
            "type": "string",
            "title": _l("Elasticsearch Data Source"),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {
                "supportTypes": [
                    "elasticsearch",
                ],
            },
        }
        index = {
            "type": "string",
            "title": _l("Elasticsearch Index"),
            **plain_editor(),
        }
        doc_type = {
            "type": "string",
            "title": _l("Document Type"),
            "default": "_doc",
            "description": _l("The type of documents to query"),
            **plain_editor(),
        }
        query = {
            "type": "string",
            "title": _l("Search Query"),
            "default": "*",
            "description": _l("Elasticsearch query string to filter documents. Supports Jinja templating."),
            **plain_editor(),
        }
        fields = {
            "type": "string",
            "title": _l("Document Fields"),
            "description": _l(
                "Comma-separated list of document fields to retrieve. Leave empty to get all fields."
            ),
            **plain_editor(),
        }
        meta_fields = {
            "type": "array",
            "uniqueItems": True,
            "items": {
                "type": "string",
                "enum": ["_index", "_type", "_id"],
                "enumNames": ["_index", "_type", "_id"],
            },
            "title": _l("Document Metadata Fields"),
            "description": _l("Additional metadata fields to include with each document."),
            "ui:widget": "SelectWidget",
        }
        search_kwargs = {
            "type": "string",
            "title": _l("Advanced Search Options"),
            "description": _l(
                "Additional options for Elasticsearch scan operation in JSON format (e.g. size, scroll)."
            ),
            **plain_editor(),
        }

        properties = {
            "data_source_name": data_source_name,
            "index": index,
            "doc_type": doc_type,
            "query": query,
            "fields": fields,
            "meta_fields": meta_fields,
            "search_kwargs": search_kwargs,
            # Deep copy so the shared TRANSFORM template is never mutated through this schema.
            "transform": copy.deepcopy(utils.TRANSFORM),
        }
        return {
            "type": "object",
            "properties": properties,
            "required": ["data_source_name", "index"],
        }
@@ -0,0 +1,114 @@
1
+ import json
2
+ import logging
3
+ import urllib.parse
4
+
5
+ try:
6
+ import pandas as pd
7
+
8
+ from recurvedata.pigeon.connector.feishu import FeishuBot
9
+ except ImportError:
10
+ pass
11
+
12
+ from recurvedata.core.translation import _l
13
+ from recurvedata.operators.transfer_operator.dump_sheet_task_base import SheetDumpTaskBase
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FeishuSheetDumpTask(SheetDumpTaskBase):
    """Dump task that loads a Feishu spreadsheet or uploaded file into a DataFrame.

    The document is addressed by URL. ``/file/`` links are read with
    ``FeishuBot.read_feishuexcel``; ``/sheets/`` links and ``/wiki/`` links
    that resolve to a sheet are read with ``FeishuBot.read_feishusheet``.
    """

    # NOTE(review): presumably opts this class into automatic operator registration - confirm in the base class.
    _AUTO_REGISTER = True
    # Config keys that hold a connection name.
    ds_name_fields = ("feishu_bot",)
    # NOTE(review): presumably the extra packages a worker must install to run this task - confirm.
    worker_install_require = ["pandas", "numpy", "pigeon[feishu]"]

    custom_config_schema_properties = {
        "feishu_bot": {
            "type": "string",
            "title": _l("Feishu Bot Connection"),
            "description": _l(
                "Select the Feishu bot connection that has permissions to access and read the target sheet"
            ),
            "ui:field": "ProjectConnectionSelectorField",
            "ui:options": {
                "supportTypes": [
                    "feishu_bot",
                ],
            },
        },
        "file_url": {
            "type": "string",
            "title": _l("Feishu Document URL"),
            "description": _l(
                "URL of the Feishu spreadsheet or document to read data from. For multi-sheet documents, the first sheet will be used by default unless a specific sheet ID is included in the URL"
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
        "cell_range": {
            "type": "string",
            "title": _l("Data Range"),
            "description": _l(
                "Optional range of cells to read in A1 notation (e.g. A1:D100). Leave empty to read all data from the sheet"
            ),
            "ui:field": "CodeEditorWithReferencesField",
            "ui:options": {
                "type": "plain",
            },
        },
    }
    custom_config_schema_required = ["feishu_bot", "file_url"]

    def read_origin_df(self) -> "pd.DataFrame":
        """Read the configured Feishu document and return it as a DataFrame.

        ``extra_read_kwargs`` (JSON text in the rendered config) is decoded and
        forwarded to the bot's read call. ``cell_range`` is only applied on the
        sheet path; it is not used for ``/file/`` links.

        Raises:
            ValueError: if the URL is unsupported, or the document exposes no
                sheets when no sheet id was given in the URL.
        """
        conf = self.rendered_config
        extra_read_kwargs = json.loads(conf.extra_read_kwargs) if conf.extra_read_kwargs else {}

        ds = self.must_get_connection_by_name(conf.feishu_bot)
        bot = FeishuBot(**ds.extra)
        file_type, file_token, sheet = self.parse_feishu_sheets_url(conf.file_url)
        logger.info("reading %s", conf.file_url)

        if file_type == "file":
            df = bot.read_feishuexcel(file_token, **extra_read_kwargs)
        else:  # "sheets", or a wiki link that resolved to "sheet"
            if not sheet:
                logger.info("sheet_id not found in url, use the first sheet as default")
                sheet_ids, _ = bot.get_sheet_ids(file_token)
                try:
                    sheet = sheet_ids[0]
                except IndexError as err:
                    # Surface a readable error instead of a bare "list index out of range".
                    raise ValueError(f"no sheets found in {conf.file_url}") from err
            if conf.cell_range:
                sheet = f"{sheet}!{conf.cell_range}"
            df = bot.read_feishusheet(file_token, sheet, **extra_read_kwargs)

        logger.info("original DataFrame shape %s, dtypes:\n%s", df.shape, df.dtypes)
        logger.info("%s", df.head())
        return df

    @staticmethod
    def parse_feishu_sheets_url(url: str) -> "tuple[str, str, str | None]":
        """Split a Feishu document URL into ``(file_type, file_token, sheet)``.

        Args:
            url: link containing ``/file/``, ``/sheets/`` or ``/wiki/`` in its path.

        Returns:
            ``file_type`` is ``"file"``, ``"sheets"``, or ``"sheet"`` for a wiki
            link that resolved to a sheet. ``file_token`` is the last path
            segment, or the resolved object token for wiki links. ``sheet`` is
            the first ``sheet`` query parameter, or ``None`` when absent.

        Raises:
            ValueError: if the path matches none of the supported kinds, or a
                wiki link resolves to something other than a sheet.
        """
        rv = urllib.parse.urlparse(url)
        # Detect the kind on the path only, so a marker that merely appears in
        # the query string cannot misclassify the link; the token is taken
        # from the path as well.
        path = rv.path
        if "/file/" in path:
            file_type = "file"
        elif "/sheets/" in path:
            file_type = "sheets"
        elif "/wiki/" in path:
            file_type = "wiki"
        else:
            raise ValueError(f"unsupported url {url}")

        # Tolerate a trailing slash, which would otherwise yield an empty token.
        file_token = path.rstrip("/").rsplit("/", 1)[-1]
        sheet_values = urllib.parse.parse_qs(rv.query).get("sheet")
        sheet = sheet_values[0] if sheet_values else None

        if file_type == "wiki":
            # Only wiki links need an API round trip, so the bot is created
            # here rather than for every URL.
            # NOTE(review): this bot is built without the task's connection
            # settings - confirm default credentials are intended here.
            bot = FeishuBot()
            obj_type, obj_token = bot.get_wiki_type_token(wiki_token=file_token)
            if obj_type != "sheet":
                raise ValueError(f"unsupported url {url}")
            file_type, file_token = obj_type, obj_token

        return file_type, file_token, sheet