recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Warning: this release of recurvedata-lib has been flagged as potentially problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/dumper/aliyun_sls.py
@@ -0,0 +1,415 @@
+ import os
+ import re
+ import time
+ from collections import OrderedDict
+ from datetime import datetime, timedelta, timezone
+ from functools import wraps
+ from typing import Any, Callable, Dict, Generator, List, Optional
+
+ from dateutil import parser as date_parser
+
+ try:
+     from aliyun.log import GetHistogramsRequest, GetLogsRequest, LogClient
+ except ImportError:
+     pass
+
+ from recurvedata.pigeon.dumper.base import BaseDumper
+ from recurvedata.pigeon.handler.base import HandlerFactory
+
+ # Constants
+ SQL_PATTERN = re.compile(r"^\s*select\s+.+\s+from\s+.+", re.IGNORECASE)
+ LOGSEARCH_ANALYSIS_PATTERN = re.compile(r".*\|\s*select\s+.+", re.IGNORECASE)
+
+ # Configuration constants
+ DEFAULT_TIMEZONE_OFFSET = 8  # CST (UTC+8)
+ TIMEZONE_ENV_VAR = "TZ_OFFSET"
+ LARGE_DATASET_THRESHOLD = 500_000  # 500k logs
+ DEFAULT_BATCH_SIZE = 1000
+ MAX_RETRIES = 3
+
+
+ def with_retry(max_retries: int = MAX_RETRIES):
+     """Decorator to add retry logic with Aliyun SLS error handling."""
+
+     def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+         @wraps(func)
+         def wrapper(self: "AliyunSLSDumper", *args: Any, **kwargs: Any) -> Any:
+             retry_count = 0
+
+             while retry_count < max_retries:
+                 try:
+                     return func(self, *args, **kwargs)
+                 except Exception as e:
+                     retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                     if retry_count >= max_retries:
+                         self.logger.error(f"Max retries reached for {func.__name__}, stopping: {e}")
+                         raise e
+             return None
+
+         return wrapper
+
+     return decorator
+
+
+ class AliyunSLSDumper(BaseDumper):
+     """Used to dump data from Aliyun SLS to local file (csv format).
+
+     This dumper uses histograms API to get total log count first, then chooses the optimal
+     fetching method based on data volume:
+     - For datasets > 500k logs: Uses get_log_all method (recommended by Aliyun SDK)
+     - For smaller datasets: Uses standard pagination with 1000-item batches
+
+     Args:
+         access_key_id: Aliyun Access Key ID
+         access_key_secret: Aliyun Access Key Secret
+         endpoint: Aliyun SLS Endpoint
+         project: Aliyun SLS Project
+         logstore: Aliyun SLS Logstore
+         start_time: Aliyun SLS Start Time (format: YYYY-MM-DD HH:MM:SS)
+         end_time: Aliyun SLS End Time (format: YYYY-MM-DD HH:MM:SS)
+         query: Aliyun SLS Query
+         handler_factories: List of handler factories for processing data
+         fields: Comma-separated list of fields to extract
+     """
+
+     def __init__(
+         self,
+         access_key_id: str,
+         access_key_secret: str,
+         endpoint: str,
+         project: str,
+         logstore: str,
+         start_time: str,
+         end_time: str,
+         query: Optional[str] = None,
+         handler_factories: Optional[List[HandlerFactory]] = None,
+         fields: Optional[str] = None,
+     ):
+         super().__init__(handler_factories=handler_factories)
+
+         self.access_key_id = access_key_id
+         self.access_key_secret = access_key_secret
+         self.endpoint = endpoint
+         self.project = project
+         self.logstore = logstore
+         self.query = query
+         self.fields = [field.strip() for field in fields.split(",")] if fields else []
+
+         # Parse time strings to datetime objects
+         self.start_time = self._parse_time_string(start_time)
+         self.end_time = self._parse_time_string(end_time)
+
+         # Create the SLS client; if the aliyun.log import failed above, this raises a NameError here.
+         self.client = LogClient(self.endpoint, self.access_key_id, self.access_key_secret)
+
+     def _parse_time_string(self, time_str: str) -> datetime:
+         """Parse time string to datetime object using dateutil.parser."""
+         try:
+             parsed_time = date_parser.parse(time_str, dayfirst=False, yearfirst=True)
+
+             if parsed_time.tzinfo is not None:
+                 parsed_time = parsed_time.replace(tzinfo=None)
+
+             self.logger.info(f"Parsed '{time_str}' -> {parsed_time}")
+             return parsed_time
+
+         except (ValueError, TypeError) as e:
+             raise ValueError(f"Unable to parse time string '{time_str}': {e}")
+
+     def execute(self):
+         self.meta.mark_start()
+         self.execute_impl()
+         self.meta.mark_finish()
+         self.logger.info("dumper meta: %s", self.meta.to_json(indent=2))
+         return self.meta
+
+     def is_sql_or_logsearch_query(self, q: str) -> bool:
+         if not q:
+             return False
+         q = q.strip()
+         return bool(SQL_PATTERN.match(q) or LOGSEARCH_ANALYSIS_PATTERN.match(q))
+
+     def _process_log_contents(self, log, raw_contents):
+         """Process log contents and return ordered dictionary if fields are specified."""
+         if not self.fields:
+             return raw_contents
+
+         ordered_contents: OrderedDict = OrderedDict()
+         # Add fields in the user-specified order first
+         for field in self.fields:
+             if field in raw_contents:
+                 ordered_contents[field] = raw_contents[field]
+             elif field == "__time__":
+                 # Handle time field specially
+                 ordered_contents[field] = log.get_time()
+             elif field == "_source_":
+                 # Handle source field specially
+                 ordered_contents[field] = log.get_source()
+             else:
+                 self.logger.warning(f"Field '{field}' not found in raw_contents and not a special field")
+
+         return ordered_contents
+
+     def _get_timezone_offset(self) -> int:
+         """Get local timezone offset in hours from environment variable."""
+         tz_offset = os.environ.get(TIMEZONE_ENV_VAR)
+         return int(tz_offset) if tz_offset is not None else DEFAULT_TIMEZONE_OFFSET
+
+     def _calculate_utc_timestamp(self, dt: datetime) -> int:
+         """Calculate UTC timestamp by treating datetime as local time."""
+         local_offset = timezone(timedelta(hours=self._get_timezone_offset()))
+         local_dt = dt.replace(tzinfo=local_offset)
+         utc_dt = local_dt.astimezone(timezone.utc)
+         return int(utc_dt.timestamp())
+
+     def _get_time_range(self) -> tuple[int, int]:
+         """Get time range as timestamps to avoid Aliyun SDK timezone issues."""
+         from_time = self._calculate_utc_timestamp(self.start_time)
+         to_time = self._calculate_utc_timestamp(self.end_time)
+
+         self.logger.info(f"Time range - start_time: {self.start_time} -> from_time: {from_time}")
+         self.logger.info(f"Time range - end_time: {self.end_time} -> to_time: {to_time}")
+
+         return from_time, to_time
+
+     def handle_aliyun_error(self, error: Exception, retry_count: int, max_retries: int) -> int:
+         """Handle Aliyun SLS specific errors with appropriate delays."""
+         error_msg = str(error)
+
+         # Handle specific Aliyun SLS error codes
+         if "ReadQuotaExceed" in error_msg:
+             self.logger.warning(
+                 f"Read quota exceeded (attempt {retry_count}/{max_retries}). Waiting 5 seconds before retry..."
+             )
+             time.sleep(5.0)  # Longer delay for quota issues
+         elif "QpsLimitExceeded" in error_msg or "MetaOperationQpsLimitExceeded" in error_msg:
+             self.logger.warning(
+                 f"QPS limit exceeded (attempt {retry_count}/{max_retries}). Waiting 3 seconds before retry..."
+             )
+             time.sleep(3.0)  # Medium delay for QPS issues
+         elif "ServerBusy" in error_msg or "RequestTimeout" in error_msg:
+             self.logger.warning(
+                 f"Server busy/timeout (attempt {retry_count}/{max_retries}). Waiting 2 seconds before retry..."
+             )
+             time.sleep(2.0)  # Short delay for server issues
+         else:
+             self.logger.warning(f"Error fetching logs (attempt {retry_count}/{max_retries}): {error}")
+             time.sleep(1.0)  # Default delay
+
+         return retry_count + 1
+
+     def _process_logs_batch(self, logs, handlers):
+         """Process a batch of logs and send to handlers."""
+         for log in logs:
+             raw_contents = log.get_contents()
+             log_entry = self._process_log_contents(log, raw_contents)
+
+             # Handle all handlers in one loop
+             for h in handlers:
+                 h.handle(log_entry)
+
+     def _create_logs_request(
+         self, from_time: int, to_time: int, offset: int = 0, limit: int = DEFAULT_BATCH_SIZE
+     ) -> "GetLogsRequest":
+         """Create a GetLogsRequest with appropriate parameters."""
+         has_pagination_in_query = self.is_sql_or_logsearch_query(self.query or "")
+
+         if has_pagination_in_query:
+             # For queries with pagination, we need to modify the query to include our offset/limit
+             modified_query = self._add_pagination_to_query(self.query or "")
+
+             return GetLogsRequest(
+                 self.project,
+                 self.logstore,
+                 from_time,
+                 to_time,
+                 query=modified_query,
+                 reverse=False,
+             )
+         else:
+             # Use standard offset pagination
+             return GetLogsRequest(
+                 self.project,
+                 self.logstore,
+                 from_time,
+                 to_time,
+                 query=self.query,
+                 line=limit,
+                 offset=offset,
+                 reverse=False,
+             )
+
+     def _add_pagination_to_query(self, query: str) -> str:
+         """Add pagination parameters to existing query."""
+         # Check if query already has limit clause
+         if "limit" in query.lower():
+             return query
+         else:
+             # Add limit and offset to query
+             return f"{query} limit {0},{DEFAULT_BATCH_SIZE}"
+
+     @with_retry()
+     def _get_total_log_count(self) -> int:
+         """Get total log count using histograms API."""
+         from_time, to_time = self._get_time_range()
+
+         # Check if query is an analysis statement (SQL or LogSearch analysis)
+         if self.is_sql_or_logsearch_query(self.query or ""):
+             self.logger.warning(
+                 f"Query '{self.query}' appears to be an analysis statement. "
+                 "Histograms API does not support analysis queries. "
+                 "Will use get_log_all method for fetching data."
+             )
+             # Return a large number to trigger get_log_all method
+             return LARGE_DATASET_THRESHOLD + 1
+
+         request = GetHistogramsRequest(self.project, self.logstore, from_time, to_time, query=self.query or "")
+
+         response = self.client.get_histograms(request)
+         total_logs = response.get_total_count()
+         self.logger.info(f"Total logs to fetch: {total_logs}")
+         return total_logs
+
+     def _fetch_logs_batch(
+         self, offset: int, limit: int, from_time: int, to_time: int
+     ) -> List[Dict[str, Any]]:
+         """Fetch logs in a single batch using offset pagination."""
+         request = self._create_logs_request(from_time, to_time, offset, limit)
+         response = self.client.get_logs(request)
+
+         if response:
+             logs = response.get_logs()
+             batch_logs = []
+             for log in logs:
+                 raw_contents = log.get_contents()
+                 if not self.fields:
+                     batch_logs.append(raw_contents)
+                 else:
+                     batch_logs.append(self._process_log_contents(log, raw_contents))
+
+             return batch_logs
+
+     def _fetch_logs_with_get_log_all(self, handlers):
+         """Fetch logs using get_log_all method for large datasets."""
+         self.logger.info("Starting get_log_all fetch...")
+         start_time = time.time()
+         total_processed = 0
+         batch_count = 0
+         max_retries = 3
+         retry_count = 0
+
+         from_time, to_time = self._get_time_range()
+         while True:
+             try:
+                 for response in self.client.get_log_all(
+                     self.project, self.logstore, from_time, to_time, query=self.query, reverse=False
+                 ):
+                     if response:
+                         logs = response.get_logs()
+                         batch_count += 1
+                         logs_count = len(logs)
+                         total_processed += logs_count
+
+                         # Log progress every 50 batches to reduce logging overhead
+                         if batch_count % 50 == 0:
+                             elapsed_time = time.time() - start_time
+                             rate = total_processed / elapsed_time if elapsed_time > 0 else 0
+                             self.logger.info(
+                                 f"Fetched {logs_count} logs from get_log_all (batch {batch_count}, total: {total_processed:,}, rate: {rate:.0f} logs/sec)"
+                             )
+
+                         # Process logs directly - optimize for speed
+                         for log in logs:
+                             raw_contents = log.get_contents()
+
+                             # Skip field processing if no fields specified for maximum speed
+                             if not self.fields:
+                                 log_entry = raw_contents
+                             else:
+                                 log_entry = self._process_log_contents(log, raw_contents)
+
+                             # Handle all handlers in one loop
+                             for h in handlers:
+                                 h.handle(log_entry)
+
+                 # If we reach here, the generator completed successfully
+                 break
+
+             except Exception as e:
+                 retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                 if retry_count >= max_retries:
+                     self.logger.error(f"Max retries reached for get_log_all, stopping: {e}")
+                     raise e
+                 # Continue the loop to retry
+                 continue
+
+         elapsed_time = time.time() - start_time
+         final_rate = total_processed / elapsed_time if elapsed_time > 0 else 0
+         self.logger.info(
+             f"get_log_all fetch completed: {total_processed:,} logs in {elapsed_time:.1f}s ({final_rate:.0f} logs/sec)"
+         )
+
+     def _fetch_logs_with_pagination(self, handlers, total_logs):
+         """Fetch logs using standard pagination method for smaller datasets."""
+         batch_size = 1000
+         self.logger.info(f"Using batch size: {batch_size}")
+
+         # Fetch logs in batches using offset pagination
+         offset = 0
+         processed_count = 0
+         from_time, to_time = self._get_time_range()
+
+         while offset < total_logs:
+             self.logger.info(
+                 f"Fetching logs batch: offset={offset:,}, limit={batch_size} (processed: {processed_count:,}/{total_logs:,})"
+             )
+
+             # Retry logic for each batch
+             max_retries = 3
+             retry_count = 0
+             batch_success = False
+
+             while retry_count < max_retries and not batch_success:
+                 try:
+                     for log_entry in self._fetch_logs_batch(offset, batch_size, from_time, to_time):
+                         for h in handlers:
+                             h.handle(log_entry)
+                         processed_count += 1
+
+                     offset += batch_size
+                     batch_success = True
+
+                 except Exception as e:
+                     retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                     if retry_count >= max_retries:
+                         self.logger.error(f"Failed to fetch batch at offset {offset}: {e}")
+                         raise e
+                     # Continue retry loop for the same batch
+                     continue
+
+     def execute_impl(self):
+         handlers = self.create_handlers()
+         self.logger.info("execute with context")
+         self.logger.info(f"query: {self.query}")
+         self.logger.info(f"start_time: {self.start_time}")
+         self.logger.info(f"end_time: {self.end_time}")
+         self.logger.info(f"fields: {self.fields}")
+
+         # Get total log count using histograms
+         total_logs = self._get_total_log_count()
+
+         if total_logs == 0:
+             self.logger.info("No logs found for the specified time range and query")
+             return
+
+         # Choose appropriate method based on log count
+         if total_logs > LARGE_DATASET_THRESHOLD:  # More than 500k logs - use get_log_all for better performance
+             self.logger.info(f"Large dataset detected ({total_logs:,} logs), using get_log_all method")
+             self._fetch_logs_with_get_log_all(handlers)
+         else:
+             self.logger.info(f"Using standard pagination method for {total_logs:,} logs")
+             self._fetch_logs_with_pagination(handlers, total_logs)
+
+         for h in handlers:
+             h.close()
+         self.join_handlers()
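
For orientation, a minimal usage sketch of the AliyunSLSDumper shown above, based only on the constructor signature and execute() method in this diff; the csv_factory value, endpoint, project, and query strings are placeholders, not values or APIs confirmed by the package:

from recurvedata.pigeon.dumper.aliyun_sls import AliyunSLSDumper

# csv_factory stands in for a concrete HandlerFactory instance (the package ships
# handler implementations under recurvedata/pigeon/handler/); the exact class is
# not shown in this diff.
dumper = AliyunSLSDumper(
    access_key_id="<access-key-id>",
    access_key_secret="<access-key-secret>",
    endpoint="cn-hangzhou.log.aliyuncs.com",
    project="my-project",
    logstore="my-logstore",
    start_time="2024-01-01 00:00:00",
    end_time="2024-01-02 00:00:00",
    query="status: 500",              # optional SLS query or analysis statement
    handler_factories=[csv_factory],  # placeholder HandlerFactory instance
    fields="__time__,host,status",    # optional; output keeps this column order
)
meta = dumper.execute()  # runs the dump through the handlers and returns the DumperMeta
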
recurvedata/pigeon/dumper/base.py
@@ -0,0 +1,141 @@
+ from typing import List
+
+ from recurvedata.pigeon.handler.base import Handler, HandlerFactory
+ from recurvedata.pigeon.meta import DumperMeta, DumperWorkerMeta
+ from recurvedata.pigeon.row_factory import keyed_tuple_factory
+ from recurvedata.pigeon.utils import LoggingMixin, ensure_list
+ from recurvedata.pigeon.utils.timing import Timer
+
+
+ class BaseWorker(LoggingMixin):
+     def __init__(self, worker_id, task_id, handlers, row_factory=keyed_tuple_factory, retries=3):
+         self.worker_id = worker_id
+         self.task_id = task_id
+         self.handlers = handlers
+         self.row_factory = row_factory
+         self.retries = retries
+         self.meta = DumperWorkerMeta()
+
+     def _log(self, msg, *args, **kwargs):
+         msg = f'Worker#{self.worker_id} Task#{self.task_id} {msg}'
+         self.logger.info(msg, *args, **kwargs)
+
+     def call_handlers(self, row):
+         for h in self.handlers:
+             # the handlers should take care of exceptions
+             h.handle(row)
+
+     def close_handlers(self):
+         for h in self.handlers:
+             h.close()
+
+     def reset_handlers(self):
+         self._log('reset handlers')
+         for h in self.handlers:
+             h.reset()
+
+     def set_input_schema(self, schema):
+         self.meta.schema = schema
+         for h in self.handlers:
+             h.set_input_schema(schema)
+
+     def execute(self):
+         self._log('executing')
+         for i, h in enumerate(self.handlers):
+             self._log('Handler #%s: %s', i, h)
+
+         for num_try in range(self.retries):
+             self._log(f'Try#{num_try}')
+             try:
+                 rv = self.execute_impl()
+             except Exception as ex:
+                 self._log(str(ex))
+                 self.logger.exception(ex)
+                 self.reset_handlers()
+             else:
+                 break
+         else:
+             # TODO(liyangliang): use a custom exception here
+             raise RuntimeError('All attempts failed')
+         self.close_handlers()
+
+         self.meta.num_dumped_rows = rv
+         self.meta.handlers_meta = [x.meta for x in self.handlers]
+         return self.meta
+
+     def execute_impl(self):
+         raise NotImplementedError('execute_impl must be implemented by subclass')
+
+     def start_timer(self):
+         return Timer(logger=self.logger)
+
+
+ class SQLBasedWorker(BaseWorker):
+     def __init__(self, connector, query, parameters, handlers, *args, **kwargs):
+         self.connector = connector
+         self.query = query
+         self.parameters = parameters
+         super().__init__(handlers=handlers, *args, **kwargs)
+
+     def execute_impl(self):
+         n = 0
+         t = self.start_timer()
+         for row in self.dump_query(self.query, self.parameters):
+             self.call_handlers(row)
+             n += 1
+             if n % 10000 == 0:
+                 t.info('dumped %d rows', n)
+         t.info('dumped %d rows in total', n)
+         return n
+
+     def dump_query(self, query, parameters):
+         raise NotImplementedError('dump_query must be implemented by subclass')
+
+
+ class BaseDumper(LoggingMixin):
+     _row_factory = staticmethod(keyed_tuple_factory)
+
+     def __init__(self, handler_factories, *args, **kwargs):
+         self.handler_factories = ensure_list(handler_factories or [])
+
+         assert len(self.handler_factories) > 0, 'must specify at least one HandlerFactory'
+         for hf in self.handler_factories:
+             assert isinstance(hf, HandlerFactory)
+
+         self.meta = DumperMeta()
+
+     @property
+     def row_factory(self):
+         """
+         The format to return row results in. By default, each returned row will be a named tuple.
+         You can alternatively use any of the following:
+
+         - :func:`pigeon.row_factory.tuple_factory` - return a result row as a tuple
+         - :func:`pigeon.row_factory.keyed_tuple_factory` - return a result row as a named tuple
+         - :func:`pigeon.row_factory.dict_factory` - return a result row as a dict
+         - :func:`pigeon.row_factory.ordered_dict_factory` - return a result row as an OrderedDict
+         """
+         return self._row_factory
+
+     @row_factory.setter
+     def row_factory(self, factory):
+         self._row_factory = factory
+
+     def create_handlers(self, **kwargs) -> List[Handler]:
+         return [hf.create_handler(**kwargs) for hf in self.handler_factories]
+
+     def join_handlers(self):
+         [hf.join() for hf in self.handler_factories]
+
+     def handle_schema(self):
+         return [hf.handle_dumper_schema(self.meta.schema) for hf in self.handler_factories]
+
+     def set_input_schema(self, schema):
+         for hf in self.handler_factories:
+             hf.transformer.input_schema = schema
+
+     def execute(self):
+         raise NotImplementedError('execute must be implemented by subclass')
+
+     def start_timer(self):
+         return Timer(logger=self.logger)
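
The base classes above define the extension contract used by concrete dumpers in this package: build handlers from the configured handler factories, push each row through them, then close the handlers and join the factories. A minimal sketch of a custom dumper written against only the methods visible in this diff (the in-memory row source is invented for illustration):

from recurvedata.pigeon.dumper.base import BaseDumper


class InMemoryDumper(BaseDumper):
    """Toy dumper that feeds a list of pre-built rows through the configured handlers."""

    def __init__(self, rows, handler_factories):
        super().__init__(handler_factories=handler_factories)
        self.rows = rows  # illustrative input, not part of the package

    def execute(self):
        self.meta.mark_start()
        handlers = self.create_handlers()
        for row in self.rows:
            for h in handlers:
                h.handle(row)  # handlers are expected to deal with their own exceptions
        for h in handlers:
            h.close()
        self.join_handlers()  # let the handler factories finish their work
        self.meta.mark_finish()
        return self.meta
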