recurvedata-lib 0.1.487 (recurvedata_lib-0.1.487-py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Warning: this release of recurvedata-lib has been flagged as potentially problematic.

Files changed (333)
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/dumper/aliyun_sls.py
@@ -0,0 +1,415 @@
+ import os
+ import re
+ import time
+ from collections import OrderedDict
+ from datetime import datetime, timedelta, timezone
+ from functools import wraps
+ from typing import Any, Callable, Dict, Generator, List, Optional
+
+ from dateutil import parser as date_parser
+
+ try:
+     from aliyun.log import GetHistogramsRequest, GetLogsRequest, LogClient
+ except ImportError:
+     pass
+
+ from recurvedata.pigeon.dumper.base import BaseDumper
+ from recurvedata.pigeon.handler.base import HandlerFactory
+
+ # Constants
+ SQL_PATTERN = re.compile(r"^\s*select\s+.+\s+from\s+.+", re.IGNORECASE)
+ LOGSEARCH_ANALYSIS_PATTERN = re.compile(r".*\|\s*select\s+.+", re.IGNORECASE)
+
+ # Configuration constants
+ DEFAULT_TIMEZONE_OFFSET = 8  # CST (UTC+8)
+ TIMEZONE_ENV_VAR = "TZ_OFFSET"
+ LARGE_DATASET_THRESHOLD = 500_000  # 500k logs
+ DEFAULT_BATCH_SIZE = 1000
+ MAX_RETRIES = 3
+
+
+ def with_retry(max_retries: int = MAX_RETRIES):
+     """Decorator to add retry logic with Aliyun SLS error handling."""
+
+     def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+         @wraps(func)
+         def wrapper(self: "AliyunSLSDumper", *args: Any, **kwargs: Any) -> Any:
+             retry_count = 0
+
+             while retry_count < max_retries:
+                 try:
+                     return func(self, *args, **kwargs)
+                 except Exception as e:
+                     retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                     if retry_count >= max_retries:
+                         self.logger.error(f"Max retries reached for {func.__name__}, stopping: {e}")
+                         raise e
+             return None
+
+         return wrapper
+
+     return decorator
+
+
+ class AliyunSLSDumper(BaseDumper):
+     """Used to dump data from Aliyun SLS to local file (csv format).
+
+     This dumper uses histograms API to get total log count first, then chooses the optimal
+     fetching method based on data volume:
+     - For datasets > 500k logs: Uses get_log_all method (recommended by Aliyun SDK)
+     - For smaller datasets: Uses standard pagination with 1000-item batches
+
+     Args:
+         access_key_id: Aliyun Access Key ID
+         access_key_secret: Aliyun Access Key Secret
+         endpoint: Aliyun SLS Endpoint
+         project: Aliyun SLS Project
+         logstore: Aliyun SLS Logstore
+         start_time: Aliyun SLS Start Time (format: YYYY-MM-DD HH:MM:SS)
+         end_time: Aliyun SLS End Time (format: YYYY-MM-DD HH:MM:SS)
+         query: Aliyun SLS Query
+         handler_factories: List of handler factories for processing data
+         fields: Comma-separated list of fields to extract
+     """
+
+     def __init__(
+         self,
+         access_key_id: str,
+         access_key_secret: str,
+         endpoint: str,
+         project: str,
+         logstore: str,
+         start_time: str,
+         end_time: str,
+         query: Optional[str] = None,
+         handler_factories: Optional[List[HandlerFactory]] = None,
+         fields: Optional[str] = None,
+     ):
+         super().__init__(handler_factories=handler_factories)
+
+         self.access_key_id = access_key_id
+         self.access_key_secret = access_key_secret
+         self.endpoint = endpoint
+         self.project = project
+         self.logstore = logstore
+         self.query = query
+         self.fields = [field.strip() for field in fields.split(",")] if fields else []
+
+         # Parse time strings to datetime objects
+         self.start_time = self._parse_time_string(start_time)
+         self.end_time = self._parse_time_string(end_time)
+
+         # Create the SLS client; if the aliyun.log import failed above, this raises a NameError here.
+         self.client = LogClient(self.endpoint, self.access_key_id, self.access_key_secret)
+
+     def _parse_time_string(self, time_str: str) -> datetime:
+         """Parse time string to datetime object using dateutil.parser."""
+         try:
+             parsed_time = date_parser.parse(time_str, dayfirst=False, yearfirst=True)
+
+             if parsed_time.tzinfo is not None:
+                 parsed_time = parsed_time.replace(tzinfo=None)
+
+             self.logger.info(f"Parsed '{time_str}' -> {parsed_time}")
+             return parsed_time
+
+         except (ValueError, TypeError) as e:
+             raise ValueError(f"Unable to parse time string '{time_str}': {e}")
+
+     def execute(self):
+         self.meta.mark_start()
+         self.execute_impl()
+         self.meta.mark_finish()
+         self.logger.info("dumper meta: %s", self.meta.to_json(indent=2))
+         return self.meta
+
+     def is_sql_or_logsearch_query(self, q: str) -> bool:
+         if not q:
+             return False
+         q = q.strip()
+         return bool(SQL_PATTERN.match(q) or LOGSEARCH_ANALYSIS_PATTERN.match(q))
+
+     def _process_log_contents(self, log, raw_contents):
+         """Process log contents and return ordered dictionary if fields are specified."""
+         if not self.fields:
+             return raw_contents
+
+         ordered_contents: OrderedDict = OrderedDict()
+         # Add fields in the user-specified order first
+         for field in self.fields:
+             if field in raw_contents:
+                 ordered_contents[field] = raw_contents[field]
+             elif field == "__time__":
+                 # Handle time field specially
+                 ordered_contents[field] = log.get_time()
+             elif field == "_source_":
+                 # Handle source field specially
+                 ordered_contents[field] = log.get_source()
+             else:
+                 self.logger.warning(f"Field '{field}' not found in raw_contents and not a special field")
+
+         return ordered_contents
+
+     def _get_timezone_offset(self) -> int:
+         """Get local timezone offset in hours from environment variable."""
+         tz_offset = os.environ.get(TIMEZONE_ENV_VAR)
+         return int(tz_offset) if tz_offset is not None else DEFAULT_TIMEZONE_OFFSET
+
+     def _calculate_utc_timestamp(self, dt: datetime) -> int:
+         """Calculate UTC timestamp by treating datetime as local time."""
+         local_offset = timezone(timedelta(hours=self._get_timezone_offset()))
+         local_dt = dt.replace(tzinfo=local_offset)
+         utc_dt = local_dt.astimezone(timezone.utc)
+         return int(utc_dt.timestamp())
+
+     def _get_time_range(self) -> tuple[int, int]:
+         """Get time range as timestamps to avoid Aliyun SDK timezone issues."""
+         from_time = self._calculate_utc_timestamp(self.start_time)
+         to_time = self._calculate_utc_timestamp(self.end_time)
+
+         self.logger.info(f"Time range - start_time: {self.start_time} -> from_time: {from_time}")
+         self.logger.info(f"Time range - end_time: {self.end_time} -> to_time: {to_time}")
+
+         return from_time, to_time
+
+     def handle_aliyun_error(self, error: Exception, retry_count: int, max_retries: int) -> int:
+         """Handle Aliyun SLS specific errors with appropriate delays."""
+         error_msg = str(error)
+
+         # Handle specific Aliyun SLS error codes
+         if "ReadQuotaExceed" in error_msg:
+             self.logger.warning(
+                 f"Read quota exceeded (attempt {retry_count}/{max_retries}). Waiting 5 seconds before retry..."
+             )
+             time.sleep(5.0)  # Longer delay for quota issues
+         elif "QpsLimitExceeded" in error_msg or "MetaOperationQpsLimitExceeded" in error_msg:
+             self.logger.warning(
+                 f"QPS limit exceeded (attempt {retry_count}/{max_retries}). Waiting 3 seconds before retry..."
+             )
+             time.sleep(3.0)  # Medium delay for QPS issues
+         elif "ServerBusy" in error_msg or "RequestTimeout" in error_msg:
+             self.logger.warning(
+                 f"Server busy/timeout (attempt {retry_count}/{max_retries}). Waiting 2 seconds before retry..."
+             )
+             time.sleep(2.0)  # Short delay for server issues
+         else:
+             self.logger.warning(f"Error fetching logs (attempt {retry_count}/{max_retries}): {error}")
+             time.sleep(1.0)  # Default delay
+
+         return retry_count + 1
+
+     def _process_logs_batch(self, logs, handlers):
+         """Process a batch of logs and send to handlers."""
+         for log in logs:
+             raw_contents = log.get_contents()
+             log_entry = self._process_log_contents(log, raw_contents)
+
+             # Handle all handlers in one loop
+             for h in handlers:
+                 h.handle(log_entry)
+
+     def _create_logs_request(
+         self, from_time: int, to_time: int, offset: int = 0, limit: int = DEFAULT_BATCH_SIZE
+     ) -> "GetLogsRequest":
+         """Create a GetLogsRequest with appropriate parameters."""
+         has_pagination_in_query = self.is_sql_or_logsearch_query(self.query or "")
+
+         if has_pagination_in_query:
+             # For queries with pagination, we need to modify the query to include our offset/limit
+             modified_query = self._add_pagination_to_query(self.query or "")
+
+             return GetLogsRequest(
+                 self.project,
+                 self.logstore,
+                 from_time,
+                 to_time,
+                 query=modified_query,
+                 reverse=False,
+             )
+         else:
+             # Use standard offset pagination
+             return GetLogsRequest(
+                 self.project,
+                 self.logstore,
+                 from_time,
+                 to_time,
+                 query=self.query,
+                 line=limit,
+                 offset=offset,
+                 reverse=False,
+             )
+
+     def _add_pagination_to_query(self, query: str) -> str:
+         """Add pagination parameters to existing query."""
+         # Check if query already has limit clause
+         if "limit" in query.lower():
+             return query
+         else:
+             # Add limit and offset to query
+             return f"{query} limit {0},{DEFAULT_BATCH_SIZE}"
+
+     @with_retry()
+     def _get_total_log_count(self) -> int:
+         """Get total log count using histograms API."""
+         from_time, to_time = self._get_time_range()
+
+         # Check if query is an analysis statement (SQL or LogSearch analysis)
+         if self.is_sql_or_logsearch_query(self.query or ""):
+             self.logger.warning(
+                 f"Query '{self.query}' appears to be an analysis statement. "
+                 "Histograms API does not support analysis queries. "
+                 "Will use get_log_all method for fetching data."
+             )
+             # Return a large number to trigger get_log_all method
+             return LARGE_DATASET_THRESHOLD + 1
+
+         request = GetHistogramsRequest(self.project, self.logstore, from_time, to_time, query=self.query or "")
+
+         response = self.client.get_histograms(request)
+         total_logs = response.get_total_count()
+         self.logger.info(f"Total logs to fetch: {total_logs}")
+         return total_logs
+
+     def _fetch_logs_batch(
+         self, offset: int, limit: int, from_time: int, to_time: int
+     ) -> List[Dict[str, Any]]:
+         """Fetch logs in a single batch using offset pagination."""
+         request = self._create_logs_request(from_time, to_time, offset, limit)
+         response = self.client.get_logs(request)
+
+         if response:
+             logs = response.get_logs()
+             batch_logs = []
+             for log in logs:
+                 raw_contents = log.get_contents()
+                 if not self.fields:
+                     batch_logs.append(raw_contents)
+                 else:
+                     batch_logs.append(self._process_log_contents(log, raw_contents))
+
+             return batch_logs
+
+     def _fetch_logs_with_get_log_all(self, handlers):
+         """Fetch logs using get_log_all method for large datasets."""
+         self.logger.info("Starting get_log_all fetch...")
+         start_time = time.time()
+         total_processed = 0
+         batch_count = 0
+         max_retries = 3
+         retry_count = 0
+
+         from_time, to_time = self._get_time_range()
+         while True:
+             try:
+                 for response in self.client.get_log_all(
+                     self.project, self.logstore, from_time, to_time, query=self.query, reverse=False
+                 ):
+                     if response:
+                         logs = response.get_logs()
+                         batch_count += 1
+                         logs_count = len(logs)
+                         total_processed += logs_count
+
+                         # Log progress every 50 batches to reduce logging overhead
+                         if batch_count % 50 == 0:
+                             elapsed_time = time.time() - start_time
+                             rate = total_processed / elapsed_time if elapsed_time > 0 else 0
+                             self.logger.info(
+                                 f"Fetched {logs_count} logs from get_log_all (batch {batch_count}, total: {total_processed:,}, rate: {rate:.0f} logs/sec)"
+                             )
+
+                         # Process logs directly - optimize for speed
+                         for log in logs:
+                             raw_contents = log.get_contents()
+
+                             # Skip field processing if no fields specified for maximum speed
+                             if not self.fields:
+                                 log_entry = raw_contents
+                             else:
+                                 log_entry = self._process_log_contents(log, raw_contents)
+
+                             # Handle all handlers in one loop
+                             for h in handlers:
+                                 h.handle(log_entry)
+
+                 # If we reach here, the generator completed successfully
+                 break
+
+             except Exception as e:
+                 retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                 if retry_count >= max_retries:
+                     self.logger.error(f"Max retries reached for get_log_all, stopping: {e}")
+                     raise e
+                 # Continue the loop to retry
+                 continue
+
+         elapsed_time = time.time() - start_time
+         final_rate = total_processed / elapsed_time if elapsed_time > 0 else 0
+         self.logger.info(
+             f"get_log_all fetch completed: {total_processed:,} logs in {elapsed_time:.1f}s ({final_rate:.0f} logs/sec)"
+         )
+
+     def _fetch_logs_with_pagination(self, handlers, total_logs):
+         """Fetch logs using standard pagination method for smaller datasets."""
+         batch_size = 1000
+         self.logger.info(f"Using batch size: {batch_size}")
+
+         # Fetch logs in batches using offset pagination
+         offset = 0
+         processed_count = 0
+         from_time, to_time = self._get_time_range()
+
+         while offset < total_logs:
+             self.logger.info(
+                 f"Fetching logs batch: offset={offset:,}, limit={batch_size} (processed: {processed_count:,}/{total_logs:,})"
+             )
+
+             # Retry logic for each batch
+             max_retries = 3
+             retry_count = 0
+             batch_success = False
+
+             while retry_count < max_retries and not batch_success:
+                 try:
+                     for log_entry in self._fetch_logs_batch(offset, batch_size, from_time, to_time):
+                         for h in handlers:
+                             h.handle(log_entry)
+                         processed_count += 1
+
+                     offset += batch_size
+                     batch_success = True
+
+                 except Exception as e:
+                     retry_count = self.handle_aliyun_error(e, retry_count, max_retries)
+                     if retry_count >= max_retries:
+                         self.logger.error(f"Failed to fetch batch at offset {offset}: {e}")
+                         raise e
+                     # Continue retry loop for the same batch
+                     continue
+
+     def execute_impl(self):
+         handlers = self.create_handlers()
+         self.logger.info("execute with context")
+         self.logger.info(f"query: {self.query}")
+         self.logger.info(f"start_time: {self.start_time}")
+         self.logger.info(f"end_time: {self.end_time}")
+         self.logger.info(f"fields: {self.fields}")
+
+         # Get total log count using histograms
+         total_logs = self._get_total_log_count()
+
+         if total_logs == 0:
+             self.logger.info("No logs found for the specified time range and query")
+             return
+
+         # Choose appropriate method based on log count
+         if total_logs > LARGE_DATASET_THRESHOLD:  # More than 500k logs - use get_log_all for better performance
+             self.logger.info(f"Large dataset detected ({total_logs:,} logs), using get_log_all method")
+             self._fetch_logs_with_get_log_all(handlers)
+         else:
+             self.logger.info(f"Using standard pagination method for {total_logs:,} logs")
+             self._fetch_logs_with_pagination(handlers, total_logs)
+
+         for h in handlers:
+             h.close()
+         self.join_handlers()
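
For orientation, a minimal usage sketch of the AliyunSLSDumper shown above, based only on the constructor signature and execute() method in this diff; the csv_factory value, endpoint, project, and query strings are placeholders, not values or APIs confirmed by the package:

from recurvedata.pigeon.dumper.aliyun_sls import AliyunSLSDumper

# csv_factory stands in for a concrete HandlerFactory instance (the package ships
# handler implementations under recurvedata/pigeon/handler/); the exact class is
# not shown in this diff.
dumper = AliyunSLSDumper(
    access_key_id="<access-key-id>",
    access_key_secret="<access-key-secret>",
    endpoint="cn-hangzhou.log.aliyuncs.com",
    project="my-project",
    logstore="my-logstore",
    start_time="2024-01-01 00:00:00",
    end_time="2024-01-02 00:00:00",
    query="status: 500",              # optional SLS query or analysis statement
    handler_factories=[csv_factory],  # placeholder HandlerFactory instance
    fields="__time__,host,status",    # optional; output keeps this column order
)
meta = dumper.execute()  # runs the dump through the handlers and returns the DumperMeta
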
recurvedata/pigeon/dumper/base.py
@@ -0,0 +1,141 @@
+ from typing import List
+
+ from recurvedata.pigeon.handler.base import Handler, HandlerFactory
+ from recurvedata.pigeon.meta import DumperMeta, DumperWorkerMeta
+ from recurvedata.pigeon.row_factory import keyed_tuple_factory
+ from recurvedata.pigeon.utils import LoggingMixin, ensure_list
+ from recurvedata.pigeon.utils.timing import Timer
+
+
+ class BaseWorker(LoggingMixin):
+     def __init__(self, worker_id, task_id, handlers, row_factory=keyed_tuple_factory, retries=3):
+         self.worker_id = worker_id
+         self.task_id = task_id
+         self.handlers = handlers
+         self.row_factory = row_factory
+         self.retries = retries
+         self.meta = DumperWorkerMeta()
+
+     def _log(self, msg, *args, **kwargs):
+         msg = f'Worker#{self.worker_id} Task#{self.task_id} {msg}'
+         self.logger.info(msg, *args, **kwargs)
+
+     def call_handlers(self, row):
+         for h in self.handlers:
+             # the handlers should take care of exceptions
+             h.handle(row)
+
+     def close_handlers(self):
+         for h in self.handlers:
+             h.close()
+
+     def reset_handlers(self):
+         self._log('reset handlers')
+         for h in self.handlers:
+             h.reset()
+
+     def set_input_schema(self, schema):
+         self.meta.schema = schema
+         for h in self.handlers:
+             h.set_input_schema(schema)
+
+     def execute(self):
+         self._log('executing')
+         for i, h in enumerate(self.handlers):
+             self._log('Handler #%s: %s', i, h)
+
+         for num_try in range(self.retries):
+             self._log(f'Try#{num_try}')
+             try:
+                 rv = self.execute_impl()
+             except Exception as ex:
+                 self._log(str(ex))
+                 self.logger.exception(ex)
+                 self.reset_handlers()
+             else:
+                 break
+         else:
+             # TODO(liyangliang): use a custom exception here
+             raise RuntimeError('All attempts failed')
+         self.close_handlers()
+
+         self.meta.num_dumped_rows = rv
+         self.meta.handlers_meta = [x.meta for x in self.handlers]
+         return self.meta
+
+     def execute_impl(self):
+         raise NotImplementedError('execute_impl must be implemented by subclass')
+
+     def start_timer(self):
+         return Timer(logger=self.logger)
+
+
+ class SQLBasedWorker(BaseWorker):
+     def __init__(self, connector, query, parameters, handlers, *args, **kwargs):
+         self.connector = connector
+         self.query = query
+         self.parameters = parameters
+         super().__init__(handlers=handlers, *args, **kwargs)
+
+     def execute_impl(self):
+         n = 0
+         t = self.start_timer()
+         for row in self.dump_query(self.query, self.parameters):
+             self.call_handlers(row)
+             n += 1
+             if n % 10000 == 0:
+                 t.info('dumped %d rows', n)
+         t.info('dumped %d rows in total', n)
+         return n
+
+     def dump_query(self, query, parameters):
+         raise NotImplementedError('dump_query must be implemented by subclass')
+
+
+ class BaseDumper(LoggingMixin):
+     _row_factory = staticmethod(keyed_tuple_factory)
+
+     def __init__(self, handler_factories, *args, **kwargs):
+         self.handler_factories = ensure_list(handler_factories or [])
+
+         assert len(self.handler_factories) > 0, 'must specify at least one HandlerFactory'
+         for hf in self.handler_factories:
+             assert isinstance(hf, HandlerFactory)
+
+         self.meta = DumperMeta()
+
+     @property
+     def row_factory(self):
+         """
+         The format to return row results in. By default, each returned row will be a named tuple.
+         You can alternatively use any of the following:
+
+         - :func:`pigeon.row_factory.tuple_factory` - return a result row as a tuple
+         - :func:`pigeon.row_factory.keyed_tuple_factory` - return a result row as a named tuple
+         - :func:`pigeon.row_factory.dict_factory` - return a result row as a dict
+         - :func:`pigeon.row_factory.ordered_dict_factory` - return a result row as an OrderedDict
+         """
+         return self._row_factory
+
+     @row_factory.setter
+     def row_factory(self, factory):
+         self._row_factory = factory
+
+     def create_handlers(self, **kwargs) -> List[Handler]:
+         return [hf.create_handler(**kwargs) for hf in self.handler_factories]
+
+     def join_handlers(self):
+         [hf.join() for hf in self.handler_factories]
+
+     def handle_schema(self):
+         return [hf.handle_dumper_schema(self.meta.schema) for hf in self.handler_factories]
+
+     def set_input_schema(self, schema):
+         for hf in self.handler_factories:
+             hf.transformer.input_schema = schema
+
+     def execute(self):
+         raise NotImplementedError('execute must be implemented by subclass')
+
+     def start_timer(self):
+         return Timer(logger=self.logger)
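
The base classes above define the extension contract used by concrete dumpers in this package: build handlers from the configured handler factories, push each row through them, then close the handlers and join the factories. A minimal sketch of a custom dumper written against only the methods visible in this diff (the in-memory row source is invented for illustration):

from recurvedata.pigeon.dumper.base import BaseDumper


class InMemoryDumper(BaseDumper):
    """Toy dumper that feeds a list of pre-built rows through the configured handlers."""

    def __init__(self, rows, handler_factories):
        super().__init__(handler_factories=handler_factories)
        self.rows = rows  # illustrative input, not part of the package

    def execute(self):
        self.meta.mark_start()
        handlers = self.create_handlers()
        for row in self.rows:
            for h in handlers:
                h.handle(row)  # handlers are expected to deal with their own exceptions
        for h in handlers:
            h.close()
        self.join_handlers()  # let the handler factories finish their work
        self.meta.mark_finish()
        return self.meta
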