recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/csv.py
@@ -0,0 +1,172 @@
import contextlib
import csv
import logging
import sys

import cytoolz as toolz

from recurvedata.utils.imports import MockModule

try:
    import numpy as np
    import pandas as pd
except ImportError:
    np = MockModule("numpy")
    pd = MockModule("pandas")

from recurvedata.pigeon import const
from recurvedata.pigeon.schema import Schema, types

csv.field_size_limit(sys.maxsize)

dialect_terms = (
    "delimiter",
    "doublequote",
    "escapechar",
    "lineterminator",
    "quotechar",
    "quoting",
    "skipinitialspace",
    "strict",
)


class ExtendedSniffer(csv.Sniffer):
    def __init__(self):
        super().__init__()
        self.preferred = [",", "\t", ";", " ", ":", "|", const.HIVE_FIELD_DELIMITER]


def copy_dialect(name, source_dialect):
    return dict_to_dialect(dialect_to_dict(source_dialect), name)


def dialect_to_dict(dialect):
    return {name: getattr(dialect, name) for name in dialect_terms if hasattr(dialect, name)}


def dict_to_dialect(d, name=""):
    class dialect(csv.Dialect):
        _name = name

    for name in dialect_terms:
        if name in d:
            setattr(dialect, name, d[name])
    return dialect


def infer_header(path, nbytes=10000, encoding="utf-8"):
    with open(path, "rb") as f:
        sample = f.read(nbytes).decode(encoding, "replace")
    sniffer = ExtendedSniffer()
    try:
        return sniffer.has_header(sample)
    except csv.Error:
        return None


def sniff_dialect(path, nbytes=10000, encoding="utf-8"):
    with open(path, "rb") as f:
        sample = f.read(nbytes).decode(encoding, "replace")
    sniffer = ExtendedSniffer()
    try:
        dialect = sniffer.sniff(sample, delimiters=sniffer.preferred)
    except csv.Error as e:
        logging.warning("failed to sniff dialect, copy from csv.excel. error: %s", e)
        dialect = copy_dialect(name="excel_copy", source_dialect=csv.excel)

    crnl, nl = "\r\n", "\n"
    dialect.lineterminator = crnl if crnl in sample else nl
    return dialect


class CSV(object):
    """
    Proxy for a CSV file.
    """

    def __init__(self, path, has_header=None, encoding="utf-8", **dialect_kwargs):
        self.path = path
        self._has_header = has_header
        self.encoding = encoding or "utf-8"
        self._dialect_kwargs = dialect_kwargs

    @toolz.memoize
    def _sniff_dialect(self):
        dialect = sniff_dialect(self.path, encoding=self.encoding)
        for k, v in self._dialect_kwargs.items():
            if k in dialect_terms:
                setattr(dialect, k, v)
        return dialect

    @property
    def dialect(self):
        return self._sniff_dialect()

    @property
    def dialect_options(self):
        return dialect_to_dict(self.dialect)

    @property
    def has_header(self):
        if self._has_header is None:
            self._has_header = infer_header(self.path, encoding=self.encoding)

        return self._has_header

    @property
    def header(self):
        if not self.has_header:
            return None

        with open(self.path, encoding=self.encoding, newline="") as f:
            reader = csv.reader(f, **self.dialect_options)
            header = next(reader)
        return tuple(header)

    def to_df(self):
        return pd.read_csv(self.path, encoding=self.encoding, dialect=self.dialect)

    @contextlib.contextmanager
    def reader(self, as_dict=False):
        if as_dict and not self.header:
            raise ValueError("missing header")

        with open(self.path, encoding=self.encoding, newline="") as fd:
            if as_dict:
                reader = csv.DictReader(fd, **self.dialect_options)
            else:
                if self.has_header:
                    fd.readline()  # skip header
                reader = csv.reader(fd, **self.dialect_options)
            yield reader

    @toolz.memoize
    def infer_schema(self):
        if not self.has_header:
            return None

        mapping = {
            np.int8: types.INT8,
            np.int16: types.INT16,
            np.int32: types.INT32,
            np.int64: types.INT64,
            np.float16: types.FLOAT32,
            np.float32: types.FLOAT32,
            np.float64: types.FLOAT64,
            np.datetime64: types.DATETIME,
            np.object_: types.STRING,
            np.str_: types.STRING,
        }
        # np.bool removed since numpy 1.20 https://github.com/numpy/numpy/releases/tag/v1.20.0
        if np.__version__ < "1.20.0":
            mapping[np.bool] = types.BOOLEAN
        else:
            mapping[np.bool_] = types.BOOLEAN

        df = pd.read_csv(self.path, encoding=self.encoding, dialect=self.dialect, nrows=500)
        schema = Schema()
        for col in df.columns:
            canonical_type = mapping.get(df.dtypes[col].type, types.STRING)
            schema.add_field_by_attrs(col, canonical_type)
        return schema
recurvedata/pigeon/docs/datasources-example.json
@@ -0,0 +1,82 @@
{
  "__meta__": {
    "description": "THIS FILE IS AUTO GENERATED, PLEASE DO NOT EDIT",
    "version": "2023-03-21T16:56:28.143088"
  },
  "clickhouse_default": {
    "host": "clickhouse.ym",
    "password": "gAAAAABkGXE805UXC1yT86W2NJ-L5s6VThImXfhqZNAWq8ejW_kr40hL6HDBkfNynAwZnK7xJ-hKdDp_kmB9pvoa0vooRCwTVg==",
    "port": 19000,
    "user": "ymetl"
  },
  "emr_hdfs_default": {
    "host": "emr-header-2",
    "port": 50070,
    "user_name": "ymetl"
  },
  "emr_hive_default": {
    "auth": "LDAP",
    "hdfs_options": {
      "host": "emr-header-2",
      "port": 50070,
      "user_name": "ymetl"
    },
    "hive_conf": {
      "tez.queue.name": "etl"
    },
    "host": "ha.hive.emr.ym",
    "password": "gAAAAABkGXE8L8IcXEm6k2vnkNd8vtapiIFe7vbqTu5ywGJrSkoZy1We4o_hWRElIJ3SCQvHTNsXdEW59qTbybasR2kSqIJ_Ys0lX3_xLIeuZ307qJGFKIE=",
    "port": 10001,
    "user": "ymetl"
  },
  "emr_impala_default": {
    "auth_mechanism": "PLAIN",
    "host": "ha.impala.emr.ym",
    "password": "gAAAAABkGXE8WZQCSss2zXKeBswFEG1Qvdv6QqfFDwfszwu1bgP6ZSd1wRiIXv7tXL8cBWGBeZP1eYqexcwo5Cehor_9lUXjp7YVrAQEXzTjcWa9zlpkK50=",
    "port": 21051,
    "user": "ymetl"
  },
  "hdfs_default": {
    "host": "hdfsnn.ym",
    "port": 50070,
    "user_name": "ymetl"
  },
  "hive_default": {
    "auth": "LDAP",
    "hdfs_options": {
      "host": "hdfsnn.ym",
      "port": 50070,
      "user_name": "ymetl"
    },
    "hive_conf": {
      "spark.yarn.queue": "etl"
    },
    "host": "hive.ym",
    "password": "gAAAAABkGXE8zVq-ZOWQyzTMzP-ogS-TqV8K_gxklD61LmsEZeN54pOBIDpKJD9n5913vD4mZRTEEKzxKunLde9dpVW4u2lbZyepP-YT-tEbqjIfrW-gRUY=",
    "port": 10000,
    "user": "ymetl"
  },
  "impala_default": {
    "auth_mechanism": "PLAIN",
    "host": "ha.impala.ym",
    "password": "gAAAAABkGXE8ciN_A0sQmzh1VOKeTprtjOtp_JPR7yCgZQeZiUD0lQ4dIshzWxLfb_YIqEcxL7uXYyxk0jFVwpFGJQUl8gNaCESFtg_Cei7tLwD4cm5KA9o=",
    "port": 21051,
    "user": "ymetl"
  },
  "mysql_default": {
    "host": "mysql.ym",
    "password": "gAAAAABkGXE8d1AG1mqenBsAOgGx_blaQQUceK0D_R1Vbo-wLe2ZHdHEmW9dSJ4fgYMkwy95-6uSjdXP3RfnankyXgd-BBLyiA==",
    "port": 3306,
    "user": "dev"
  },
  "phoenix_default": {
    "host": "phoenix-etl.ym",
    "port": 8765
  },
  "tidb_default": {
    "host": "tidb-etl.ym",
    "password": "gAAAAABkGXE8-zHHCsScuOqgSB6bVTmA1Mxdl_jp2Z-DprOC5Qh6cHrr33VOREGPyAJH_1Wh5SjHzGC1KmyRO49dbS38-bvQuJo8Z_ReCX1yL1DwlkgwMmv69xAlasFEEBwMP8CirioX",
    "port": 4000,
    "user": "dev"
  }
}
recurvedata/pigeon/docs/images/pigeon_design.png
Binary file
recurvedata/pigeon/docs/lightweight-data-sync-solution.md
@@ -0,0 +1,111 @@

A Lightweight and General Data Synchronization Solution
=======================

Data synchronization between different database systems is a common requirement in the big data field. A typical scenario: the business system uses MySQL for transactions and point queries, the data warehouse uses Hive, and the results produced by ETL are then pushed into systems such as MySQL or AWS Redshift for BI and reporting tools.

First, let's clarify the requirements and goals:

- Timeliness: non-real-time, offline synchronization, typically T+1, with hourly granularity at the finest
- Scalability: must support multiple heterogeneous data sources, such as MySQL, Hive, ElasticSearch, etc.
- Performance: as an offline system it has no hard performance requirement, but it should be as fast as possible and leave room for optimization
- Complexity: low complexity, few dependencies, easy to use and easy to operate
- Functionality: must support both full and incremental synchronization

## Solution

Data synchronization is not a special problem; it really consists of just two operations, read and write. Database backup and restore is similar, and many database systems ship such tools, e.g. MySQL's `mysqldump` and `mysqlimport`, or MongoDB's `mongodump` and `mongorestore`. These tools generally use special encoding formats for performance and do not aim for generality, but a general data synchronization system can be built on the same idea.



The picture above describes the solution of this article: split the work into read and write, and hand data over through CSV files.
## Scalability

The core of this design is to abstract synchronization into two fully decoupled steps, export (read) and import (write), which gives it good extensibility: each data source only has to implement read and write. Taking common data sources as examples, here is how CSV data can be imported (exporting to CSV is easy and can be implemented in any programming language):

| Data source | Import CSV |
| ------------- | ------------------------------------------------------------ |
| MySQL | Use `LOAD DATA LOCAL INFILE` for bulk loading (see the sketch after the table), or read the file and run `INSERT` statements |
| AWS Redshift | Stage the file on AWS S3, then bulk load with the `COPY` command |
| Hive | Create the table with the `org.apache.hadoop.hive.serde2.OpenCSVSerde` serde, or convert the CSV to the default `TEXTFILE` format first; then bulk load with `LOAD DATA [LOCAL] INPATH` |
| ElasticSearch | Read the file and insert in batches |
| FTP, AWS S3 | Upload the file directly |
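As a minimal sketch of the MySQL path in the table above (assuming `pymysql`, a hypothetical `users` table and a local `/tmp/users.csv`), the bulk load is a single statement:

```python
import pymysql

# local_infile must be enabled on both the client and the server
conn = pymysql.connect(host="127.0.0.1", user="dev", password="***",
                       database="demo", local_infile=True)
load_sql = """
    LOAD DATA LOCAL INFILE '/tmp/users.csv'
    INTO TABLE users
    FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
    LINES TERMINATED BY '\\n'
    IGNORE 1 LINES
"""
with conn.cursor() as cur:
    cur.execute(load_sql)   # the server parses and loads the whole file in one round trip
conn.commit()
```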
## Performance Issues

Another benefit of the decoupling is performance optimization: export and import can be tuned independently, without worrying about each other.

### Export Performance

Export performance is usually improved through parallelization: split the data set and process the pieces in parallel.

Taking MySQL as an example, if the table has an auto-increment primary key, first query its lower and upper bounds, split the range into N chunks, and start M threads to consume them (Sqoop takes the same approach, controlling parallelism via the number of mappers). Each thread can write its own file, to be merged afterwards, or a dedicated thread can aggregate and write; the first approach generally performs better.

The premise of this optimization is that the data can be split reasonably evenly. With data skew the improvement may be small, or the job may even degrade to a single thread. For databases, the split column also needs an index; usually an auto-increment primary key or an indexed timestamp is chosen. Parallelism must not be too high, or it may put too much pressure on the upstream system. Another implementation detail: rows should be streamed to the output file rather than pulled entirely into memory, otherwise memory usage may balloon and even end in an OOM.

In addition, since the export process may be interrupted, a checkpoint mechanism can be used to retry from the point of failure.
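A minimal sketch of this range-split, streaming export, assuming a hypothetical `events` table with an `id` primary key and any DB-API driver with the `%s` paramstyle (e.g. `pymysql`) supplied through a `connect` factory:

```python
import csv
from concurrent.futures import ThreadPoolExecutor

def export_range(connect, lo, hi, path, batch=5000):
    conn = connect()
    with conn.cursor() as cur, open(path, "w", newline="") as f:
        writer = csv.writer(f)
        cur.execute("SELECT * FROM events WHERE id >= %s AND id < %s", (lo, hi))
        while True:
            rows = cur.fetchmany(batch)   # stream in batches instead of fetchall() to bound memory
            if not rows:
                break
            writer.writerows(rows)
    conn.close()

def parallel_export(connect, num_splits=8, workers=4):
    conn = connect()
    with conn.cursor() as cur:
        cur.execute("SELECT MIN(id), MAX(id) FROM events")
        lo, hi = cur.fetchone()
    conn.close()
    step = (hi - lo) // num_splits + 1
    bounds = [(lo + i * step, lo + (i + 1) * step) for i in range(num_splits)]
    with ThreadPoolExecutor(max_workers=workers) as pool:   # waits for all slices on exit
        for i, (a, b) in enumerate(bounds):
            pool.submit(export_range, connect, a, b, f"events_part_{i}.csv")
```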
### Import Performance

Import performance is usually improved by batching.

Some databases, such as MySQL, Hive and Redshift, can load CSV files directly, which is generally the most efficient path. If bulk loading is not supported, a batch import API can be used instead (such as ElasticSearch's `/_bulk`; a database's `INSERT` statement usually accepts multiple rows at once). Some targets also accept compressed files (Redshift, for example, supports GZIP and other formats), so compressing before the upload shortens transfer time and saves bandwidth.
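Where bulk loading is unavailable, batched multi-row inserts are a reasonable fallback. A minimal sketch using `executemany`, again assuming a hypothetical `users` table and a driver with the `%s` paramstyle:

```python
import csv

def batched_insert(conn, csv_path, batch_size=1000):
    sql = "INSERT INTO users (id, name, email) VALUES (%s, %s, %s)"
    with open(csv_path, newline="") as f, conn.cursor() as cur:
        reader = csv.reader(f)
        next(reader)                          # skip the header row
        batch = []
        for row in reader:
            batch.append(row)
            if len(batch) >= batch_size:
                cur.executemany(sql, batch)   # one round trip per batch, not per row
                batch.clear()
        if batch:
            cur.executemany(sql, batch)
    conn.commit()
```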
Failed imports can likewise be retried from a checkpoint ("resuming from the breakpoint"), and a deduplication mechanism, such as a Bloom filter check, can be added as a safeguard.
## Complexity

As the design diagram in the previous section shows, this solution has low complexity: the process is clear and easy to implement, and apart from the local file system there are essentially no external dependencies. Pay attention to logging and statistics during implementation, which makes it easier to track progress, analyze problems and locate faults.

## Full and Incremental

In terms of complexity, full synchronization is the easiest to implement and gives the best consistency guarantees. However, as the data volume grows, the resources and time required for each full synchronization grow with it, so incremental synchronization becomes necessary, and it is more complex.

### Incremental Export

The premise of incremental export is being able to identify new data. The simplest way is to rely on an auto-increment primary key, but this depends on the characteristics of the database itself: some databases have no auto-increment keys, and some do not guarantee that they are monotonic (for example [TiDB](<https://pingcap.com/docs/sql/mysql-compatibility/#auto-increment-id>): with multiple tidb-servers, a later insert may receive a smaller ID than an earlier one). Filtering by time is more reliable: time naturally increases and is strictly monotonic, and for periodic incremental synchronization the window can be computed directly from the schedule, so no checkpoint needs to be stored.

A monotonically increasing integer or time column (preferably time) is therefore a precondition for incremental export, and for good export performance that column should also be indexed.
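A minimal sketch of deriving the export window from the schedule itself for a daily T+1 job, with a hypothetical `orders` table and an indexed `updated_at` column:

```python
import datetime as dt

def incremental_window(run_date: dt.date):
    # the window is fully determined by the run date, so no checkpoint is stored
    start = dt.datetime.combine(run_date, dt.time.min)
    end = start + dt.timedelta(days=1)
    return start, end

start, end = incremental_window(dt.date(2023, 3, 20))
sql = "SELECT * FROM orders WHERE updated_at >= %s AND updated_at < %s"
# cursor.execute(sql, (start, end))  # executed with any DB-API cursor
```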
### Incremental Import

Incremental import has more cases to consider, such as the import mode and idempotence.

First, the import mode. It comes in two flavors: merge (`MERGE`) and append (`APPEND`). (There is also a special kind of incremental import, such as loading into one partition of a Hive table, which works the same way as a full import (`OVERWRITE`).)

- `MERGE`: rows added or updated in the upstream system must be synchronized to the target, similar to `UPSERT`

- `APPEND`: the upstream system only inserts and never updates, similar to `INSERT`

`APPEND` is relatively simple to implement, but if the same data is imported more than once it easily produces duplicates when there is no unique constraint (it is not idempotent). In fact, `APPEND` is an extreme case of `MERGE`, so it can be implemented in terms of `MERGE`.

Implementing `MERGE` requires a field that identifies a record uniquely, such as a primary key or a unique constraint (anything that distinguishes records logically will do). Different data sources implement `MERGE` differently. Some support an `UPSERT` operation, e.g. Phoenix, Kudu and MongoDB; indexing a document in ElasticSearch behaves like an `UPSERT` as well; some databases support `REPLACE`; MySQL also has `INSERT ... ON DUPLICATE KEY UPDATE`. For MySQL, Redshift, Hive and other relational stores there is also a general solution: `FULL JOIN`, or `LEFT JOIN + UNION ALL` (see [Talking about Idempotence](http://liyangliang.me/posts/2019/03/idempotence/)).
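A minimal sketch of the MySQL-flavored `MERGE`, assuming the increment has already been bulk-loaded into a hypothetical `orders_staging` table and that `orders.id` is the primary key:

```python
MERGE_SQL = """
INSERT INTO orders (id, status, amount, updated_at)
SELECT id, status, amount, updated_at FROM orders_staging
ON DUPLICATE KEY UPDATE
    status = VALUES(status),
    amount = VALUES(amount),
    updated_at = VALUES(updated_at)
"""

def merge_increment(conn):
    # new ids are inserted, existing ids are updated in place (UPSERT semantics)
    with conn.cursor() as cur:
        cur.execute(MERGE_SQL)
    conn.commit()
```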
This style of incremental import has one limitation: it cannot propagate physical deletes from the upstream system. If that is required, consider switching to soft deletes, or fall back to full synchronization.
### Import Process

Whether full or incremental, the import process needs to guarantee at least two things: "transactionality", and as little disruption as possible to consumers of the target data. These concerns mostly apply to database targets; they rarely matter for ElasticSearch, object storage and similar systems.

"Transactionality" means that the data being imported either all succeeds or all fails; a partial import must not happen.

During the import, the target data should remain available, or the affected window should be as short as possible. For example, long table locks that cause queries to fail must be avoided.

The process can be organized as follows: first load into a staging table, prepare the final result there, and then swap it into place as the target table. The staging load can be deleted and retried freely, which guarantees that the new data is fully in place before the next step. For a full import, the staging table can simply be renamed to the target table, or the data copied over with `INSERT OVERWRITE`. For an incremental import, an intermediate table holds the merged result, and once it is complete the same rename or copy step updates the target table.
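A minimal sketch of the full-import variant for MySQL, where `RENAME TABLE` swaps the staging table into place in a single atomic statement; the table names and the `load_staging` callback are hypothetical:

```python
SWAP_SQL = "RENAME TABLE orders TO orders_old, orders_staging TO orders"

def full_import(conn, load_staging):
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS orders_staging")
        cur.execute("CREATE TABLE orders_staging LIKE orders")
    load_staging(conn)              # retry-able: only the staging table is touched
    with conn.cursor() as cur:
        cur.execute(SWAP_SQL)       # near-instant swap; readers keep using the old table until here
        cur.execute("DROP TABLE orders_old")
    conn.commit()
```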
## Limitations

There are two main limitations: 1. data has to be written to disk; 2. CSV itself.

In some scenarios, writing to disk adds some overhead, but for an offline system the impact should be negligible. Do clean up the files, or the disk will eventually fill up. The bigger issue is that export and import are not completely decoupled: they must run on the same machine and agree on the same file path. This bit of state limits horizontal scaling to some extent (note that only the synchronization of a single table has to stay on one machine; different tables can still be spread across machines).

Using CSV as the data exchange format is a compromise with both advantages and drawbacks. The CSV format is also discussed in [this article](http://liyangliang.me/posts/2019/03/data-encoding/); its shortcomings, in summary:

- It cannot distinguish numbers from strings that happen to consist of digits. This can be solved with an extra schema, e.g. export a schema alongside the data, or rely on the target database's schema at import time.
- It does not support binary data.
- Escaping can be problematic.
- It cannot distinguish an empty string from a null value (`None`, `NULL`); one solution is to use a sentinel such as `\N` for nulls, as sketched below.
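A minimal sketch of the `\N` sentinel, which MySQL's `LOAD DATA` and Hive's default text serde both treat as NULL; the rows and output path are illustrative:

```python
import csv

rows = [(1, "alice"), (2, ""), (3, None)]   # "" is an empty string, None is NULL
with open("/tmp/out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for pk, name in rows:
        # write the literal two characters backslash-N for NULL, keep "" as-is
        writer.writerow([pk, r"\N" if name is None else name])
```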
Overall, CSV should cover more than 90% of the use cases.

Using Kafka as the data exchange bus would remove these limitations, but it also adds complexity to the system. Choose according to the actual situation.
recurvedata/pigeon/dumper/__init__.py
@@ -0,0 +1,171 @@
from recurvedata.pigeon.connector import get_connector
from recurvedata.pigeon.handler.csv_handler import CSVFileHandler, create_csv_file_handler_factory


def new_to_csv_dumper(
    dbtype, connection=None, database=None, connector=None, filename=None, transformer=None, hive=False, **dumper_kwargs
):
    if connector is None:
        connector = get_connector(dbtype, connection=connection, database=database)

    handler_factory_params = ["merge_files", "encoding", "write_header"] + CSVFileHandler.ERROR_HANDLE_PARAMS
    factory_options = dict(filename=filename, hive=hive, transformer=transformer)
    for p in handler_factory_params:
        if p in dumper_kwargs:
            factory_options[p] = dumper_kwargs.pop(p)
    factory = create_csv_file_handler_factory(**factory_options)
    dumper_kwargs.setdefault("handler_factories", [factory])

    row_factory = dumper_kwargs.pop("row_factory", None)

    if dbtype == "cassandra":
        from .cass import CassandraDumper

        dumper = CassandraDumper(connector, **dumper_kwargs)
    else:
        from .dbapi import DBAPIDumper

        dumper = DBAPIDumper(connector, **dumper_kwargs)

    if row_factory is not None:
        dumper.row_factory = row_factory

    return dumper


def new_tidb_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("tidb", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_tidb_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("tidb", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_mysql_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("mysql", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_mysql_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("mysql", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_redshift_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("redshift", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_redshift_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("redshift", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_impala_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("impala", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_impala_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("impala", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_phoenix_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("phoenix", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_phoenix_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("phoenix", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_clickhouse_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("clickhouse", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_clickhouse_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("clickhouse", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_cassandra_to_csv_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("cassandra", connection, database, filename, transformer, hive=False, **dumper_kwargs)


def new_cassandra_to_hive_dumper(connection=None, database=None, filename=None, transformer=None, **dumper_kwargs):
    return new_to_csv_dumper("cassandra", connection, database, filename, transformer, hive=True, **dumper_kwargs)


def new_elasticsearch_to_csv_dumper(hosts=None, filename=None, transformer=None, **dumper_kwargs):
    from recurvedata.pigeon.dumper.es import ElasticSearchDumper

    factory = create_csv_file_handler_factory(filename=filename, transformer=transformer)
    dumper_kwargs.setdefault("handler_factories", [factory])
    dumper = ElasticSearchDumper(connector=get_connector("es", host=hosts), **dumper_kwargs)
    return dumper


def new_elasticsearch_to_hive_dumper(hosts=None, filename=None, transformer=None, **dumper_kwargs):
    from recurvedata.pigeon.dumper.es import ElasticSearchDumper

    factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=True)
    dumper_kwargs.setdefault("handler_factories", [factory])
    dumper = ElasticSearchDumper(connector=get_connector("es", host=hosts), **dumper_kwargs)
    return dumper


def new_ftp_dumper(conf=None, **dumper_kwargs):
    from recurvedata.pigeon.dumper.ftp import FtpDumper

    dumper = FtpDumper(connector=get_connector("ftp", conf=conf), **dumper_kwargs)
    return dumper


def new_mongodb_to_csv_dumper(connection=None, filename=None, transformer=None, **dumper_kwargs):
    from recurvedata.pigeon.dumper.mongodb import MongoDBDumper

    factory = create_csv_file_handler_factory(filename=filename, transformer=transformer)
    dumper_kwargs.setdefault("handler_factories", [factory])
    dumper = MongoDBDumper(connector=get_connector("mongodb", connection=connection), **dumper_kwargs)
    return dumper


def new_mongodb_to_hive_dumper(connection=None, filename=None, transformer=None, **dumper_kwargs):
    from recurvedata.pigeon.dumper.mongodb import MongoDBDumper

    factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=True)
    dumper_kwargs.setdefault("handler_factories", [factory])
    dumper = MongoDBDumper(connector=get_connector("mongodb", connection=connection), **dumper_kwargs)
    return dumper


def new_google_bigquery_to_csv_dumper(
    filename=None,
    transformer=None,
    key_path=None,
    key_dict=None,
    proxies=None,
    location=None,
    hive=False,
    **dumper_kwargs,
):
    from recurvedata.pigeon.connector import new_google_bigquery_connector
    from recurvedata.pigeon.dumper.dbapi import DBAPIDumper

    connector = new_google_bigquery_connector(key_path=key_path, key_dict=key_dict, proxies=proxies, location=location)
    factory = create_csv_file_handler_factory(filename=filename, transformer=transformer, hive=hive, encoding="utf-8")
    dumper_kwargs.setdefault("handler_factories", [factory])
    dumper = DBAPIDumper(connector, **dumper_kwargs)
    row_factory = dumper_kwargs.pop("row_factory", None)
    if row_factory is not None:
        dumper.row_factory = row_factory
    return dumper


def new_clickhouse_native_to_csv_dumper(
    connection=None, database=None, filename=None, transformer=None, **dumper_kwargs
):
    return new_to_csv_dumper(
        "clickhouse_native", connection, database, filename, transformer, hive=False, **dumper_kwargs
    )


def new_clickhouse_native_to_hive_dumper(
    connection=None, database=None, filename=None, transformer=None, **dumper_kwargs
):
    return new_to_csv_dumper(
        "clickhouse_native", connection, database, filename, transformer, hive=True, **dumper_kwargs
    )