recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Potentially problematic release: this version of recurvedata-lib has been flagged as potentially problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/mongodb.py (new file)

@@ -0,0 +1,56 @@
import datetime
from collections import OrderedDict

import bson
import pymongo

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.schema import Schema, types
from recurvedata.pigeon.utils import LoggingMixin


@register_connector_class(['mongodb'])
class MongoDBConnector(LoggingMixin):
    def __init__(self, host=None, port=None, **kwargs):
        self.host = host
        self.port = port

        kwargs.setdefault('document_class', OrderedDict)
        self.kwargs = kwargs

    def connect(self, **kwargs):
        opts = self.kwargs.copy()
        opts.update(kwargs)

        # drop any option that pymongo does not recognize
        for k, v in opts.copy().items():
            try:
                pymongo.common.validate(k, v)
            except pymongo.errors.ConfigurationError:
                opts.pop(k)

        return pymongo.MongoClient(host=self.host, port=self.port, **opts)

    def infer_schema(self, doc: dict):
        schema = Schema()
        for field, value in doc.items():
            schema.add_field_by_attrs(field, self._infer_data_type(value))
        return schema

    def _infer_data_type(self, value):
        # check bool before int: bool is a subclass of int in Python, so
        # testing int first would classify True/False as INT64
        if isinstance(value, bool):
            return types.BOOLEAN
        if isinstance(value, float):
            return types.FLOAT64
        if isinstance(value, int):
            return types.INT64
        if isinstance(value, (str, bson.ObjectId)):
            return types.STRING
        if isinstance(value, datetime.datetime):
            return types.DATETIME

        if isinstance(value, (list, dict)):
            # will be serialized as JSON
            return types.JSON

        # treat every other type as a string
        return types.STRING
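
To make the schema inference concrete, here is a minimal usage sketch (the connection details and the sample document are hypothetical; only the MongoDBConnector methods above are assumed):

from recurvedata.pigeon.connector.mongodb import MongoDBConnector

connector = MongoDBConnector(host='localhost', port=27017)
client = connector.connect()
# hypothetical sample document:
#   {'_id': ObjectId('...'), 'qty': 3, 'price': 9.99, 'tags': ['a', 'b']}
doc = client['shop']['orders'].find_one()
schema = connector.infer_schema(doc)
# for the sample above, inference yields:
#   _id -> STRING, qty -> INT64, price -> FLOAT64, tags -> JSON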
recurvedata/pigeon/connector/mssql.py (new file)

@@ -0,0 +1,467 @@
import datetime
import os
import urllib.parse
from collections import OrderedDict

import cytoolz as toolz
import pyodbc

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.connector.azure_blob import AzureBlobConnector
from recurvedata.pigeon.connector.dbapi import DBAPIConnector
from recurvedata.pigeon.schema import types
from recurvedata.pigeon.utils import fs, md5hash, safe_int

# https://github.com/mkleehammer/pyodbc/wiki/Cursor#description
# The 'type code' value is the class type used to create the Python objects when reading rows.
# For example, a varchar column's type will be str.
_mssql_type_to_canonical_type = {
    int: types.INT64,
    float: types.FLOAT64,
    bool: types.BOOLEAN,
    datetime.datetime: types.DATETIME,
    str: types.STRING,
}

_canonical_type_to_mssql_type = {
    types.BOOLEAN: "BIT",
    types.INT8: "TINYINT",
    types.INT16: "SMALLINT",
    types.INT32: "INT",
    types.INT64: "BIGINT",
    types.FLOAT32: "REAL",
    types.FLOAT64: "DOUBLE PRECISION",
    types.DATE: "DATE",
    types.DATETIME: "DATETIME",
    # use NVARCHAR (national character varying) to support Unicode
    types.STRING: "NVARCHAR",
    types.JSON: "NVARCHAR",
}


@register_connector_class("mssql")
class SQLServerConnector(DBAPIConnector):
    _sqla_driver = "mssql+pyodbc"
    _identifier_start_quote = "["
    _identifier_end_quote = "]"
    _param_placeholder = "?"
    _default_port = 1433
    _autocommit = False

    def __init__(
        self,
        host=None,
        port=None,
        database=None,
        user=None,
        password=None,
        conn_string=None,
        schema=None,
        odbc_driver: str = "ODBC Driver 18 for SQL Server",
        encrypt: bool = True,
        trust_server_certificate: bool = False,
        *args,
        **kwargs,
    ):
        super().__init__(host, port, database, user, password, schema, *args, **kwargs)
        self.odbc_driver = odbc_driver
        self.encrypt = encrypt
        self.trust_server_certificate = trust_server_certificate
        if conn_string:
            attrs = self.parse_conn_string(conn_string)
            for k, v in attrs.items():
                setattr(self, k, v)

    @property
    def conn_string(self):
        # TODO: reuse the relevant parameters from the conn string that was passed in
        options = OrderedDict(
            {
                "Driver": f"{self.odbc_driver}",
                "Server": f"tcp:{self.host},{self.port}",
                "Database": self.database,
                "Uid": self.user,
                "Pwd": "{%s}" % self.password,
                "Encrypt": "yes" if self.encrypt else "no",
                "TrustServerCertificate": "yes" if self.trust_server_certificate else "no",
                "Connection Timeout": 30,
            }
        )
        options.update(self.kwargs.get("odbc_options", {}))
        return ";".join([f"{k}={v}" for k, v in options.items()])

    @staticmethod
    def parse_conn_string(conn_string: str):
        parts = conn_string.strip(";").split(";")
        kvs = {}
        for p in parts:
            k, v = p.split("=", 1)  # split on the first '=' only; values may contain '='
            kvs[k.lower()] = v

        # "Server" has the form "tcp:<host>,<port>"
        server = kvs["server"].split(":")[1].split(",")
        return {
            "host": server[0],
            "port": int(server[1]),
            "user": kvs["uid"],
            "password": kvs["pwd"][1:-1],  # remove the leading and trailing {}
            "database": kvs["database"],
        }
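
A quick illustration of the connection-string round trip above, with made-up credentials (a standalone sketch, not part of the file; the class definition continues below):

s = ('Driver=ODBC Driver 18 for SQL Server;Server=tcp:db.example.com,1433;'
     'Database=sales;Uid=reader;Pwd={s3cret};Encrypt=yes')
attrs = SQLServerConnector.parse_conn_string(s)
# attrs == {'host': 'db.example.com', 'port': 1433,
#           'user': 'reader', 'password': 's3cret', 'database': 'sales'}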
    def connect_impl(self, autocommit=None, *args, **kwargs):
        if autocommit is None:
            autocommit = self._autocommit
        return pyodbc.connect(self.conn_string, autocommit=autocommit)

    def cursor(self, autocommit=None, dryrun=False, commit_on_close=True, **kwargs):
        if autocommit is None:
            autocommit = self._autocommit
        return super().cursor(autocommit, dryrun, commit_on_close, **kwargs)

    def has_schema(self, schema):
        rv = self.fetchone(f"SELECT * FROM sys.schemas WHERE name='{schema}'")
        return bool(rv)

    def has_table(self, table, schema=None, **kwargs):
        schema, table = self._get_schema_table(table, schema)
        schema = schema or "dbo"
        query = f"""
            SELECT name FROM sys.tables
            WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
        """
        return bool(self.fetchall(query))

    def create_schema(self, schema):
        with self.cursor() as cursor:
            cursor.execute(f"SELECT * FROM sys.schemas WHERE name='{schema}'")
            exists = bool(cursor.fetchall())
            if not exists:
                cursor.execute(f"CREATE SCHEMA {self.quote_identifier(schema)}")

    def create_master_key(self):
        queries = """
            IF NOT EXISTS (SELECT * FROM sys.symmetric_keys)
                CREATE MASTER KEY
        """
        self.execute(queries)

    def get_columns(self, table, schema=None, exclude=None):
        schema, table = self._get_schema_table(table, schema)
        if not self.has_table(table=table, schema=schema):
            raise ValueError(f"Table {schema}.{table} does not exist")

        # the table/view name may be case-sensitive
        query = f"""
            SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS
            WHERE table_schema='{schema}' AND table_name = '{table}'
            ORDER BY ordinal_position
        """
        rv = self.fetchall(query)
        cols = [x[0] for x in rv]
        if exclude:
            cols = [x for x in cols if x not in exclude]
        return cols

    def drop_table_if_exists(self, schema, table, external_table=False):
        schema, table = self._get_schema_table(table, schema)
        external = " EXTERNAL " if external_table else " "
        queries = f"""
            IF EXISTS (
                SELECT * FROM sys.tables WHERE SCHEMA_NAME(schema_id) = '{schema}' AND name = '{table}'
            )
            DROP{external}TABLE {schema}.{table}
        """
        self.execute(queries)

    def load_csv(
        self,
        table,
        filename,
        schema="dbo",
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        using_insert=None,
        **kwargs,
    ):
        if not using_insert:
            try:
                options = dict(
                    columns=columns,
                    delimiter=delimiter,
                    quotechar=quotechar,
                    lineterminator=lineterminator,
                    escapechar=escapechar,
                    skiprows=skiprows,
                )
                options.update(**kwargs)
                self.load_csv_bulk(table, filename, schema, **options)
            except Exception as e:
                self.logger.warning("bulk load local file is not supported, apply INSERT instead. error: %s", e)
            else:
                return

        # SQL Server limits the number of parameters per statement
        # https://docs.microsoft.com/en-us/sql/sql-server/maximum-capacity-specifications-for-sql-server
        num_params_limit = 2100 - 1
        if not columns:
            columns = self.get_columns(table=table, schema=schema)

        batch_size = kwargs.get("batch_size", 1000)
        new_batch_size = int(min(num_params_limit / len(columns), batch_size))
        self.logger.info(
            "table has %s columns, adjust batch_size from %s to %s", len(columns), batch_size, new_batch_size
        )
        kwargs["batch_size"] = new_batch_size

        table = self._format_table_name(table, schema)
        self.load_csv_by_inserting(
            table, filename, columns, delimiter, quotechar, lineterminator, escapechar, skiprows, **kwargs
        )

    def load_csv_bulk(
        self,
        table,
        filename,
        schema="dbo",
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        raise NotImplementedError

    def _format_table_name(self, table, schema):
        if schema and "." not in table:
            table = self.quote_identifier(f"{schema}.{table}")
        return table

    def _get_schema_table(self, table, schema):
        if "." in table:
            schema, table = table.split(".")
        if not schema:
            schema = "dbo"
        return schema, table

    @staticmethod
    def to_canonical_type(type_code, size):
        return _mssql_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        if canonical_type == types.STRING:
            # reserving 4 bytes per character is the safe choice
            # https://docs.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-2017#arguments
            # max indicates that the maximum storage size is 2^30-1 characters
            size = safe_int(size) * 4
            if size > 4000 or size == 0:
                size = "max"
            mssql_type = f"NVARCHAR({size})"
        else:
            mssql_type = _canonical_type_to_mssql_type.get(canonical_type, "NVARCHAR(200)")
        return mssql_type

    def generate_ddl(self, table, schema="dbo", database=None, if_exists=True):
        schema, table = self._get_schema_table(table, schema)
        if not self.has_table(table, schema):
            raise ValueError(f"Table {table!r} does not exist in {database!r}")

        query = f"""
            SELECT column_name, data_type, character_maximum_length, is_nullable
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE table_schema = '{schema}' AND table_name = '{table}'
            ORDER BY ordinal_position
        """
        with self.cursor() as cursor:
            cursor.execute(query)
            columns = cursor.fetchall()

        col_definitions = []
        # each row: column_name, data_type, character_maximum_length, is_nullable
        for col in columns:
            dtype = col.data_type
            if col.character_maximum_length:
                dtype = f"{dtype}({col.character_maximum_length})"
            null_modifier = "DEFAULT" if col.is_nullable == "YES" else "NOT"
            definition = f"[{col.column_name}] {dtype.upper()} {null_modifier} NULL"
            col_definitions.append(definition)

        body = ",\n\t\t\t\t".join(col_definitions)
        ddl = f"""
            CREATE TABLE [{schema}].[{table}] (
                {body}
            )
        """
        if if_exists:
            ddl = f"""
                IF NOT EXISTS (
                    SELECT * FROM sys.tables
                    WHERE schema_name(schema_id) = '{schema}' AND name = '{table}'
                )
                {ddl}
            """
        return ddl

    def is_mssql(self):
        return True

    def _get_sqlalchemy_uri(self):
        return "mssql+pyodbc:///?odbc_connect=%s" % urllib.parse.quote_plus(self.conn_string)


# kept for compatibility with legacy code
MSSQLConnector = SQLServerConnector
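
The batch-size adjustment in load_csv above is driven by SQL Server's cap of 2100 parameters per statement; here is a standalone sketch of the same arithmetic (the helper name is ours, not the package's):

NUM_PARAMS_LIMIT = 2100 - 1  # SQL Server allows at most 2100 parameters per statement

def adjusted_batch_size(num_columns, requested=1000):
    # each row of a parameterized multi-row INSERT consumes one parameter
    # per column, so rows_per_batch * num_columns must stay under the cap
    return int(min(NUM_PARAMS_LIMIT / num_columns, requested))

assert adjusted_batch_size(5) == 419     # 2099 / 5 = 419.8, truncated
assert adjusted_batch_size(2) == 1000    # the cap is not binding here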
class BaseAzureSQLConnector(SQLServerConnector):
    """Base class for Azure SQL based connectors (Synapse and Fabric).

    Provides common functionality for Azure SQL services.

    reference:
    - https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql?view=fabric
    - https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql?view=azure-sqldw-latest
    """

    def _get_credential(self, blob: AzureBlobConnector) -> str:
        """Get the Azure Blob Storage credential for the COPY INTO command.

        Args:
            blob: Azure Blob Storage connector instance

        Returns:
            str: Credential clause for the COPY INTO command
        """
        if blob.account_key:
            return f"CREDENTIAL=(IDENTITY= 'Storage Account Key', SECRET='{blob.account_key}'),"
        elif blob.sas_token:
            return f"CREDENTIAL=(IDENTITY= 'Shared Access Signature', SECRET='{blob.sas_token}'),"
        else:
            return ""

    def load_csv_bulk(
        self,
        table: str,
        filename: str,
        schema="dbo",
        columns=None,
        delimiter=",",
        quotechar='"',
        lineterminator="\r\n",
        escapechar=None,
        skiprows=0,
        **kwargs,
    ):
        """Bulk load data using the COPY command for Azure SQL services.

        Args:
            table: Target table name
            filename: Source file path
            schema: Schema name
            columns: List of column names
            delimiter: Field delimiter
            quotechar: Quote character
            lineterminator: Line terminator
            escapechar: Escape character
            skiprows: Number of rows to skip
            **kwargs: Additional arguments
        """
        blob = self.create_blob_connector()
        if not blob:
            raise RuntimeError("blob storage is not configured")

        # upload; gzip the file first unless it is already compressed
        if filename.endswith(".gz"):
            file_to_upload = filename
        else:
            self.logger.info("compressing file %s", filename)
            file_to_upload = fs.gzip_compress(filename, using_cmd=True)

        if "." in table:
            schema, table = table.split(".")

        container = self.kwargs.get("blob_options", {}).get("container_name", self._generate_blob_container_name())
        blob.create_container(container)
        blob_name = f"{self.database}/{schema}/{table}/{os.path.basename(file_to_upload)}"
        self.logger.info(f"uploading {file_to_upload} to {container}/{blob_name}")
        blob_path = blob.upload(container, file_to_upload, blob_name)

        if columns:
            column_list = f'({", ".join(columns)})'
        else:
            column_list = ""

        query = f"""
            COPY INTO {self.quote_identifier(schema)}.{self.quote_identifier(table)} {column_list}
            FROM '{blob.get_url(container, blob_name)}'
            WITH (
                FILE_TYPE = 'CSV',
                {self._get_credential(blob)}
                COMPRESSION = 'Gzip',
                FIELDQUOTE = '{quotechar}',
                FIELDTERMINATOR = '{delimiter}',
                ROWTERMINATOR = '{lineterminator}',
                FIRSTROW = {skiprows + 1}
            )
            OPTION (LABEL = 'COPY {schema}.{table}')
        """
        try:
            self.logger.info("running COPY command")
            self.execute(query, autocommit=False, commit_on_close=True)
            self.logger.info("COPY finished")
        except Exception:
            self.logger.exception("failed to copy data to database")
            raise
        finally:
            if file_to_upload != filename:
                self.logger.info("delete %s", file_to_upload)
                fs.remove_files_safely(file_to_upload)

            self.logger.info(f"delete blob: {blob_path}")
            try:
                blob.delete_blob(container, blob_name)
            except Exception as e:
                self.logger.error(f"operation on blob storage fails: {e}")

    @toolz.memoize
    def create_blob_connector(self):
        """Create the blob connector, or return None if blob storage is not configured."""
        blob_options = self.kwargs.get("blob_options")
        if not blob_options:
            return None
        return AzureBlobConnector(**blob_options)

    def _generate_blob_container_name(self):
        """Generate a blob container name that follows Azure naming rules:

        - 3-63 characters long
        - lowercase letters, numbers, and hyphens only
        - must start and end with a letter or number
        - no consecutive hyphens
        """
        # take the instance name and limit its length to 20 characters
        instance = self.host.split(".", 1)[0][:20]
        # remove any non-alphanumeric characters and convert to lowercase
        instance = "".join(c for c in instance if c.isalnum()).lower()
        # ensure the instance part is not empty
        if not instance:
            instance = "default"
        # build the container name with a fixed prefix and a short host hash
        container_name = f"pigeon-{instance}-{md5hash(self.host)[:8]}"
        # keep the total length within the 63-character limit
        if len(container_name) > 63:
            container_name = container_name[:63]
        # make sure the name ends with an alphanumeric character
        while not container_name[-1].isalnum():
            container_name = container_name[:-1]
        return container_name


@register_connector_class("azure_mssql")
class AzureSQLServerConnector(BaseAzureSQLConnector):
    pass
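
For intuition about _generate_blob_container_name, a self-contained sketch of the same transformation (the host value is hypothetical, and hashlib stands in for the package's md5hash helper, which we assume returns a hex MD5 digest):

import hashlib

host = 'myserver-01.database.windows.net'  # hypothetical host
instance = ''.join(c for c in host.split('.', 1)[0][:20] if c.isalnum()).lower()
container = f'pigeon-{instance}-{hashlib.md5(host.encode()).hexdigest()[:8]}'[:63]
# container == 'pigeon-myserver01-<8 hex chars>': lowercase letters, digits
# and hyphens only, starting and ending with an alphanumeric character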
recurvedata/pigeon/connector/mysql.py (new file)

@@ -0,0 +1,175 @@
import re

import cytoolz as toolz
import pymysql
import sqlalchemy
import sqlalchemy.engine.url
from pymysql.constants import FIELD_TYPE
from pymysql.converters import escape_string

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.connector.dbapi import DBAPIConnector, _ShowTableLikeMixin
from recurvedata.pigeon.schema import types
from recurvedata.pigeon.utils import fs, safe_int

_mysql_type_to_canonical_type = {
    FIELD_TYPE.TINY: types.INT8,
    FIELD_TYPE.SHORT: types.INT16,
    FIELD_TYPE.LONG: types.INT32,
    FIELD_TYPE.LONGLONG: types.INT64,
    FIELD_TYPE.INT24: types.INT64,
    FIELD_TYPE.FLOAT: types.FLOAT32,
    FIELD_TYPE.DOUBLE: types.FLOAT64,
    FIELD_TYPE.DECIMAL: types.FLOAT64,
    FIELD_TYPE.NEWDECIMAL: types.FLOAT64,

    FIELD_TYPE.TIMESTAMP: types.DATETIME,
    FIELD_TYPE.DATETIME: types.DATETIME,
    FIELD_TYPE.DATE: types.DATE,

    # others: types.STRING
}

_canonical_type_to_mysql_type = {
    types.BOOLEAN: 'TINYINT',
    types.INT8: 'TINYINT',
    types.INT16: 'SMALLINT',
    types.INT32: 'INT',
    types.INT64: 'BIGINT',
    types.FLOAT32: 'FLOAT',
    types.FLOAT64: 'DOUBLE',

    types.DATE: 'DATE',
    types.DATETIME: 'DATETIME',

    types.STRING: 'TEXT',
    types.JSON: 'TEXT',
}


@register_connector_class(['mysql', 'tidb'])
class MySQLConnector(_ShowTableLikeMixin, DBAPIConnector):
    _sqla_driver = 'mysql+pymysql'
    _sqla_url_query = {'charset': 'utf8mb4'}
    _default_port = 3306

    def connect_impl(self, autocommit=False, *args, **kwargs):
        kwargs.setdefault('cursorclass', pymysql.cursors.SSCursor)
        return pymysql.connect(host=self.host,
                               port=self.port or 3306,
                               user=self.user,
                               password=self.password,
                               database=self.database,
                               charset='utf8mb4',
                               autocommit=autocommit,
                               *args, **kwargs)

    def _get_sqlalchemy_uri(self):
        url = sqlalchemy.engine.url.URL(drivername=self._sqla_driver, host=self.host, port=self.port,
                                        username=self.user, password=self.password,
                                        database=self.database or '',
                                        query=self._sqla_url_query)
        return url.__to_string__(hide_password=False)

    @classmethod
    def escape_string(cls, v):
        return escape_string(v)

    def load_csv(self, table, filename, columns=None, delimiter=',', quotechar='"',
                 lineterminator='\r\n', escapechar=None, skiprows=0, using_insert=False, **kwargs):
        table = self.quote_identifier(table)
        if using_insert:
            method = self.load_csv_by_inserting
        else:
            if self.is_tidb():
                method = self._load_csv_tidb
            else:
                method = self._load_csv_mysql
        return method(table, filename, columns,
                      delimiter, quotechar, lineterminator, escapechar,
                      skiprows=skiprows, **kwargs)

    def _load_csv_mysql(self, table, filename, columns=None, delimiter=',', quotechar='"',
                        lineterminator='\r\n', escapechar=None, skiprows=0, **kwargs):
        if columns:
            cols = '({})'.format(', '.join(columns))
        else:
            cols = ''

        escape = "ESCAPED BY '{}'".format(escape_string(escapechar)) if escapechar else ''
        lineterminator = escape_string(lineterminator)
        ignore_lines = f'IGNORE {skiprows} LINES' if skiprows else ''
        query = f'''
        LOAD DATA LOCAL INFILE '{filename}'
        INTO TABLE {table}
        FIELDS TERMINATED BY '{delimiter}' ENCLOSED BY '{quotechar}' {escape}
        LINES TERMINATED BY '{lineterminator}'
        {ignore_lines}
        {cols}
        '''.strip()

        self._log(query)
        with self.cursor(local_infile=True) as cursor:
            cursor.execute(query)

    def _load_csv_tidb(self, table, filename, columns=None, delimiter=',', quotechar='"',
                       lineterminator='\r\n', escapechar=None, skiprows=0, **kwargs):
        # skip the header rows in the file itself instead of via IGNORE ... LINES
        infile = filename
        if skiprows:
            infile = fs.skip_lines(filename, skiprows)
        self._load_csv_mysql(table, infile, columns,
                             delimiter, quotechar, lineterminator, escapechar,
                             skiprows=0, **kwargs)
        if infile != filename:
            fs.remove_files_safely(infile)

    def is_mysql(self):
        return True

    @toolz.memoize
    def is_tidb(self):
        with self.cursor() as cursor:
            try:
                cursor.execute('SELECT tidb_version()')
                cursor.fetchall()
                return True
            except Exception:
                return False

    @staticmethod
    def to_canonical_type(type_code, size):
        return _mysql_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        if canonical_type == types.STRING:
            mysql_type = 'TEXT'
            size = safe_int(size)

            # utf8mb4 uses up to 4 bytes per character
            # 255 / 4 = 63
            if 0 < size < 63:
                mysql_type = 'VARCHAR(255)'
            # 65535 / 4 = 16383
            elif size >= 16383:
                # MEDIUMTEXT is large enough
                mysql_type = 'MEDIUMTEXT'
        else:
            mysql_type = _canonical_type_to_mysql_type.get(canonical_type, 'TEXT')
        return mysql_type

    def generate_ddl(self, table, database=None, if_exists=True):
        if database is None:
            database = self.database
        if not self.has_table(table, database):
            raise ValueError(f'Table {table!r} does not exist in {database!r}')

        with self.cursor() as cursor:
            cursor.execute(f'USE {self.quote_identifier(database)}')
            cursor.execute(f'SHOW CREATE TABLE {self.quote_identifier(table)}')
            if_exists_stmt = ' IF NOT EXISTS ' if if_exists else ' '
            body = re.search(r'CREATE TABLE (.*)', cursor.fetchall()[0][1], flags=re.S).group(1)
            return f'CREATE TABLE{if_exists_stmt}{body}'


TiDBConnector = MySQLConnector
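
To see the STRING sizing rules in one place, a short sketch exercising from_canonical_type (assuming safe_int coerces its argument to an int, defaulting to 0):

from recurvedata.pigeon.connector.mysql import MySQLConnector
from recurvedata.pigeon.schema import types

# 0 < size < 63   -> VARCHAR(255)   (63 is roughly 255 bytes / 4 bytes per utf8mb4 char)
# size >= 16383   -> MEDIUMTEXT     (16383 is roughly 65535 / 4)
# otherwise       -> TEXT
for size, expected in [(10, 'VARCHAR(255)'), (100, 'TEXT'), (20000, 'MEDIUMTEXT')]:
    assert MySQLConnector.from_canonical_type(types.STRING, size) == expected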