PyPI - recurvedata-lib - Versions diffs - 0.1.487__py2.py3-none-any.whl - Mend

recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show

recurvedata/__init__.py +0 -0
recurvedata/__version__.py +1 -0
recurvedata/client/__init__.py +3 -0
recurvedata/client/client.py +150 -0
recurvedata/client/server_client.py +91 -0
recurvedata/config.py +99 -0
recurvedata/connectors/__init__.py +20 -0
recurvedata/connectors/_register.py +46 -0
recurvedata/connectors/base.py +111 -0
recurvedata/connectors/config_schema.py +1575 -0
recurvedata/connectors/connectors/__init__.py +0 -0
recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
recurvedata/connectors/connectors/auth.py +44 -0
recurvedata/connectors/connectors/azure_blob.py +89 -0
recurvedata/connectors/connectors/azure_synapse.py +79 -0
recurvedata/connectors/connectors/bigquery.py +359 -0
recurvedata/connectors/connectors/clickhouse.py +219 -0
recurvedata/connectors/connectors/dingtalk.py +61 -0
recurvedata/connectors/connectors/doris.py +215 -0
recurvedata/connectors/connectors/es.py +62 -0
recurvedata/connectors/connectors/feishu.py +65 -0
recurvedata/connectors/connectors/ftp.py +50 -0
recurvedata/connectors/connectors/generic.py +49 -0
recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
recurvedata/connectors/connectors/google_service_account.py +225 -0
recurvedata/connectors/connectors/hive.py +207 -0
recurvedata/connectors/connectors/impala.py +210 -0
recurvedata/connectors/connectors/jenkins.py +51 -0
recurvedata/connectors/connectors/mail.py +89 -0
recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
recurvedata/connectors/connectors/mongo.py +79 -0
recurvedata/connectors/connectors/mssql.py +131 -0
recurvedata/connectors/connectors/mysql.py +191 -0
recurvedata/connectors/connectors/n8n.py +141 -0
recurvedata/connectors/connectors/oss.py +74 -0
recurvedata/connectors/connectors/owncloud.py +36 -0
recurvedata/connectors/connectors/phoenix.py +36 -0
recurvedata/connectors/connectors/postgres.py +230 -0
recurvedata/connectors/connectors/python.py +50 -0
recurvedata/connectors/connectors/redshift.py +187 -0
recurvedata/connectors/connectors/s3.py +93 -0
recurvedata/connectors/connectors/sftp.py +87 -0
recurvedata/connectors/connectors/slack.py +35 -0
recurvedata/connectors/connectors/spark.py +99 -0
recurvedata/connectors/connectors/starrocks.py +175 -0
recurvedata/connectors/connectors/tencent_cos.py +40 -0
recurvedata/connectors/connectors/tidb.py +49 -0
recurvedata/connectors/const.py +315 -0
recurvedata/connectors/datasource.py +189 -0
recurvedata/connectors/dbapi.py +469 -0
recurvedata/connectors/fs.py +66 -0
recurvedata/connectors/ftp.py +40 -0
recurvedata/connectors/object_store.py +60 -0
recurvedata/connectors/pigeon.py +172 -0
recurvedata/connectors/proxy.py +104 -0
recurvedata/connectors/service.py +223 -0
recurvedata/connectors/utils.py +47 -0
recurvedata/consts.py +49 -0
recurvedata/core/__init__.py +0 -0
recurvedata/core/config.py +46 -0
recurvedata/core/configurable.py +27 -0
recurvedata/core/consts.py +2 -0
recurvedata/core/templating.py +206 -0
recurvedata/core/tracing.py +223 -0
recurvedata/core/transformer.py +186 -0
recurvedata/core/translation.py +91 -0
recurvedata/dbt/client.py +97 -0
recurvedata/dbt/consts.py +99 -0
recurvedata/dbt/cosmos_utils.py +275 -0
recurvedata/dbt/error_codes.py +18 -0
recurvedata/dbt/schemas.py +98 -0
recurvedata/dbt/service.py +451 -0
recurvedata/dbt/utils.py +246 -0
recurvedata/error_codes.py +71 -0
recurvedata/exceptions.py +72 -0
recurvedata/executors/__init__.py +4 -0
recurvedata/executors/cli/__init__.py +7 -0
recurvedata/executors/cli/connector.py +117 -0
recurvedata/executors/cli/dbt.py +118 -0
recurvedata/executors/cli/main.py +82 -0
recurvedata/executors/cli/parameters.py +18 -0
recurvedata/executors/client.py +190 -0
recurvedata/executors/consts.py +50 -0
recurvedata/executors/debug_executor.py +100 -0
recurvedata/executors/executor.py +300 -0
recurvedata/executors/link_executor.py +189 -0
recurvedata/executors/models.py +34 -0
recurvedata/executors/schemas.py +222 -0
recurvedata/executors/service/__init__.py +0 -0
recurvedata/executors/service/connector.py +380 -0
recurvedata/executors/utils.py +172 -0
recurvedata/filestorage/__init__.py +11 -0
recurvedata/filestorage/_factory.py +33 -0
recurvedata/filestorage/backends/__init__.py +0 -0
recurvedata/filestorage/backends/fsspec.py +45 -0
recurvedata/filestorage/backends/local.py +67 -0
recurvedata/filestorage/backends/oss.py +56 -0
recurvedata/filestorage/interface.py +84 -0
recurvedata/operators/__init__.py +10 -0
recurvedata/operators/base.py +28 -0
recurvedata/operators/config.py +21 -0
recurvedata/operators/context.py +255 -0
recurvedata/operators/dbt_operator/__init__.py +2 -0
recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
recurvedata/operators/dbt_operator/operator.py +353 -0
recurvedata/operators/link_operator/__init__.py +1 -0
recurvedata/operators/link_operator/operator.py +120 -0
recurvedata/operators/models.py +55 -0
recurvedata/operators/notify_operator/__init__.py +1 -0
recurvedata/operators/notify_operator/operator.py +180 -0
recurvedata/operators/operator.py +119 -0
recurvedata/operators/python_operator/__init__.py +1 -0
recurvedata/operators/python_operator/operator.py +132 -0
recurvedata/operators/sensor_operator/__init__.py +1 -0
recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
recurvedata/operators/sensor_operator/operator.py +172 -0
recurvedata/operators/spark_operator/__init__.py +1 -0
recurvedata/operators/spark_operator/operator.py +200 -0
recurvedata/operators/spark_operator/spark_sample.py +47 -0
recurvedata/operators/sql_operator/__init__.py +1 -0
recurvedata/operators/sql_operator/operator.py +90 -0
recurvedata/operators/task.py +211 -0
recurvedata/operators/transfer_operator/__init__.py +40 -0
recurvedata/operators/transfer_operator/const.py +10 -0
recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
recurvedata/operators/transfer_operator/load_task_email.py +188 -0
recurvedata/operators/transfer_operator/load_task_es.py +86 -0
recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
recurvedata/operators/transfer_operator/mixin.py +31 -0
recurvedata/operators/transfer_operator/operator.py +231 -0
recurvedata/operators/transfer_operator/task.py +223 -0
recurvedata/operators/transfer_operator/utils.py +134 -0
recurvedata/operators/ui.py +80 -0
recurvedata/operators/utils/__init__.py +51 -0
recurvedata/operators/utils/file_factory.py +150 -0
recurvedata/operators/utils/fs.py +10 -0
recurvedata/operators/utils/lineage.py +265 -0
recurvedata/operators/web_init.py +15 -0
recurvedata/pigeon/connector/__init__.py +294 -0
recurvedata/pigeon/connector/_registry.py +17 -0
recurvedata/pigeon/connector/aliyun_oss.py +80 -0
recurvedata/pigeon/connector/awss3.py +123 -0
recurvedata/pigeon/connector/azure_blob.py +176 -0
recurvedata/pigeon/connector/azure_synapse.py +51 -0
recurvedata/pigeon/connector/cass.py +151 -0
recurvedata/pigeon/connector/clickhouse.py +403 -0
recurvedata/pigeon/connector/clickhouse_native.py +351 -0
recurvedata/pigeon/connector/dbapi.py +571 -0
recurvedata/pigeon/connector/doris.py +166 -0
recurvedata/pigeon/connector/es.py +176 -0
recurvedata/pigeon/connector/feishu.py +1135 -0
recurvedata/pigeon/connector/ftp.py +163 -0
recurvedata/pigeon/connector/google_bigquery.py +283 -0
recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
recurvedata/pigeon/connector/hdfs.py +204 -0
recurvedata/pigeon/connector/hive_impala.py +383 -0
recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
recurvedata/pigeon/connector/mongodb.py +56 -0
recurvedata/pigeon/connector/mssql.py +467 -0
recurvedata/pigeon/connector/mysql.py +175 -0
recurvedata/pigeon/connector/owncloud.py +92 -0
recurvedata/pigeon/connector/postgresql.py +267 -0
recurvedata/pigeon/connector/power_bi.py +179 -0
recurvedata/pigeon/connector/qcloud_cos.py +79 -0
recurvedata/pigeon/connector/redshift.py +123 -0
recurvedata/pigeon/connector/sftp.py +73 -0
recurvedata/pigeon/connector/sqlite.py +42 -0
recurvedata/pigeon/connector/starrocks.py +144 -0
recurvedata/pigeon/connector/tableau.py +162 -0
recurvedata/pigeon/const.py +21 -0
recurvedata/pigeon/csv.py +172 -0
recurvedata/pigeon/docs/datasources-example.json +82 -0
recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
recurvedata/pigeon/dumper/__init__.py +171 -0
recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
recurvedata/pigeon/dumper/base.py +141 -0
recurvedata/pigeon/dumper/cass.py +213 -0
recurvedata/pigeon/dumper/dbapi.py +346 -0
recurvedata/pigeon/dumper/es.py +112 -0
recurvedata/pigeon/dumper/ftp.py +64 -0
recurvedata/pigeon/dumper/mongodb.py +103 -0
recurvedata/pigeon/handler/__init__.py +4 -0
recurvedata/pigeon/handler/base.py +153 -0
recurvedata/pigeon/handler/csv_handler.py +290 -0
recurvedata/pigeon/loader/__init__.py +87 -0
recurvedata/pigeon/loader/base.py +83 -0
recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
recurvedata/pigeon/loader/csv_to_doris.py +215 -0
recurvedata/pigeon/loader/csv_to_es.py +51 -0
recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
recurvedata/pigeon/loader/csv_to_hive.py +468 -0
recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
recurvedata/pigeon/meta.py +116 -0
recurvedata/pigeon/row_factory.py +42 -0
recurvedata/pigeon/schema/__init__.py +124 -0
recurvedata/pigeon/schema/types.py +13 -0
recurvedata/pigeon/sync.py +283 -0
recurvedata/pigeon/transformer.py +146 -0
recurvedata/pigeon/utils/__init__.py +134 -0
recurvedata/pigeon/utils/bloomfilter.py +181 -0
recurvedata/pigeon/utils/date_time.py +323 -0
recurvedata/pigeon/utils/escape.py +15 -0
recurvedata/pigeon/utils/fs.py +266 -0
recurvedata/pigeon/utils/json.py +44 -0
recurvedata/pigeon/utils/keyed_tuple.py +85 -0
recurvedata/pigeon/utils/mp.py +156 -0
recurvedata/pigeon/utils/sql.py +328 -0
recurvedata/pigeon/utils/timing.py +155 -0
recurvedata/provider_manager.py +0 -0
recurvedata/providers/__init__.py +0 -0
recurvedata/providers/dbapi/__init__.py +0 -0
recurvedata/providers/flywheel/__init__.py +0 -0
recurvedata/providers/mysql/__init__.py +0 -0
recurvedata/schedulers/__init__.py +1 -0
recurvedata/schedulers/airflow.py +974 -0
recurvedata/schedulers/airflow_db_process.py +331 -0
recurvedata/schedulers/airflow_operators.py +61 -0
recurvedata/schedulers/airflow_plugin.py +9 -0
recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
recurvedata/schedulers/base.py +99 -0
recurvedata/schedulers/cli.py +228 -0
recurvedata/schedulers/client.py +56 -0
recurvedata/schedulers/consts.py +52 -0
recurvedata/schedulers/debug_celery.py +62 -0
recurvedata/schedulers/model.py +63 -0
recurvedata/schedulers/schemas.py +97 -0
recurvedata/schedulers/service.py +20 -0
recurvedata/schedulers/system_dags.py +59 -0
recurvedata/schedulers/task_status.py +279 -0
recurvedata/schedulers/utils.py +73 -0
recurvedata/schema/__init__.py +0 -0
recurvedata/schema/field.py +88 -0
recurvedata/schema/schema.py +55 -0
recurvedata/schema/types.py +17 -0
recurvedata/schema.py +0 -0
recurvedata/server/__init__.py +0 -0
recurvedata/server/app.py +7 -0
recurvedata/server/connector/__init__.py +0 -0
recurvedata/server/connector/api.py +79 -0
recurvedata/server/connector/schemas.py +28 -0
recurvedata/server/data_service/__init__.py +0 -0
recurvedata/server/data_service/api.py +126 -0
recurvedata/server/data_service/client.py +18 -0
recurvedata/server/data_service/consts.py +1 -0
recurvedata/server/data_service/schemas.py +68 -0
recurvedata/server/data_service/service.py +218 -0
recurvedata/server/dbt/__init__.py +0 -0
recurvedata/server/dbt/api.py +116 -0
recurvedata/server/error_code.py +49 -0
recurvedata/server/exceptions.py +19 -0
recurvedata/server/executor/__init__.py +0 -0
recurvedata/server/executor/api.py +37 -0
recurvedata/server/executor/schemas.py +30 -0
recurvedata/server/executor/service.py +220 -0
recurvedata/server/main.py +32 -0
recurvedata/server/schedulers/__init__.py +0 -0
recurvedata/server/schedulers/api.py +252 -0
recurvedata/server/schedulers/schemas.py +50 -0
recurvedata/server/schemas.py +50 -0
recurvedata/utils/__init__.py +15 -0
recurvedata/utils/_typer.py +61 -0
recurvedata/utils/attrdict.py +19 -0
recurvedata/utils/command_helper.py +20 -0
recurvedata/utils/compat.py +12 -0
recurvedata/utils/compression.py +203 -0
recurvedata/utils/crontab.py +42 -0
recurvedata/utils/crypto_util.py +305 -0
recurvedata/utils/dataclass.py +11 -0
recurvedata/utils/date_time.py +464 -0
recurvedata/utils/dispatch.py +114 -0
recurvedata/utils/email_util.py +104 -0
recurvedata/utils/files.py +386 -0
recurvedata/utils/helpers.py +170 -0
recurvedata/utils/httputil.py +117 -0
recurvedata/utils/imports.py +132 -0
recurvedata/utils/json.py +80 -0
recurvedata/utils/log.py +117 -0
recurvedata/utils/log_capture.py +153 -0
recurvedata/utils/mp.py +178 -0
recurvedata/utils/normalizer.py +102 -0
recurvedata/utils/redis_lock.py +474 -0
recurvedata/utils/registry.py +54 -0
recurvedata/utils/shell.py +15 -0
recurvedata/utils/singleton.py +33 -0
recurvedata/utils/sql.py +6 -0
recurvedata/utils/timeout.py +28 -0
recurvedata/utils/tracing.py +14 -0
recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0

recurvedata/operators/utils/__init__.py ADDED Viewed

@@ -0,0 +1,51 @@
+import datetime
+from typing import TYPE_CHECKING
+import dateutil.parser
+if TYPE_CHECKING:
+    import pandas as pd
def parse_to_date(s: str) -> datetime.date:
    """Convert a date-like value to a ``datetime.date``.

    :param s: a date string understood by ``dateutil.parser.parse``, or an
        already-parsed ``datetime.datetime`` / ``datetime.date`` value
        (``pandas.Timestamp`` is a ``datetime.datetime`` subclass and is
        therefore accepted as well).
    :return: the calendar date of ``s``.
    """
    # pandas is only imported under TYPE_CHECKING in this module, so it must
    # not be referenced at runtime; checking against datetime.datetime covers
    # pandas.Timestamp without needing the import.
    if isinstance(s, datetime.datetime):
        return s.date()
    if isinstance(s, datetime.date):
        return s
    return dateutil.parser.parse(s).date()
def infer_schema_from_dataframe(df: "pd.DataFrame"):
    """Build a pigeon ``Schema`` from the column dtypes of a DataFrame.

    Each column's numpy scalar type (``df.dtypes[col].type``) is looked up in
    a fixed mapping; any type that is not listed falls back to ``STRING``.

    :param df: the DataFrame whose columns should be described.
    :return: a ``Schema`` with one field per column, in column order.
    """
    import numpy as np

    from recurvedata.pigeon.schema import Schema, types

    # NOTE: the deprecated ``np.bool`` alias is intentionally not listed. It
    # was removed in NumPy 1.24 (accessing it raises AttributeError) and
    # boolean columns report ``np.bool_`` as their scalar type anyway.
    mapping = {
        np.bool_: types.BOOLEAN,
        np.int8: types.INT8,
        np.int16: types.INT16,
        np.int32: types.INT32,
        np.int64: types.INT64,
        np.float16: types.FLOAT32,  # no dedicated 16-bit float type is mapped
        np.float32: types.FLOAT32,
        np.float64: types.FLOAT64,
        np.datetime64: types.DATETIME,
        np.object_: types.STRING,
        np.str_: types.STRING,
    }

    schema = Schema()
    for col in df.columns:
        canonical_type = mapping.get(df.dtypes[col].type, types.STRING)
        schema.add_field_by_attrs(col, canonical_type)
    return schema
def once(func):
    """Decorator that runs ``func`` at most once and replays its result.

    The first successful call stores the return value; every later call
    returns that stored value without invoking ``func`` again, regardless of
    the arguments passed. If ``func`` raises, nothing is stored and the next
    call tries again.

    The wrapper exposes ``called`` (bool) and, after the first successful
    call, ``result``.
    """
    import functools

    # functools.wraps keeps the wrapped function's name/docstring visible.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not wrapper.called:
            wrapper.result = func(*args, **kwargs)
            wrapper.called = True
        return wrapper.result

    wrapper.called = False
    return wrapper

recurvedata/operators/utils/file_factory.py ADDED Viewed

@@ -0,0 +1,150 @@
+import collections
+import csv
+import json
+import os
+import shutil
+import time
+from recurvedata.pigeon.utils import fs
# Default CSV dialect used for files produced by this module:
# comma-separated, every field quoted, CRLF line endings.
_csv_dialect_options = dict(
    delimiter=",",
    quoting=csv.QUOTE_ALL,
    lineterminator="\r\n",
)
def gzip_decompress(src_file, dst_file=None, inplace=True):
    """Decompress a gzip file via ``fs.gzip_decompress``.

    :param src_file: path of the gzip-compressed file.
    :param dst_file: path to write the decompressed data to; when omitted a
        temporary file is created next to ``src_file``.
    :param inplace: when true, the decompressed file replaces ``src_file``.
    :return: ``src_file`` when ``inplace`` is true, otherwise ``dst_file``.
    """
    if not dst_file:
        dst_file = fs.new_tempfile(dir=os.path.dirname(src_file))
    fs.gzip_decompress(src_file, dst_file)
    if inplace:
        # os.replace overwrites an existing destination on every platform;
        # os.rename fails on Windows because src_file still exists here.
        os.replace(dst_file, src_file)
        return src_file
    return dst_file
def zip_decompress(src_file, dst_file=None, inplace=True):
    """Extract a zip archive via ``fs.zip_decompress``.

    :param src_file: path of the zip archive.
    :param dst_file: directory to extract into; when omitted a
        ``tmp_zip_<name>_<timestamp>`` directory is created next to
        ``src_file``.
    :param inplace: when true the archive is deleted after extraction. If
        exactly one entry was extracted it is moved to ``src_file`` and the
        extraction directory is removed; otherwise the extracted entries stay
        in the extraction directory.
    :return: ``src_file`` when a single extracted entry replaced the archive,
        otherwise the extraction directory.
    """
    target_dir = dst_file
    if not target_dir:
        # No destination given: extract into a fresh directory beside the archive.
        parent = os.path.dirname(src_file)
        target_dir = os.path.join(parent, f"tmp_zip_{os.path.basename(src_file)}_{int(time.time())}")
        os.makedirs(target_dir, exist_ok=True)

    # A caller-supplied destination may not exist yet.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    fs.zip_decompress(src_file, target_dir)

    if not inplace:
        return target_dir

    members = os.listdir(target_dir)
    os.remove(src_file)  # the original archive is dropped in both in-place cases
    if len(members) != 1:
        # Several (or no) entries: leave them in the directory and hand that back.
        return target_dir

    # Exactly one entry: it takes over the archive's path.
    os.rename(os.path.join(target_dir, members[0]), src_file)
    os.rmdir(target_dir)  # now empty
    return src_file
def convert_excel_to_csv(src_file, dst_file=None, skiprows=0, inplace=True, lineterminator="\r\n"):
    """Convert an Excel workbook to a CSV file using pandas.

    The sheet is read with ``pandas.read_excel`` and written without a header
    row and without the index column.

    :param src_file: path of the Excel file.
    :param dst_file: path of the CSV to write; when omitted a temporary file
        is created next to ``src_file``.
    :param skiprows: passed to ``pandas.read_excel`` as ``skiprows``.
    :param inplace: when true, the CSV replaces ``src_file``.
    :param lineterminator: line terminator used for the CSV output.
    :return: ``src_file`` when ``inplace`` is true, otherwise ``dst_file``.
    """
    import pandas as pd

    if not dst_file:
        dst_file = fs.new_tempfile(dir=os.path.dirname(src_file))
    df = pd.read_excel(src_file, skiprows=skiprows)
    df.to_csv(dst_file, lineterminator=lineterminator, header=False, index=False)
    if inplace:
        # os.replace overwrites an existing destination on every platform;
        # os.rename fails on Windows because src_file still exists here.
        os.replace(dst_file, src_file)
        return src_file
    return dst_file
def convert_jsonlines_to_csv(src_file, dst_file=None, skiprows=0, src_encoding="utf8", inplace=True):
    """Convert a JSON Lines file to CSV; each source line is one JSON object.

    The CSV columns are the keys of the first JSON object, in their original
    order. No header row is written. Blank lines are skipped, and an input
    without any object produces an empty CSV.

    :param src_file: path of the JSON Lines file.
    :param dst_file: path of the CSV to write; when omitted a temporary file
        is created next to ``src_file``.
    :param skiprows: number of leading lines to discard before parsing.
    :param src_encoding: text encoding of ``src_file``.
    :param inplace: when true, the CSV replaces ``src_file``.
    :return: ``src_file`` when ``inplace`` is true, otherwise ``dst_file``.
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON.
    :raises ValueError: if a later object has keys missing from the first one
        (raised by ``csv.DictWriter``).
    """
    if not dst_file:
        dst_file = fs.new_tempfile(dir=os.path.dirname(src_file))

    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    # newline="" is required by the csv module so the configured line
    # terminator is written untouched; the output encoding is pinned to utf8
    # instead of depending on the platform default.
    with open(src_file, "r", encoding=src_encoding) as f_in, open(
        dst_file, "w", encoding="utf8", newline=""
    ) as f_out:
        _skip_header_rows(f_in, skiprows)
        writer = None
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            row = decoder.decode(line)
            if writer is None:
                # The first object defines the column set and order.
                writer = csv.DictWriter(f_out, fieldnames=list(row.keys()), **_csv_dialect_options)
            writer.writerow(row)

    if inplace:
        # os.replace overwrites an existing destination on every platform.
        os.replace(dst_file, src_file)
        return src_file
    return dst_file
def convert_encoding(filename, src_encoding, dst_encoding="utf8", skiprows=0, inplace=True):
    """Re-encode a text file from ``src_encoding`` to ``dst_encoding``.

    When both encoding names are equal the file is returned untouched; note
    that ``skiprows`` is not applied in that case.

    :param filename: path of the file to convert.
    :param src_encoding: current encoding of the file.
    :param dst_encoding: encoding to write.
    :param skiprows: number of leading lines to drop while converting.
    :param inplace: when true, the converted file replaces ``filename``.
    :return: ``filename`` when nothing had to be done or ``inplace`` is true,
        otherwise the path of the newly written temporary file.
    """
    if src_encoding == dst_encoding:
        return filename

    target = fs.new_tempfile(dir=os.path.dirname(filename))
    with open(filename, "r", encoding=src_encoding) as f_in, open(target, "w", encoding=dst_encoding) as f_out:
        _skip_header_rows(f_in, skiprows)
        shutil.copyfileobj(f_in, f_out)

    if inplace:
        # os.replace overwrites an existing destination on every platform;
        # os.rename fails on Windows because filename still exists here.
        os.replace(target, filename)
        return filename
    return target
def convert_csv_dialect(
    filename, src_dialect_options, dst_dialect_options=None, skiprows=0, src_encoding="utf8", inplace=True
):
    """Rewrite a CSV file from one csv dialect to another.

    If the source and destination dialect options are equal, no CSV rewrite
    happens; the file is only re-encoded to utf8 (always in place) when
    ``src_encoding`` is not ``"utf8"``.

    :param filename: path of the CSV file.
    :param src_dialect_options: keyword options for ``csv.reader``.
    :param dst_dialect_options: keyword options for ``csv.writer``; defaults
        to a copy of the module-level ``_csv_dialect_options``.
    :param skiprows: number of leading lines to discard before reading rows.
    :param src_encoding: text encoding of ``filename``.
    :param inplace: when true, the rewritten file replaces ``filename``.
    :return: ``filename`` when nothing was rewritten or ``inplace`` is true,
        otherwise the path of the newly written temporary file.
    """
    if dst_dialect_options is None:
        dst_dialect_options = _csv_dialect_options.copy()

    if _same_dict(src_dialect_options, dst_dialect_options):
        if src_encoding != "utf8":
            convert_encoding(filename, src_encoding=src_encoding, skiprows=skiprows, inplace=True)
        return filename

    dst_file = fs.new_tempfile(dir=os.path.dirname(filename))
    # Both files are opened with newline="" as the csv module requires:
    # otherwise newlines embedded in quoted fields are misread and the
    # writer's line terminator can be translated by the text layer.
    # The output encoding is pinned to utf8 rather than the platform default.
    with open(filename, "r", encoding=src_encoding, newline="") as f_in, open(
        dst_file, "w", encoding="utf8", newline=""
    ) as f_out:
        _skip_header_rows(f_in, skiprows)
        reader = csv.reader(f_in, **src_dialect_options)
        writer = csv.writer(f_out, **dst_dialect_options)
        writer.writerows(reader)

    if inplace:
        # os.replace overwrites an existing destination on every platform.
        os.replace(dst_file, filename)
        return filename
    return dst_file
def _skip_header_rows(f, n=0):
    """Advance file object ``f`` past its next ``n`` lines, discarding them."""
    skipped = 0
    while skipped < n:
        f.readline()
        skipped += 1
def _same_dict(a: dict, b: dict):
    """Return True when ``a`` and ``b`` hold the same keys with equal values."""
    if len(a) != len(b):
        return False
    # Same size, so it is enough that every entry of ``a`` is matched in ``b``.
    return not any(key not in b or a[key] != b[key] for key in a)

recurvedata/operators/utils/fs.py ADDED Viewed

@@ -0,0 +1,10 @@
+import os.path
def get_exist_path(candidate_paths: list[str]) -> str:
    """Return the first candidate that exists on disk, after ``~`` expansion.

    Empty or ``None`` candidates are ignored. ``None`` is returned when no
    candidate exists.
    """
    expanded = (os.path.expanduser(candidate) for candidate in candidate_paths if candidate)
    return next((candidate for candidate in expanded if os.path.exists(candidate)), None)

recurvedata/operators/utils/lineage.py ADDED Viewed

@@ -0,0 +1,265 @@
+import datetime
+import logging
+import re
+from collections import namedtuple
+from typing import Union
+try:
+    import sqlparse
+    from sql_metadata.keywords_lists import QueryType, TokenType
+    from sql_metadata.parser import Parser
+    from sql_metadata.utils import UniqueList
+except ImportError:
+    Parser = object
# Module-level logger for lineage parsing diagnostics.
logger = logging.getLogger(__name__)

# Identifies one table: owning data source, database and table name.
Table = namedtuple("Table", "data_source database table")

# Written into every lineage record as its "version" field.
VERSION = 1
class LineageParser(Parser):
    """
    SQL parser that derives table-level lineage on top of sql-metadata.

    Issues found in sql-metadata 2.5.1, all handled here:
        1. _preprocess_query has to be overridden, otherwise many " characters
           in hive / impala SQL are replaced with backticks, which easily
           breaks later parsing
        2. quite a few statements are not supported; they are listed in
           NOT_SUPPORT_PREFIXES
        3. "with xxx insert into" is mistaken for a SELECT query (handled)
        4. the target table of "insert overwrite table" is not detected
        5. the dt in "insert into xxx partition (dt)" is detected as a table
        6. the dt in "create table xxx(xxx) partitioned by (dt string)" is
           detected as a table
        7. some comments seem to cause parse errors (to be confirmed);
           comments are stripped before parsing
    """

    # Statements starting with one of these prefixes are skipped entirely.
    NOT_SUPPORT_PREFIXES = (
        "SET",
        "COMPUTE",
        "REFRESH",
        "DROP STATS",
        "DROP INCREMENTAL STATS",
        "INVALIDATE METADATA",
        "SHOW TABLE",
        "DESCRIBE ",
        "TRUNCATE ",
        "MSCK REPAIR TABLE ",
        "USE ",
        "CREATE DATABASE",
        "CREATE EXTERNAL TABLE",
        "CREATE VIEW",
        "DROP VIEW",  # todo: decide whether views should be parsed
        "DROP FUNCTION",
        "CREATE FUNCTION",
        "SHOW FUNCTIONS",
        "COMMENT ON",
        "GRANT ",
        "IF NOT EXISTS",
        "UNLOAD",
        "VACUUM",  # redshift ,
    )

    # Keywords that must never be reported as table names.
    NOT_TABLE_KEYS = ("PARTITION", "TABLE", "WHERE")

    def __init__(self, sql: str, default_db: str, ds_name: str, ds_type: str) -> None:
        """
        :param sql: a single SQL statement
        :param default_db: database assumed for unqualified table names
        :param ds_name: data source name stored in every resulting Table
        :param ds_type: data source type / dialect
        """
        super().__init__(sql)
        self.default_db = default_db
        self.ds_name = ds_name
        self.dialect = ds_type  # todo: current not used

    def _preprocess_query(self):
        """
        sql-metadata special-cases double quotes, which breaks later parsing.
        That preprocessing is replaced here; different dialects may need
        separate handling later, e.g. hive/impala must not have " replaced
        with backticks.

        :return: the raw query with every "as(" rewritten to "AS ("
        """
        query = self._raw_query
        query = re.sub(r"as\(", "AS (", query, flags=re.I)
        return query

    def __repr__(self):
        return f"parser: query_type {self.query_type};tables {self.tables}"

    @classmethod
    def not_supported_query(cls, ds_type, query):
        """Return True when ``query`` starts with an unsupported prefix.

        The comparison is case-insensitive and ignores surrounding
        whitespace. ``ds_type`` is accepted but not used by the check.
        """
        # str.startswith accepts the whole tuple of prefixes at once.
        return query.strip().upper().startswith(cls.NOT_SUPPORT_PREFIXES)

    @property
    def query_type(self) -> "QueryType":
        """Query type, with "with xxx insert into" corrected to INSERT."""
        if self._query_type:
            return self._query_type
        query_type = super().query_type
        if query_type == QueryType.SELECT:  # "with xxx insert into" is mistaken for SELECT
            insert_table = self.get_insert_table_name()
            if insert_table:
                self._query_type = query_type = QueryType.INSERT
        return query_type

    @property
    def tables(self):
        """
        Lower-cased table names referenced by the query, excluding WITH names.

        1. avoids treating the column in partition (dt) as a table
        2. avoids treating the TABLE keyword of "insert into table" as a table
        """
        if self._tables is not None:
            return self._tables
        tables = UniqueList()
        with_names = self.with_names

        for token in self._not_parsed_tokens:
            if not token.is_potential_table_name:
                continue
            if (
                token.is_alias_of_table_or_alias_of_subquery
                or token.is_with_statement_nested_in_subquery
                or token.is_constraint_definition_inside_create_table_clause(query_type=self.query_type)
                or token.is_columns_alias_of_with_query_or_column_in_insert_query(with_names=with_names)
            ):
                continue

            if token.normalized in self.NOT_TABLE_KEYS:
                continue

            # keep the dt of "insert into xxx partition (dt)" from being detected as a table
            # keep the dt of "create table xxx(xxx) partitioned by (dt string)" from being detected as a table
            left_parenthesis = token.find_nearest_token(
                value=True, value_attribute="is_left_parenthesis", direction="left"
            )
            right_parenthesis = token.find_nearest_token(
                value=True, value_attribute="is_right_parenthesis", direction="left"
            )
            # True when the nearest "(" to the left has not been closed yet,
            # i.e. the token sits inside that parenthesis.
            if (left_parenthesis and right_parenthesis and left_parenthesis.position > right_parenthesis.position) or (
                left_parenthesis and not right_parenthesis
            ):
                if left_parenthesis.previous_token and left_parenthesis.previous_token.normalized in (
                    "PARTITION",
                    "BY",
                ):
                    continue

            table_name = str(token.value.strip("`"))
            token.token_type = TokenType.TABLE
            # lower() is applied so with_names and tables compare case-insensitively
            tables.append(table_name.lower())

        self._tables = []
        for table in tables - UniqueList([name.lower() for name in with_names]):
            self._tables.append(table)
        return self._tables

    def get_insert_table_name(self):
        """Return the target table of an INSERT INTO / INSERT OVERWRITE.

        :return: the lower-cased target table name, or None when the query
            contains no such insert.
        """
        sql = self._query.lower()
        if "insert into" not in sql and "insert overwrite" not in sql:
            return

        # Walk the INSERT tokens from left to right until one is followed by
        # INTO / OVERWRITE; the attempt count is bounded as a safety net.
        insert_token = None
        for _ in range(99):
            if insert_token is None:
                if self.tokens[0].normalized == "INSERT":
                    insert_token = self.tokens[0]
                else:
                    insert_token = self.tokens[0].find_nearest_token(
                        "INSERT", value_attribute="normalized", direction="right"
                    )
            else:
                insert_token = insert_token.find_nearest_token(
                    "INSERT", value_attribute="normalized", direction="right"
                )
            if insert_token.position < 0:
                # no further INSERT token was found
                return
            if insert_token.next_token.normalized in ("INTO", "OVERWRITE"):
                break
        else:
            return

        table_token = insert_token.next_token.next_token
        if table_token.normalized == "TABLE":
            table_token = table_token.next_token
        # Normalise exactly like the ``tables`` property (strip backticks,
        # then lower-case) so the name can be matched against self.tables.
        insert_table_name = table_token.value.strip("`").lower()
        if insert_table_name not in self.tables:
            logger.warning(
                f"get_insert_table_name error: " f"table_token {insert_table_name} _tables {self._tables}, please check"
            )
        return insert_table_name

    def get_create_table_name(self):
        """Return the first detected table of a CREATE query, else None."""
        if self.query_type != QueryType.CREATE:
            return
        return (self.tables and self.tables[0]) or None

    def get_lineage(self):
        """Build the lineage of this statement.

        :return: a LineageResult, or None when the statement yields no
            lineage (ALTER / DELETE, no target table, no upstream table, or a
            DROP for which no table was detected).
        """
        if self.query_type == QueryType.DROP:
            dropped = self.tables
            if not dropped:
                # nothing was detected; avoid an IndexError on tables[0]
                return None
            return LineageResult([], self._format_table(dropped[0]), self.query_type, self._raw_query)
        if self.query_type in (QueryType.ALTER, QueryType.DELETE):
            return
        tables = self.tables[:]
        downstream_table = (
            self.get_create_table_name() or self.get_insert_table_name()
        )  # todo: update/upsert table not supported
        if downstream_table:
            # the target table is not its own upstream
            if downstream_table in tables:
                tables.remove(downstream_table)
            if not tables:
                return
            return LineageResult(
                self._format_table(tables), self._format_table(downstream_table), self.query_type, self._raw_query
            )

    def _format_table(self, table_or_tables: Union[list[str], str]):
        """Convert one table name, or a list of names, into Table tuple(s)."""
        if isinstance(table_or_tables, list):
            return [self._build_table(name) for name in table_or_tables]
        return self._build_table(table_or_tables)

    def _build_table(self, name: str) -> Table:
        """Split an optionally qualified name into database and table parts."""
        # Split on the LAST dot so names with more than one dot no longer
        # raise ValueError; everything before it is kept as the database
        # part.  todo: redshift
        db, dot, table = name.rpartition(".")
        if not dot:
            db = self.default_db
        return Table(data_source=self.ds_name, database=db, table=table)
class LineageResult(object):
    """Lineage of a single SQL statement: upstream tables feeding one downstream table."""

    def __init__(self, upstream_tables: list[Table], downstream_table: Table, query_type: "QueryType", sql: str):
        self.sql = sql
        self.query_type = query_type
        self.upstream_tables = upstream_tables
        self.downstream_table = downstream_table

    def to_dict(self):
        """Return the lineage as a plain dict, stamped with VERSION and the current local time."""
        upstream = [dict(source._asdict()) for source in self.upstream_tables]
        downstream = dict(self.downstream_table._asdict())
        record = {"upstream": upstream, "downstream": downstream}
        record["query_type"] = self.query_type.value
        record["sql"] = self.sql
        record["version"] = VERSION
        record["created_at"] = datetime.datetime.now()
        return record
def parse_lineage(sql, default_db, recurve_ds_name, recurve_ds_type):
    """Parse a SQL script and return the lineage of each statement.

    Comments are stripped first, then the script is split into statements.
    Statements with an unsupported prefix, and statements that yield no
    lineage, are skipped.

    :param sql: SQL text, possibly containing several statements.
    :param default_db: database assumed for unqualified table names.
    :param recurve_ds_name: data source name recorded on every table.
    :param recurve_ds_type: data source type passed to the parser.
    :return: list of lineage dicts (see ``LineageResult.to_dict``), one per
        statement that produced lineage; empty for empty input.
    """
    if not sql:
        return []

    lineage_lst = []
    remove_comment_sql = sqlparse.format(sql, strip_comments=True)
    for statement in sqlparse.split(remove_comment_sql):
        statement = statement.strip(";\n\r\t ")
        if not statement:
            continue
        if LineageParser.not_supported_query(recurve_ds_type, statement):
            # The statement is skipped because of its prefix, not because of
            # the data source type, so the message says so.
            logger.debug(
                f"lineage skipped: unsupported statement for ds_type {recurve_ds_type}: {statement[:80]}"
            )
            continue
        parser = LineageParser(statement, default_db, recurve_ds_name, recurve_ds_type)
        lineage_result = parser.get_lineage()
        if not lineage_result:
            continue
        lineage_lst.append(lineage_result.to_dict())
    return lineage_lst
def supported_recurve_ds_type(ds_type):
    """Tell whether lineage parsing is available for the given data source type."""
    lineage_capable_types = ("hive", "impala")
    return ds_type in lineage_capable_types

recurvedata/operators/web_init.py ADDED Viewed

@@ -0,0 +1,15 @@
+import logging
+logger = logging.getLogger(__name__)
+# todo: move to common
def init_operator_web(op_cls, router, operator_params: dict):
    """Run an operator class's optional ``init_web`` hook.

    Does nothing when ``op_cls`` has no ``init_web`` attribute. Otherwise the
    hook is called with ``router`` and the entry of ``operator_params`` keyed
    by ``op_cls.name()`` (an empty dict when absent). Initialisation is
    best-effort: an exception raised by the hook is logged, not propagated.

    :param op_cls: operator class, expected to provide ``name()``.
    :param router: router object handed to the hook unchanged.
    :param operator_params: per-operator parameters keyed by operator name.
    """
    if not hasattr(op_cls, "init_web"):
        return
    logger.info(f"operator_params: {operator_params} {op_cls.name()}")
    init_func = getattr(op_cls, "init_web")
    try:
        init_func(router, operator_params.get(op_cls.name(), {}))
    except Exception as e:
        # exc_info keeps the traceback so a failing hook can be diagnosed.
        logger.error(f"{op_cls} init_web fail, {str(e)}", exc_info=True)