PyPI - recurvedata-lib - Versions diffs - 0.1.487__py2.py3-none-any.whl - Mend

recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show

recurvedata/__init__.py +0 -0
recurvedata/__version__.py +1 -0
recurvedata/client/__init__.py +3 -0
recurvedata/client/client.py +150 -0
recurvedata/client/server_client.py +91 -0
recurvedata/config.py +99 -0
recurvedata/connectors/__init__.py +20 -0
recurvedata/connectors/_register.py +46 -0
recurvedata/connectors/base.py +111 -0
recurvedata/connectors/config_schema.py +1575 -0
recurvedata/connectors/connectors/__init__.py +0 -0
recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
recurvedata/connectors/connectors/auth.py +44 -0
recurvedata/connectors/connectors/azure_blob.py +89 -0
recurvedata/connectors/connectors/azure_synapse.py +79 -0
recurvedata/connectors/connectors/bigquery.py +359 -0
recurvedata/connectors/connectors/clickhouse.py +219 -0
recurvedata/connectors/connectors/dingtalk.py +61 -0
recurvedata/connectors/connectors/doris.py +215 -0
recurvedata/connectors/connectors/es.py +62 -0
recurvedata/connectors/connectors/feishu.py +65 -0
recurvedata/connectors/connectors/ftp.py +50 -0
recurvedata/connectors/connectors/generic.py +49 -0
recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
recurvedata/connectors/connectors/google_service_account.py +225 -0
recurvedata/connectors/connectors/hive.py +207 -0
recurvedata/connectors/connectors/impala.py +210 -0
recurvedata/connectors/connectors/jenkins.py +51 -0
recurvedata/connectors/connectors/mail.py +89 -0
recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
recurvedata/connectors/connectors/mongo.py +79 -0
recurvedata/connectors/connectors/mssql.py +131 -0
recurvedata/connectors/connectors/mysql.py +191 -0
recurvedata/connectors/connectors/n8n.py +141 -0
recurvedata/connectors/connectors/oss.py +74 -0
recurvedata/connectors/connectors/owncloud.py +36 -0
recurvedata/connectors/connectors/phoenix.py +36 -0
recurvedata/connectors/connectors/postgres.py +230 -0
recurvedata/connectors/connectors/python.py +50 -0
recurvedata/connectors/connectors/redshift.py +187 -0
recurvedata/connectors/connectors/s3.py +93 -0
recurvedata/connectors/connectors/sftp.py +87 -0
recurvedata/connectors/connectors/slack.py +35 -0
recurvedata/connectors/connectors/spark.py +99 -0
recurvedata/connectors/connectors/starrocks.py +175 -0
recurvedata/connectors/connectors/tencent_cos.py +40 -0
recurvedata/connectors/connectors/tidb.py +49 -0
recurvedata/connectors/const.py +315 -0
recurvedata/connectors/datasource.py +189 -0
recurvedata/connectors/dbapi.py +469 -0
recurvedata/connectors/fs.py +66 -0
recurvedata/connectors/ftp.py +40 -0
recurvedata/connectors/object_store.py +60 -0
recurvedata/connectors/pigeon.py +172 -0
recurvedata/connectors/proxy.py +104 -0
recurvedata/connectors/service.py +223 -0
recurvedata/connectors/utils.py +47 -0
recurvedata/consts.py +49 -0
recurvedata/core/__init__.py +0 -0
recurvedata/core/config.py +46 -0
recurvedata/core/configurable.py +27 -0
recurvedata/core/consts.py +2 -0
recurvedata/core/templating.py +206 -0
recurvedata/core/tracing.py +223 -0
recurvedata/core/transformer.py +186 -0
recurvedata/core/translation.py +91 -0
recurvedata/dbt/client.py +97 -0
recurvedata/dbt/consts.py +99 -0
recurvedata/dbt/cosmos_utils.py +275 -0
recurvedata/dbt/error_codes.py +18 -0
recurvedata/dbt/schemas.py +98 -0
recurvedata/dbt/service.py +451 -0
recurvedata/dbt/utils.py +246 -0
recurvedata/error_codes.py +71 -0
recurvedata/exceptions.py +72 -0
recurvedata/executors/__init__.py +4 -0
recurvedata/executors/cli/__init__.py +7 -0
recurvedata/executors/cli/connector.py +117 -0
recurvedata/executors/cli/dbt.py +118 -0
recurvedata/executors/cli/main.py +82 -0
recurvedata/executors/cli/parameters.py +18 -0
recurvedata/executors/client.py +190 -0
recurvedata/executors/consts.py +50 -0
recurvedata/executors/debug_executor.py +100 -0
recurvedata/executors/executor.py +300 -0
recurvedata/executors/link_executor.py +189 -0
recurvedata/executors/models.py +34 -0
recurvedata/executors/schemas.py +222 -0
recurvedata/executors/service/__init__.py +0 -0
recurvedata/executors/service/connector.py +380 -0
recurvedata/executors/utils.py +172 -0
recurvedata/filestorage/__init__.py +11 -0
recurvedata/filestorage/_factory.py +33 -0
recurvedata/filestorage/backends/__init__.py +0 -0
recurvedata/filestorage/backends/fsspec.py +45 -0
recurvedata/filestorage/backends/local.py +67 -0
recurvedata/filestorage/backends/oss.py +56 -0
recurvedata/filestorage/interface.py +84 -0
recurvedata/operators/__init__.py +10 -0
recurvedata/operators/base.py +28 -0
recurvedata/operators/config.py +21 -0
recurvedata/operators/context.py +255 -0
recurvedata/operators/dbt_operator/__init__.py +2 -0
recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
recurvedata/operators/dbt_operator/operator.py +353 -0
recurvedata/operators/link_operator/__init__.py +1 -0
recurvedata/operators/link_operator/operator.py +120 -0
recurvedata/operators/models.py +55 -0
recurvedata/operators/notify_operator/__init__.py +1 -0
recurvedata/operators/notify_operator/operator.py +180 -0
recurvedata/operators/operator.py +119 -0
recurvedata/operators/python_operator/__init__.py +1 -0
recurvedata/operators/python_operator/operator.py +132 -0
recurvedata/operators/sensor_operator/__init__.py +1 -0
recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
recurvedata/operators/sensor_operator/operator.py +172 -0
recurvedata/operators/spark_operator/__init__.py +1 -0
recurvedata/operators/spark_operator/operator.py +200 -0
recurvedata/operators/spark_operator/spark_sample.py +47 -0
recurvedata/operators/sql_operator/__init__.py +1 -0
recurvedata/operators/sql_operator/operator.py +90 -0
recurvedata/operators/task.py +211 -0
recurvedata/operators/transfer_operator/__init__.py +40 -0
recurvedata/operators/transfer_operator/const.py +10 -0
recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
recurvedata/operators/transfer_operator/load_task_email.py +188 -0
recurvedata/operators/transfer_operator/load_task_es.py +86 -0
recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
recurvedata/operators/transfer_operator/mixin.py +31 -0
recurvedata/operators/transfer_operator/operator.py +231 -0
recurvedata/operators/transfer_operator/task.py +223 -0
recurvedata/operators/transfer_operator/utils.py +134 -0
recurvedata/operators/ui.py +80 -0
recurvedata/operators/utils/__init__.py +51 -0
recurvedata/operators/utils/file_factory.py +150 -0
recurvedata/operators/utils/fs.py +10 -0
recurvedata/operators/utils/lineage.py +265 -0
recurvedata/operators/web_init.py +15 -0
recurvedata/pigeon/connector/__init__.py +294 -0
recurvedata/pigeon/connector/_registry.py +17 -0
recurvedata/pigeon/connector/aliyun_oss.py +80 -0
recurvedata/pigeon/connector/awss3.py +123 -0
recurvedata/pigeon/connector/azure_blob.py +176 -0
recurvedata/pigeon/connector/azure_synapse.py +51 -0
recurvedata/pigeon/connector/cass.py +151 -0
recurvedata/pigeon/connector/clickhouse.py +403 -0
recurvedata/pigeon/connector/clickhouse_native.py +351 -0
recurvedata/pigeon/connector/dbapi.py +571 -0
recurvedata/pigeon/connector/doris.py +166 -0
recurvedata/pigeon/connector/es.py +176 -0
recurvedata/pigeon/connector/feishu.py +1135 -0
recurvedata/pigeon/connector/ftp.py +163 -0
recurvedata/pigeon/connector/google_bigquery.py +283 -0
recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
recurvedata/pigeon/connector/hdfs.py +204 -0
recurvedata/pigeon/connector/hive_impala.py +383 -0
recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
recurvedata/pigeon/connector/mongodb.py +56 -0
recurvedata/pigeon/connector/mssql.py +467 -0
recurvedata/pigeon/connector/mysql.py +175 -0
recurvedata/pigeon/connector/owncloud.py +92 -0
recurvedata/pigeon/connector/postgresql.py +267 -0
recurvedata/pigeon/connector/power_bi.py +179 -0
recurvedata/pigeon/connector/qcloud_cos.py +79 -0
recurvedata/pigeon/connector/redshift.py +123 -0
recurvedata/pigeon/connector/sftp.py +73 -0
recurvedata/pigeon/connector/sqlite.py +42 -0
recurvedata/pigeon/connector/starrocks.py +144 -0
recurvedata/pigeon/connector/tableau.py +162 -0
recurvedata/pigeon/const.py +21 -0
recurvedata/pigeon/csv.py +172 -0
recurvedata/pigeon/docs/datasources-example.json +82 -0
recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
recurvedata/pigeon/dumper/__init__.py +171 -0
recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
recurvedata/pigeon/dumper/base.py +141 -0
recurvedata/pigeon/dumper/cass.py +213 -0
recurvedata/pigeon/dumper/dbapi.py +346 -0
recurvedata/pigeon/dumper/es.py +112 -0
recurvedata/pigeon/dumper/ftp.py +64 -0
recurvedata/pigeon/dumper/mongodb.py +103 -0
recurvedata/pigeon/handler/__init__.py +4 -0
recurvedata/pigeon/handler/base.py +153 -0
recurvedata/pigeon/handler/csv_handler.py +290 -0
recurvedata/pigeon/loader/__init__.py +87 -0
recurvedata/pigeon/loader/base.py +83 -0
recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
recurvedata/pigeon/loader/csv_to_doris.py +215 -0
recurvedata/pigeon/loader/csv_to_es.py +51 -0
recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
recurvedata/pigeon/loader/csv_to_hive.py +468 -0
recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
recurvedata/pigeon/meta.py +116 -0
recurvedata/pigeon/row_factory.py +42 -0
recurvedata/pigeon/schema/__init__.py +124 -0
recurvedata/pigeon/schema/types.py +13 -0
recurvedata/pigeon/sync.py +283 -0
recurvedata/pigeon/transformer.py +146 -0
recurvedata/pigeon/utils/__init__.py +134 -0
recurvedata/pigeon/utils/bloomfilter.py +181 -0
recurvedata/pigeon/utils/date_time.py +323 -0
recurvedata/pigeon/utils/escape.py +15 -0
recurvedata/pigeon/utils/fs.py +266 -0
recurvedata/pigeon/utils/json.py +44 -0
recurvedata/pigeon/utils/keyed_tuple.py +85 -0
recurvedata/pigeon/utils/mp.py +156 -0
recurvedata/pigeon/utils/sql.py +328 -0
recurvedata/pigeon/utils/timing.py +155 -0
recurvedata/provider_manager.py +0 -0
recurvedata/providers/__init__.py +0 -0
recurvedata/providers/dbapi/__init__.py +0 -0
recurvedata/providers/flywheel/__init__.py +0 -0
recurvedata/providers/mysql/__init__.py +0 -0
recurvedata/schedulers/__init__.py +1 -0
recurvedata/schedulers/airflow.py +974 -0
recurvedata/schedulers/airflow_db_process.py +331 -0
recurvedata/schedulers/airflow_operators.py +61 -0
recurvedata/schedulers/airflow_plugin.py +9 -0
recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
recurvedata/schedulers/base.py +99 -0
recurvedata/schedulers/cli.py +228 -0
recurvedata/schedulers/client.py +56 -0
recurvedata/schedulers/consts.py +52 -0
recurvedata/schedulers/debug_celery.py +62 -0
recurvedata/schedulers/model.py +63 -0
recurvedata/schedulers/schemas.py +97 -0
recurvedata/schedulers/service.py +20 -0
recurvedata/schedulers/system_dags.py +59 -0
recurvedata/schedulers/task_status.py +279 -0
recurvedata/schedulers/utils.py +73 -0
recurvedata/schema/__init__.py +0 -0
recurvedata/schema/field.py +88 -0
recurvedata/schema/schema.py +55 -0
recurvedata/schema/types.py +17 -0
recurvedata/schema.py +0 -0
recurvedata/server/__init__.py +0 -0
recurvedata/server/app.py +7 -0
recurvedata/server/connector/__init__.py +0 -0
recurvedata/server/connector/api.py +79 -0
recurvedata/server/connector/schemas.py +28 -0
recurvedata/server/data_service/__init__.py +0 -0
recurvedata/server/data_service/api.py +126 -0
recurvedata/server/data_service/client.py +18 -0
recurvedata/server/data_service/consts.py +1 -0
recurvedata/server/data_service/schemas.py +68 -0
recurvedata/server/data_service/service.py +218 -0
recurvedata/server/dbt/__init__.py +0 -0
recurvedata/server/dbt/api.py +116 -0
recurvedata/server/error_code.py +49 -0
recurvedata/server/exceptions.py +19 -0
recurvedata/server/executor/__init__.py +0 -0
recurvedata/server/executor/api.py +37 -0
recurvedata/server/executor/schemas.py +30 -0
recurvedata/server/executor/service.py +220 -0
recurvedata/server/main.py +32 -0
recurvedata/server/schedulers/__init__.py +0 -0
recurvedata/server/schedulers/api.py +252 -0
recurvedata/server/schedulers/schemas.py +50 -0
recurvedata/server/schemas.py +50 -0
recurvedata/utils/__init__.py +15 -0
recurvedata/utils/_typer.py +61 -0
recurvedata/utils/attrdict.py +19 -0
recurvedata/utils/command_helper.py +20 -0
recurvedata/utils/compat.py +12 -0
recurvedata/utils/compression.py +203 -0
recurvedata/utils/crontab.py +42 -0
recurvedata/utils/crypto_util.py +305 -0
recurvedata/utils/dataclass.py +11 -0
recurvedata/utils/date_time.py +464 -0
recurvedata/utils/dispatch.py +114 -0
recurvedata/utils/email_util.py +104 -0
recurvedata/utils/files.py +386 -0
recurvedata/utils/helpers.py +170 -0
recurvedata/utils/httputil.py +117 -0
recurvedata/utils/imports.py +132 -0
recurvedata/utils/json.py +80 -0
recurvedata/utils/log.py +117 -0
recurvedata/utils/log_capture.py +153 -0
recurvedata/utils/mp.py +178 -0
recurvedata/utils/normalizer.py +102 -0
recurvedata/utils/redis_lock.py +474 -0
recurvedata/utils/registry.py +54 -0
recurvedata/utils/shell.py +15 -0
recurvedata/utils/singleton.py +33 -0
recurvedata/utils/sql.py +6 -0
recurvedata/utils/timeout.py +28 -0
recurvedata/utils/tracing.py +14 -0
recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0

recurvedata/pigeon/utils/fs.py ADDED Viewed

@@ -0,0 +1,266 @@
import bz2
import contextlib
import datetime
import glob
import gzip
import logging
import os
import shlex
import shutil
import subprocess
import tempfile
import zipfile
from itertools import islice

from recurvedata.pigeon.utils import ensure_list
def new_tempfile(suffix="", prefix=None, dir=None):
    """Create an empty temporary file and return its path.

    The generated name ends with ``<timestamp>_<suffix>``, where the timestamp
    is the current local time formatted as ``%Y_%m_%d_%H_%M_%S``.

    :param suffix: text appended after the timestamp, e.g. ``".gz"``
    :param prefix: optional file name prefix; the ``tempfile`` default is used when falsy
    :param dir: directory to create the file in; the ``tempfile`` default is used when None
    :return: the path of the newly created file
    """
    ts = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    kwargs = {"suffix": "{}_{}".format(ts, suffix), "dir": dir}
    if prefix:
        kwargs["prefix"] = prefix
    fd, filename = tempfile.mkstemp(**kwargs)
    # mkstemp hands back an already-open OS-level descriptor. Only the path is
    # returned to the caller, so close it here or every call leaks one fd.
    os.close(fd)
    return filename
class new_stagefile_factory:
    """Callable that maps a file name to a path inside a staging directory.

    A relative ``directory`` is placed under ``/tmp``. The directory itself is
    only created when the factory is called.
    """

    def __init__(self, directory):
        self.directory = directory if os.path.isabs(directory) else os.path.join("/tmp", directory)

    def __call__(self, name):
        stage_dir = self.directory
        os.makedirs(stage_dir, exist_ok=True)
        return os.path.join(stage_dir, name)
def merge_files(files, filename=None, num_skip_lines=0, delete=True):
    """Concat multiple files into one file.

    :param files: source file names
    :param filename: target filename, will create a tempfile if not provided
    :param num_skip_lines: skip n lines of every source file before merging it into the target file
    :param delete: delete source files after been merged
    :return: the target filename
    """
    if filename is None:
        fd, filename = tempfile.mkstemp()
        # Only the path is needed; close the descriptor so it is not leaked.
        os.close(fd)

    if not num_skip_lines and delete and len(files) == 1:
        # A single source that may be consumed: just move it into place.
        # shutil.move also works when source and target are on different
        # filesystems, where a plain os.rename would fail.
        shutil.move(files[0], filename)
        return filename

    # Concatenate in-process instead of shelling out to `cat`: paths with
    # spaces or shell metacharacters are handled correctly this way.
    with open(filename, "wb") as fout:
        for f in files:
            with open(f, "rb") as fin:
                for _ in range(num_skip_lines or 0):
                    fin.readline()
                shutil.copyfileobj(fin, fout, 1024 * 1024)
    if delete:
        remove_files_safely(files)
    return filename
def skip_lines(infile, lines, inplace=False):
    """Write a copy of ``infile`` without its first ``lines`` lines.

    :param infile: path of the file to read
    :param lines: number of leading lines to drop
    :param inplace: replace ``infile`` with the shortened copy instead of keeping a separate file
    :return: ``infile`` when ``inplace`` is true, otherwise the path of the new temporary file
    """
    tmpfile = new_tempfile()
    with open(infile, "rb") as fin, open(tmpfile, "wb") as fout:
        # consume the first n lines, then copy everything that is left
        for _ in range(lines):
            fin.readline()
        shutil.copyfileobj(fin, fout)
    if not inplace:
        return tmpfile
    # The temporary file usually lives in the system temp directory, which may
    # be on another filesystem than infile. os.rename fails in that case,
    # shutil.move falls back to copy + delete.
    shutil.move(tmpfile, infile)
    return infile
def read_lines(filename, start_line, lines_num=1):
    """Lazily yield ``lines_num`` lines of a text file, starting at the 0-based index ``start_line``."""
    stop_line = start_line + lines_num
    with open(filename) as handle:
        yield from islice(handle, start_line, stop_line)
def is_file_empty(filename):
    """Return True when the file has zero size; a missing file also counts as empty."""
    try:
        size = os.stat(filename).st_size
    except FileNotFoundError:
        return True
    return size == 0
def remove_files(files):
    """Delete every path in ``files`` (normalized with ``ensure_list``); errors propagate."""
    targets = ensure_list(files)
    for path in targets:
        os.remove(path)
def remove_files_safely(files):
    """Best-effort removal of one or more files; removal errors are swallowed.

    Every path is attempted independently, so one failing entry (already
    deleted, a directory, permission denied, ...) does not prevent the
    remaining files from being removed.

    :param files: a single path or a collection of paths, normalized with ``ensure_list``
    """
    ignored = (OSError, TypeError, ValueError)
    # The outer guard covers normalizing / iterating the argument itself.
    with contextlib.suppress(*ignored):
        for path in ensure_list(files):
            # The inner guard keeps one bad path from aborting the loop.
            with contextlib.suppress(*ignored):
                os.unlink(path)
def remove_files_by_pattern(pattern):
    """Log and then remove (best effort) every path matching the glob ``pattern``."""
    matched = glob.glob(pattern)
    logging.info("files to be deleted: %s", str(matched))
    remove_files_safely(matched)
def remove_folder_safely(folder):
    """Recursively delete ``folder`` if it exists, ignoring any errors raised while deleting."""
    if os.path.exists(folder):
        shutil.rmtree(folder, ignore_errors=True)
def gzip_compress(filename, target_filename=None, using_cmd=False):
    """Compress a file using gzip

    :param filename: the path of input file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the gzip command line instead of Python GzipFile to speedup
    :return: the target_filename
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".gz")
    if using_cmd:
        # Quote both paths so that names containing spaces or shell
        # metacharacters reach gzip as single, literal arguments.
        src = shlex.quote(str(filename))
        dst = shlex.quote(str(target_filename))
        _run_command(f"gzip -c {src} > {dst}")
        return target_filename
    with open(filename, "rb") as f_in, gzip.GzipFile(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
def gzip_decompress(filename, target_filename=None, using_cmd=False):
    """Decompress a gzip file

    :param filename: the path of the gzip file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the gzip command line instead of Python GzipFile to speedup
    :return: the target_filename
    """
    if target_filename is None:
        target_filename = new_tempfile()
    if using_cmd:
        # Quote both paths so that names containing spaces or shell
        # metacharacters reach gzip as single, literal arguments.
        src = shlex.quote(str(filename))
        dst = shlex.quote(str(target_filename))
        _run_command(f"gzip -d -c {src} > {dst}")
        return target_filename
    with gzip.GzipFile(filename, "rb") as f_in, open(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
def bzip2_compress(filename, target_filename=None, using_cmd=False):
    """Compress a file using bzip2

    :param filename: the path of input file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the bzip2 command line instead of Python BZ2File to speedup
    :return: the target_filename
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".bz2")
    if using_cmd:
        # Quote both paths so that names containing spaces or shell
        # metacharacters reach bzip2 as single, literal arguments.
        src = shlex.quote(str(filename))
        dst = shlex.quote(str(target_filename))
        _run_command(f"bzip2 -c {src} > {dst}")
        return target_filename
    with open(filename, "rb") as f_in, bz2.BZ2File(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
def bzip2_decompress(filename, target_filename=None, using_cmd=False):
    """Decompress a bzip2 file

    :param filename: the path of the bzip2 file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the bzip2 command line instead of Python BZ2File to speedup
    :return: the target_filename
    """
    if target_filename is None:
        target_filename = new_tempfile()
    if using_cmd:
        # Quote both paths so that names containing spaces or shell
        # metacharacters reach bzip2 as single, literal arguments.
        src = shlex.quote(str(filename))
        dst = shlex.quote(str(target_filename))
        _run_command(f"bzip2 -d -c {src} > {dst}")
        return target_filename
    with bz2.BZ2File(filename, "rb") as f_in, open(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
def zip_compress(filename, target_filename=None, using_cmd=False, arcname=None):
    """Compress a file using zip

    :param filename: the path of input file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the zip command line instead of Python ZipFile to speedup
    :param arcname: filename in the archive file, only supported with using_cmd=False
    :return: the target_filename
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".zip")
    directory, basename = os.path.split(filename.rstrip("/"))
    if using_cmd:
        # Remove the (empty) temporary file first and only reuse its name,
        # otherwise zip complains:
        # zip warning: missing end signature--probably not a zip file (did you
        # zip warning: remember to use binary mode when you transferred it?)
        # zip warning: (if you are trying to read a damaged archive try -F)
        remove_files_safely(target_filename)
        if arcname is not None:
            logging.warning("arcname is not supported while using cmd")
        # A bare file name has no directory part; a `cd` with no operand would
        # jump to $HOME, so stay in the current directory instead.
        workdir = shlex.quote(directory or ".")
        # The archive path must be absolute because the command changes
        # directory first; a relative target would otherwise end up next to
        # the input instead of where the caller (and the return value) expect.
        archive = shlex.quote(os.path.abspath(target_filename))
        _run_command(f"cd {workdir} && zip -r {archive} {shlex.quote(basename)}")
        return target_filename
    with zipfile.ZipFile(target_filename, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(filename, arcname=arcname or basename)
    return target_filename
def zip_decompress(filename, target_directory=None, using_cmd=False):
    """Decompress a .zip file

    :param filename: the path of input file
    :param target_directory: the path of output directory, a temporary directory will be made otherwise
    :param using_cmd: use the unzip command line instead of Python ZipFile to speedup
    :return: the output directory
    """
    if not target_directory:
        target_directory = tempfile.mkdtemp()
    if using_cmd:
        # Quote both paths so that names containing spaces or shell
        # metacharacters reach unzip as single, literal arguments.
        archive = shlex.quote(str(filename))
        out_dir = shlex.quote(str(target_directory))
        _run_command(f"unzip {archive} -d {out_dir}")
        return target_directory
    with zipfile.ZipFile(filename, "r") as zf:
        zf.extractall(target_directory)
    return target_directory
@contextlib.contextmanager
def ensure_remove(filename):
    """Context manager that yields ``filename`` and removes it (best effort) on exit.

    The removal also happens when the managed block raises.
    """
    try:
        yield filename
    finally:
        remove_files_safely(filename)
def schema_filename(base):
    """Return the name of the schema file that accompanies ``base`` (``<base>.schema``)."""
    name = f"{base}.schema"
    return name
def exists(path):
    """Tell whether ``path`` exists on the local filesystem."""
    found = os.path.exists(path)
    return found
def _run_command(cmd):
    """Log ``cmd`` and run it through the shell.

    The command's standard output is captured and discarded.

    :raises subprocess.CalledProcessError: when the command exits with a non-zero status
    """
    logging.info(cmd)
    subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)

recurvedata/pigeon/utils/json.py ADDED Viewed

@@ -0,0 +1,44 @@
+import datetime
+import decimal
+import json
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also serializes decimals, dates/datetimes and timedeltas.

    * ``decimal.Decimal`` becomes a float
    * ``datetime.date`` / ``datetime.datetime`` become their ISO 8601 string
    * ``datetime.timedelta`` becomes ``str(value)``
    """

    def default(self, obj):
        if isinstance(obj, datetime.timedelta):
            return str(obj)
        # datetime.datetime is a subclass of datetime.date, one check covers both
        if isinstance(obj, datetime.date):
            return obj.isoformat()
        if isinstance(obj, decimal.Decimal):
            return float(obj)
        return super().default(obj)
def json_dumps(obj, **kwargs):
    """Serialize ``obj`` to a JSON string, using ``JSONEncoder`` unless ``cls`` is given."""
    options = {'cls': JSONEncoder, **kwargs}
    return json.dumps(obj, **options)
def dump_json(obj, fp=None, **kwargs):
    """Serialize ``obj`` as human-readable JSON.

    Defaults (overridable through ``kwargs``): ``indent=4``,
    ``ensure_ascii=False``, ``sort_keys=True`` and ``cls=JSONEncoder``.

    :param obj: the object to serialize
    :param fp: None to get the JSON text back, a path (``str``) to write a file,
        or a writable text file object
    :return: the JSON string when ``fp`` is None, otherwise None
    """
    kwargs.setdefault('indent', 4)
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('sort_keys', True)
    kwargs.setdefault('cls', JSONEncoder)
    if fp is None:
        return json.dumps(obj, **kwargs)
    if isinstance(fp, str):
        # Non-ASCII characters are written verbatim (ensure_ascii=False), so
        # the file must be opened as UTF-8 rather than in the locale encoding,
        # which may be unable to encode them.
        with open(fp, 'w', encoding='utf-8') as f:
            return json.dump(obj, f, **kwargs)
    return json.dump(obj, fp, **kwargs)
def load_json(fp, **kwargs):
    """Parse JSON from a file.

    :param fp: a path (``str``) or a readable file object
    :param kwargs: forwarded to ``json.load``
    :return: the decoded object
    """
    if isinstance(fp, str):
        # JSON text is UTF-8 by specification; do not depend on the locale's
        # preferred encoding when reading the file.
        with open(fp, 'r', encoding='utf-8') as f:
            return json.load(f, **kwargs)
    return json.load(fp, **kwargs)

recurvedata/pigeon/utils/keyed_tuple.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""
+This file is taken from SQLAlchemy
+"""
class AbstractKeyedTuple(tuple):
    """Base ``tuple`` type whose key names come from the ``_fields`` attribute."""

    __slots__ = ()

    def keys(self):
        """Return a list of string key names for this :class:`.KeyedTuple`.

        .. seealso::

            :attr:`.KeyedTuple._fields`
        """
        return [*self._fields]
class KeyedTuple(AbstractKeyedTuple):
    """``tuple`` subclass that adds labeled names.

    E.g.::

        >>> k = KeyedTuple([1, 2, 3], labels=["one", "two", "three"])
        >>> k.one
        1
        >>> k.two
        2

    The behavior is similar to ``collections.namedtuple()``, but no custom
    subtype is created per set of keys: every instance receives its labels in
    place and keeps them, together with the labelled values, in its own
    ``__dict__``. Attribute assignment is rejected with ``AttributeError``.
    """

    def __new__(cls, vals, labels=None):
        instance = tuple.__new__(cls, vals)
        if not labels:
            labels = []
        else:
            # expose every value under its label as an instance attribute
            instance.__dict__.update(zip(labels, vals))
        # written through __dict__ because __setattr__ is disabled below
        instance.__dict__["_labels"] = labels
        return instance

    @property
    def _fields(self):
        """Return a tuple of string key names for this :class:`.KeyedTuple`.

        Labels that are ``None`` are left out. Provided for compatibility with
        ``collections.namedtuple()``.

        .. seealso::

            :meth:`.KeyedTuple.keys`
        """
        return tuple(label for label in self._labels if label is not None)

    def __setattr__(self, key, value):
        raise AttributeError("Can't set attribute: %s" % key)

    def _asdict(self):
        """Return the contents of this :class:`.KeyedTuple` as a plain dictionary.

        Provided for compatibility with ``collections.namedtuple()``; the keys
        are those reported by :meth:`keys`.
        """
        attributes = self.__dict__
        return {key: attributes[key] for key in self.keys()}

recurvedata/pigeon/utils/mp.py ADDED Viewed

@@ -0,0 +1,156 @@
+import logging
+import threading
+import time
+from multiprocessing import Process
+from multiprocessing.queues import Queue
+from queue import Empty, Full
+from subprocess import PIPE, STDOUT, CalledProcessError, Popen
+from typing import Any, List, Optional, Tuple, Union
def safe_join_subprocesses(workers, result_queue):
    """Collect results from ``result_queue`` until every worker has exited.

    The queue is drained repeatedly while workers are alive, so that workers
    blocked on a full queue can make progress and finish.

    :param workers: objects exposing ``is_alive()``
    :param result_queue: queue the workers put their results into
    :return: list of all items read from the queue
    """
    collected = []
    pending = list(workers)
    while pending:
        # take everything that is currently available without blocking
        while True:
            try:
                item = result_queue.get(False)
            except Empty:
                break
            collected.append(item)
        time.sleep(0.5)  # Give tasks a chance to put more data in
        if result_queue.empty():
            # nothing new arrived: re-check which workers are still running
            pending = [worker for worker in pending if worker.is_alive()]
    return collected
def has_process_fail(workers: List[Process], log=True):
    """Tell whether any finished worker ended with a non-zero exit code.

    Workers that are still alive are ignored.

    :param workers: processes to inspect
    :param log: log the first failed process that is found
    :return: True as soon as one failed worker is found, otherwise False
    """
    for p in workers:
        if not p.is_alive() and p.exitcode != 0:
            if log:
                logging.info(f"found process {p.pid} fail, exitcode {p.exitcode}")
            return True
    return False
def terminate_processes(workers: List[Process]):
    """Call ``terminate()`` on every worker that is still alive, logging before and after."""
    for p in workers:
        if not p.is_alive():
            continue
        logging.info(f"start terminate process {p.pid}")
        p.terminate()
        logging.info(f"finish terminate process {p.pid}")
def master_safe_put_queue(
    queue: Queue, obj: Any, workers: List[Process], block=True, timeout: Optional[int] = None
) -> Optional[bool]:
    """``queue.put`` for a master process that cannot hang on dead workers.

    Typical scenario: the master puts data into the queue and the workers
    consume it. With the defaults (``block=True``, ``timeout=None``), a small
    ``queue.maxsize`` and workers that have crashed, nothing consumes the
    queue any more and a plain ``queue.put`` would block the master forever.

    To avoid that, in the ``block=True`` / ``timeout=None`` case this function
    loops over ``queue.put(obj, timeout=10)``. Each time the put times out with
    ``queue.Full`` the workers are checked:

    * if a worker has exited abnormally, True is returned;
    * otherwise the workers are merely slow, and the put is retried.

    In every other case the call behaves exactly like ``queue.put``.

    :param queue: queue
    :param obj: the obj to put into queue
    :param workers: sub processes
    :param block: should block when queue has no free slot
    :param timeout: queue.put's timeout
    :return: True when a worker exited abnormally and left the put stuck; otherwise None
    """
    if timeout is not None or not block:
        return queue.put(obj, block=block, timeout=timeout)
    while True:
        try:
            return queue.put(obj, timeout=10)
        except Full:
            if has_process_fail(workers):
                return True
def safe_join_subprocesses_early_stop(workers: List[Process], result_queue: Queue) -> Tuple[List, bool]:
    """Wait for the sub workers and read their results from ``result_queue``.

    Returns when

        1) one sub worker has failed (early stop)
        or
        2) all sub workers have finished successfully

    :param workers: sub processes
    :param result_queue: queue which the sub processes put their results into
    :return: the results read so far, and the early_stop flag
    """
    collected = []
    pending = list(workers)
    last_check = time.time()
    while pending:
        # drain whatever is available without blocking
        while True:
            try:
                item = result_queue.get(False)
            except Empty:
                break
            collected.append(item)
            # while draining a busy queue, still look for failures every 10s
            if time.time() - last_check > 10:
                if has_process_fail(pending):
                    return collected, True
                last_check = time.time()
        time.sleep(0.5)  # Give tasks a chance to put more data in
        if not result_queue.empty():
            continue
        if has_process_fail(pending):
            return collected, True
        last_check = time.time()
        pending = [worker for worker in pending if worker.is_alive()]
    return collected, False
def run_subprocess(cmd: Union[str, List], stdout=PIPE, stderr=STDOUT, return_output=False, **kwargs) -> Optional[str]:
    """Run ``cmd`` as a sub process, logging its output line by line.

    :param cmd: the command, passed to ``Popen`` unchanged
    :param stdout: ``Popen`` stdout argument; output is only read when it is a pipe
    :param stderr: ``Popen`` stderr argument, merged into stdout by default
    :param return_output: collect the output lines and return them
    :param kwargs: extra keyword arguments for ``Popen``
    :return: the output lines joined by newlines when ``return_output`` is true, otherwise ``""``
    :raises CalledProcessError: when the process exits with a non-zero return code
    """
    lines: List[str] = []
    # The context manager closes the pipe and waits for the child on exit,
    # including when reading the output raises.
    with Popen(cmd, stdout=stdout, stderr=stderr, **kwargs) as p:
        logging.info(f"started sub process: {cmd}, pid: {p.pid}")
        # p.stdout is None when the caller redirected stdout somewhere else.
        if p.stdout is not None:
            while True:
                raw_line = p.stdout.readline()
                # EOF is b"" in binary mode and "" in text mode; testing for
                # emptiness handles both and cannot loop forever.
                if not raw_line:
                    break
                if isinstance(raw_line, bytes):
                    # Output is only logged/returned, so undecodable bytes are
                    # replaced rather than aborting the read loop.
                    raw_line = raw_line.decode("utf8", errors="replace")
                line = raw_line.rstrip()
                logging.info(line)
                if return_output:
                    lines.append(line)
    logging.info("sub process exited with return code %s", p.returncode)
    if p.returncode:
        raise CalledProcessError(p.returncode, p.args)
    return "\n".join(lines)
class PropagatingThread(threading.Thread):
    """Thread whose ``join`` returns the target's result or re-raises its exception.

    ``ret`` holds the return value of the target and ``exc`` the exception it
    raised; both are None until the target has finished.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialized up front so that join() is safe before the target has
        # finished (join with a timeout) or before the thread was started.
        self.exc = None
        self.ret = None

    def run(self):
        self.exc = None
        try:
            # Like Thread.run, do nothing when no target was given.
            if self._target is not None:
                self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:
            # Kept for join(), which re-raises it in the joining thread.
            self.exc = e

    def join(self, timeout=None):
        """Wait for the thread, then re-raise the target's exception or return its result.

        :param timeout: seconds to wait, None waits until the thread ends
        :return: the target's return value; None if the target has not finished yet
        """
        super().join(timeout)
        if self.exc is not None:
            raise self.exc
        return self.ret
+        return self.ret