recurvedata-lib 0.1.487 (py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/hdfs.py
@@ -0,0 +1,204 @@
import concurrent.futures
import logging
import os
import shutil

from pywebhdfs.webhdfs import PyWebHdfsClient

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.utils import extract_dict, mp


@register_connector_class('hdfs')
class HDFSConnector(object):
    def __init__(self, host, port, username=None, user_name=None, **kwargs):
        self.host = host
        self.port = port
        self.user_name = username or user_name
        extra_opts = extract_dict(kwargs, ['path_to_hosts', 'timeout', 'base_uri_pattern', 'request_extra_opts'])
        self.hdfs = PyWebHdfsClient(host=self.host, port=self.port, user_name=self.user_name, **extra_opts)

    def list_dir(self, path):
        return self.hdfs.list_dir(path)

    def make_dir(self, path, **kwargs):
        return self.hdfs.make_dir(path, **kwargs)

    def delete_file(self, path, recursive=False):
        return self.hdfs.delete_file_dir(path, recursive=recursive)

    def upload_file(self, local_path, hdfs_path=None, overwrite=True):
        if not hdfs_path:
            hdfs_path = os.path.basename(local_path)

        if not os.path.dirname(hdfs_path):
            hdfs_path = os.path.join('/tmp', hdfs_path)

        self.delete_file(hdfs_path)

        with open(local_path, 'rb') as data:
            self.hdfs.create_file(hdfs_path, data, overwrite=overwrite)
        return hdfs_path

    def upload_files(self, local_paths, hdfs_folder, num_threads=2):
        """num_threads is currently not used"""
        for lf in local_paths:
            hdfs_filename = os.path.join(hdfs_folder, os.path.basename(lf))
            self.upload_file(lf, hdfs_filename, overwrite=True)
            logging.info(f'uploaded {lf} to {hdfs_filename}')


class HDFSCliConnector(HDFSConnector):
    def __init__(self, hdfs_cli=None, **kwargs):
        if not hdfs_cli:
            hdfs_cli = shutil.which('hdfs')
        if not hdfs_cli:
            raise ValueError('could not locate hdfs command line')
        self.hdfs_cli = hdfs_cli

    def list_dir(self, path):
        raise NotImplementedError

    def make_dir(self, path, **kwargs):
        self._run_cmd(f'-mkdir {path}')

    def delete_file(self, path, recursive=False):
        self._run_cmd(f'-rm {"-r" if recursive else ""} -f {path}')

    def upload_file(self, local_path, hdfs_path=None, overwrite=True):
        if not hdfs_path:
            hdfs_path = os.path.basename(local_path)

        if not os.path.dirname(hdfs_path):
            hdfs_path = os.path.join('/tmp', hdfs_path)
        self._run_cmd(f'-put {"-f" if overwrite else ""} {local_path} {hdfs_path}')
        return hdfs_path

    def upload_files(self, local_paths, hdfs_folder, num_threads=2):
        local_path_groups = partition_files_equally(local_paths, num_threads)
        sub_cmds = [f'-put {" ".join(x)} {hdfs_folder}' for x in local_path_groups]
        pool_size = min(num_threads, len(local_path_groups))
        with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
            for _ in executor.map(self._run_cmd, sub_cmds):
                # exhaust the iterator returned by executor.map;
                # if any task raises an exception, the other tasks will be canceled by the executor
                pass

    def _run_cmd(self, sub_cmd):
        cmd = f'{self.hdfs_cli} dfs {sub_cmd}'
        logging.info(cmd)
        output = mp.run_subprocess(cmd, return_output=True, shell=True)
        if 'NotReplicatedYetException' in output:
            raise IOError('Incomplete copying from /data/oneflow to /tmp/oneflow/ !')
        return output


def partition_files_equally(local_paths, num_groups: int):
    groups = _do_partition_files_equally([(f, os.stat(f).st_size) for f in local_paths], num_groups)
    return [[x[0] for x in g] for g in groups if g]


def _do_partition_files_equally(filename_size_pairs, num_groups: int):
    """Split files into groups whose total sizes are roughly equal.

    Algorithm adapted from https://cloud.tencent.com/developer/article/1659134; the notes below
    paraphrase that article.

    Finding an exact optimum is a classic dynamic-programming style problem, but the goal here is
    to solve a practical production issue rather than pass an online judge, so a reasonably
    balanced partition is good enough.

    Input: an array of ints and a group count divisionNum.
    1. Sort the array in descending order.
    2. Compute the array's average avg.
    3. Walk through the array:
       * If the first number is >= avg, make it a group by itself, since adding the next number
         would not bring the sum closer to avg; then recompute the average over the remaining
         numbers so they get spread more evenly (limiting the impact of extreme values), and start
         the next round.
       * If the first number num is < avg, add it to the group, then look for one (or several)
         numbers whose sum gets close to delta = avg - num:
         - keep scanning; if some number k == delta, add k to the group and end this round;
         - if a > delta > b, compare further: when (delta - b) > (a - delta), add b to the group,
           set delta = delta - b, and keep scanning; when (delta - b) < (a - delta), remember
           distance = delta - b, add a to the group, keep scanning, and check whether a smaller
           distance can be found; if so pick that group, otherwise add b to the group.

    :param filename_size_pairs: file paths paired with their sizes, e.g. [(name1, size1), (name2, size2), ...]
    :param num_groups: number of groups
    """
    filename_size_pairs = sorted(filename_size_pairs, key=lambda x: x[1], reverse=True)
    total_size = sum(x[1] for x in filename_size_pairs)
    avg = total_size / num_groups
    groups = []
    for idx in range(num_groups):
        if idx == num_groups - 1:
            # last group: put everything that is left into it
            groups.append(filename_size_pairs)
            break

        if filename_size_pairs and filename_size_pairs[0][1] >= avg:
            sub_group = [filename_size_pairs[0]]
            total_size -= filename_size_pairs[0][1]
            avg = total_size / (num_groups - len(groups))
        else:
            sub_group, _ = __get_list(filename_size_pairs, avg, abs(avg))
        groups.append(sub_group)
        for item in sub_group:
            filename_size_pairs.remove(item)
    return groups


def __get_list(filename_size_pairs, delta: float, distance: float):
    result = []
    if not filename_size_pairs:
        return result, -1

    for idx, (filename, size) in enumerate(filename_size_pairs):
        if delta < size:
            continue
        if delta == size:
            result.append((filename, size))
            return result, 0
        else:
            if idx == 0:
                result.append((filename, size))
                delta -= size
                distance = abs(delta)
                tmp, d = __get_list(filename_size_pairs[idx + 1:], delta, distance)
                result.extend(tmp)
                return result, d
            else:
                dis1 = abs(filename_size_pairs[idx - 1][1] - delta)
                dis2 = abs(delta - size)
                if dis1 > dis2:
                    result.append((filename, size))
                    delta -= size
                    tmp, d = __get_list(filename_size_pairs[idx + 1:], delta, dis2)
                    result.extend(tmp)
                    return result, d
                else:
                    tmp, d = __get_list(filename_size_pairs[idx:], delta, dis2)
                    if dis1 > d:
                        result.extend(tmp)
                        return result, d
                    result.append(filename_size_pairs[idx - 1])
                    return result, dis1

    dis = abs(delta - filename_size_pairs[-1][1])
    if dis < distance:
        return filename_size_pairs[-1:], dis
    return [], -1


if __name__ == '__main__':
    data = [('233dafd9b1d0b03e6e784987fe748be5.5', 400275118),
            ('233dafd9b1d0b03e6e784987fe748be5.2', 1147688439),
            ('233dafd9b1d0b03e6e784987fe748be5.4', 1232810556),
            ('233dafd9b1d0b03e6e784987fe748be5.3', 1318304652),
            ('233dafd9b1d0b03e6e784987fe748be5.0', 1392554705),
            ('233dafd9b1d0b03e6e784987fe748be5.8', 1440314997),
            ('233dafd9b1d0b03e6e784987fe748be5.7', 1453587946),
            ('233dafd9b1d0b03e6e784987fe748be5.6', 1470806585),
            ('233dafd9b1d0b03e6e784987fe748be5.1', 1509157699),
            ('233dafd9b1d0b03e6e784987fe748be5.9', 1546082238)]
    groups = _do_partition_files_equally(data, 5)
    for g in groups:
        print(g, sum(x[1] for x in g))
    # [('233dafd9b1d0b03e6e784987fe748be5.9', 1546082238), ('233dafd9b1d0b03e6e784987fe748be5.5', 400275118)] 1946357356
    # [('233dafd9b1d0b03e6e784987fe748be5.1', 1509157699), ('233dafd9b1d0b03e6e784987fe748be5.2', 1147688439)] 2656846138
    # [('233dafd9b1d0b03e6e784987fe748be5.6', 1470806585), ('233dafd9b1d0b03e6e784987fe748be5.4', 1232810556)] 2703617141
    # [('233dafd9b1d0b03e6e784987fe748be5.7', 1453587946), ('233dafd9b1d0b03e6e784987fe748be5.3', 1318304652)] 2771892598
    # [('233dafd9b1d0b03e6e784987fe748be5.8', 1440314997), ('233dafd9b1d0b03e6e784987fe748be5.0', 1392554705)] 2832869702
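A brief usage sketch of the two connectors above (the local file paths, namenode host and staging folder are placeholders, not values from the package): HDFSCliConnector groups the files into size-balanced batches via partition_files_equally and runs one `hdfs dfs -put` per batch in a thread pool, while HDFSConnector streams each file through WebHDFS.

from recurvedata.pigeon.connector.hdfs import HDFSCliConnector, HDFSConnector, partition_files_equally

local_parts = [
    "/data/export/part-000.csv",   # placeholder local files
    "/data/export/part-001.csv",
    "/data/export/part-002.csv",
]

# Preview the size-balanced grouping that HDFSCliConnector.upload_files performs internally
# before issuing one `hdfs dfs -put` per group.
print(partition_files_equally(local_parts, num_groups=2))

# CLI-based uploads: requires the `hdfs` binary to be on PATH.
cli = HDFSCliConnector()
cli.upload_files(local_parts, "/tmp/staging_parts", num_threads=2)

# WebHDFS-based uploads: sends each file through PyWebHdfsClient, one at a time.
web = HDFSConnector(host="namenode.example.com", port="50070", user_name="hdfs")
web.upload_files(local_parts, "/tmp/staging_parts")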
recurvedata/pigeon/connector/hive_impala.py
@@ -0,0 +1,383 @@
# flake8: noqa: E402

# pylint: disable=wrong-import-position

import os
import re
import shutil
from typing import List, Optional, Union

import pyhive.hive

_ = 0  # prevent PyCharm from auto-formatting the import order
import cytoolz as toolz

# impyla breaks TCLIService, which leads to ImportError while importing pyhive
# see https://github.com/cloudera/impyla/issues/277
import impala.dbapi
import sqlalchemy
from impala.error import HiveServer2Error
from pyhive.exc import OperationalError

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.connector.dbapi import DBAPIConnector, _ShowTableLikeMixin
from recurvedata.pigeon.connector.hdfs import HDFSCliConnector, HDFSConnector
from recurvedata.pigeon.const import HIVE_FILE_FORMATS
from recurvedata.pigeon.schema import types
from recurvedata.pigeon.utils import ensure_list, trim_suffix
from recurvedata.pigeon.utils.sql import sqlformat

_hive_type_to_canonical_type = {
    "BOOLEAN": types.BOOLEAN,
    "TINYINT": types.INT8,
    "SMALLINT": types.INT16,
    "INT": types.INT32,
    "BIGINT": types.INT64,
    "FLOAT": types.FLOAT32,
    "DOUBLE": types.FLOAT64,
    "DECIMAL": types.FLOAT64,
    "REAL": types.FLOAT64,
    "TIMESTAMP": types.DATETIME,
    "DATE": types.DATE,
    "CHAR": types.STRING,
    "VARCHAR": types.STRING,
    "STRING": types.STRING,
}

_canonical_type_to_hive_type = {
    types.BOOLEAN: "BOOLEAN",
    types.INT8: "TINYINT",
    types.INT16: "SMALLINT",
    types.INT32: "INT",
    types.INT64: "BIGINT",
    types.FLOAT32: "DOUBLE",
    types.FLOAT64: "DOUBLE",
    # treat date, datetime as string
    types.DATE: "STRING",
    types.DATETIME: "STRING",
    types.STRING: "STRING",
    types.JSON: "STRING",
}


class _HiveSQLMixin:
    def create_partition_table_like(self, table, like_table, partitions):
        """Create a partitioned table LIKE an existing non-partitioned table, adding the partition keys."""
        if not self.has_table(like_table):
            raise ValueError(f"like table {like_table!r} not exists")
        partitions = [f"`{pname}` {ptype}" for pname, ptype in partitions.items()]
        partitions = ", ".join(partitions)
        with self.cursor() as cursor:
            cursor.execute(f"SELECT * FROM {like_table} LIMIT 0")
            columns = [(x[0], x[1]) for x in cursor.description]
        columns = ",\n".join("{} {}".format(*x) for x in columns)
        sql = f"""
        CREATE TABLE {table} (
            {columns}
        ) PARTITIONED BY ({partitions})
        """
        self.execute(sqlformat(sql))

    def is_table_partitioned(self, database, table):
        with self.cursor() as cursor:
            try:
                # show the table's partitions; if this does not raise, the table is partitioned
                cursor.execute(f"SHOW PARTITIONS {database}.{table}")
                return True
            except (OperationalError, HiveServer2Error) as e:
                msg = str(e).lower()
                if "table not found" in msg or "table does not exist:" in msg:
                    return False
                elif "is not a partitioned table" in msg or "table is not partitioned" in msg:
                    return False
                else:
                    raise e

    @staticmethod
    def to_canonical_type(type_code, size):
        type_code = trim_suffix(type_code, "_TYPE")
        return _hive_type_to_canonical_type.get(type_code, types.STRING)

    @staticmethod
    def from_canonical_type(canonical_type, size):
        return _canonical_type_to_hive_type.get(canonical_type, "STRING")

    def generate_create_table_ddl(self, name, schema, **kwargs):
        cols = []
        for f in schema:
            col_name = self.quote_identifier(f.name)
            if f.comment:
                cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)} COMMENT {f.comment!r}")
            else:
                cols.append(f"{col_name} {self.from_canonical_type(f.type, f.size)}")

        file_format = kwargs.get("file_format", "PARQUET")
        col_types = ", \n".join(cols)
        name = self.quote_identifier(name)
        ddl = f"CREATE TABLE {name} (\n{col_types}\n) STORED AS {file_format}"
        return ddl


@register_connector_class("hive")
class HiveConnector(_ShowTableLikeMixin, _HiveSQLMixin, DBAPIConnector):
    _sqla_driver = "hive"
    _log_query = False
    _default_port = 10000

    _complex_types = ("array", "map", "struct")

    def connect_impl(self, autocommit=False, *args, **kwargs):
        params = {
            "host": self.host,
            "port": self.port,
            "username": self.user,
            "database": self.database or "default",
        }
        if self.password:
            params.update({"password": self.password, "auth": self.kwargs["auth"]})
        hive_conf = self.hive_conf
        hive_conf.update(kwargs.get("hive_conf", {}))
        if hive_conf:
            params["configuration"] = hive_conf
        return pyhive.hive.connect(**params)

    def create_engine(self, engine_kwargs=None, url_queries=None):
        return sqlalchemy.create_engine("hive://", creator=self.connect)

    def is_hive(self):
        return True

    @toolz.memoize
    def create_hdfs_connector(self) -> Optional[HDFSConnector]:
        hdfs_options = self.kwargs.get("hdfs_options")
        if not hdfs_options:
            return None
        return HDFSConnector(**hdfs_options)

    def has_complex_type_fields(self, table):
        table = self.quote_identifier(table)
        with self.cursor() as cursor:
            cursor.execute("DESCRIBE {}".format(table))
            for r in cursor.fetchall():
                if r[0] == "":
                    break
                has_complex = any(x in r[1].lower() for x in self._complex_types)
                if has_complex:
                    return True
        return False

    def get_columns(self, table, database=None, exclude=None):
        if database is None:
            database = self.database
        with self.cursor() as cursor:
            if not self.has_table(table, database, cursor=cursor):
                raise ValueError("Table {!r} not exists in {!r}".format(table, database))
            # Hive bug https://issues.apache.org/jira/browse/HIVE-12184
            cursor.execute("USE {}".format(self.quote_identifier(database)))
            cursor.execute("DESCRIBE {}".format(self.quote_identifier(table)))
            cols = []
            for r in cursor.fetchall():
                # the following is partition information
                if r[0] == "":
                    break
                cols.append(r[0])
        if exclude:
            cols = [x for x in cols if x not in exclude]
        return cols

    def load_local_file(self, table, filepath, overwrite=True):
        hdfs_clients = []
        hdfs_cli = shutil.which("hdfs")
        if hdfs_cli:
            hdfs = HDFSCliConnector(hdfs_cli)
            hdfs_clients.append(hdfs)
        webhdfs = self.create_hdfs_connector()
        if webhdfs:
            hdfs_clients.append(webhdfs)

        exc = None
        for hdfs in hdfs_clients:
            self.logger.info(f"try to load file using {hdfs}")
            try:
                self._load_local_file_to_hive_impl(table, filepath, hdfs, overwrite)
                self.logger.info("finished load files")
            except Exception as e:
                exc = e
                self.logger.exception(f"failed to load file using {hdfs}")
            else:
                exc = None
                break

        if exc:
            raise exc

    def _load_local_file_to_hive_impl(
        self, table: str, filepath: Union[str, List[str]], hdfs: HDFSConnector, overwrite=True
    ):
        staging_folder = self.kwargs.get("hdfs_options", {}).get("staging_folder", "/tmp")
        hdfs_folder = os.path.join(staging_folder, f"{self.database}_{table}_")
        hdfs.delete_file(hdfs_folder, recursive=True)
        hdfs.make_dir(hdfs_folder)
        hdfs.upload_files(ensure_list(filepath), hdfs_folder)
        query = f"LOAD DATA INPATH '{hdfs_folder}/*' {'OVERWRITE' if overwrite else ''} INTO TABLE {table}"
        self.execute(query)
        hdfs.delete_file(hdfs_folder, recursive=True)

    def generate_ddl(self, table, database=None, if_exists=True, file_format="text"):
        file_format = file_format.lower()
        if file_format not in HIVE_FILE_FORMATS:
            raise ValueError(f"Format {file_format!r} is not supported")
        if database is None:
            database = self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} not exists in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"USE {self.quote_identifier(database)}")
            cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
            result = cursor.fetchall()

        body = ""
        for r in result[1:]:
            if "ROW FORMAT" in r[0]:
                break
            body += r[0]
        if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
        file_format_stmt = f" STORED AS {HIVE_FILE_FORMATS[file_format]}"
        return f"CREATE TABLE{if_exists_stmt}{self.quote_identifier(table)} ({body}{file_format_stmt}"

    def _add_leading_comment_impl(self, query, comment):
        comment = self._safe_comment(comment)
        return "-- {}\n{}".format(comment, query.strip("\n"))

    @property
    def hive_conf(self):
        """
        Settings applied to Hive queries, essentially the same as running ``SET key=value`` in Hive
        (numeric values must be given as strings).
        A dict, for example {
            'spark.yarn.queue': 'etl',
            'spark.app.name': 'pigeon',
            'spark.executor.instances': '3'
        }
        Note that numbers in the dict must be written as strings.
        """
        if "hive_conf" in self.kwargs:
            # hive_conf is a flat key/value mapping, so a deepcopy is not needed
            return self.kwargs["hive_conf"].copy()
        return {}

    def generate_load_staging_table_ddl(self, staging_table, table, database=None, exclude_columns=None):
        if database is None:
            database = self.database
        if exclude_columns:
            exclude_columns = [col.lower().replace("`", "") for col in exclude_columns]

        with self.cursor() as cursor:
            cursor.execute(f"USE {self.quote_identifier(database)}")
            cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
            result = cursor.fetchall()

        body = pre_row = ""
        for r in result[1:]:
            row = r[0].lower().strip()
            if row.startswith("partitioned by ("):
                continue
            if row.startswith("comment"):
                continue
            if exclude_columns:
                col_name = row.split(" ")[0].strip("`")
                if col_name in exclude_columns:
                    continue
            if row.endswith(")"):
                row = ",".join(row.rsplit(")", 1))
            if row.startswith("row format"):
                pre_row = ")".join(pre_row.rsplit(",", 1))
                body += pre_row
                break
            body += pre_row
            pre_row = row

        return f"CREATE TABLE {self.quote_identifier(staging_table)} ({body}"


@register_connector_class("impala")
class ImpalaConnector(_ShowTableLikeMixin, _HiveSQLMixin, DBAPIConnector):
    _sqla_driver = "impala"
    _default_port = 21050

    def connect_impl(self, autocommit=False, *args, **kwargs):
        params = {
            "host": self.host,
            "port": self.port,
            "database": self.database or "default",
            "user": self.user,
            "password": self.password,
        }
        if "auth_mechanism" in self.kwargs:
            params["auth_mechanism"] = self.kwargs["auth_mechanism"]
        return impala.dbapi.connect(**params)

    def create_engine(self, engine_kwargs=None, url_queries=None):
        return sqlalchemy.create_engine("impala://", creator=self.connect)

    def is_impala(self):
        return True

    def get_columns(self, table, database=None, exclude=None):
        if database is None:
            database = self.database
        with self.cursor() as cursor:
            if not self.has_table(table, database, cursor=cursor):
                raise ValueError("Table {!r} not exists in {!r}".format(table, database))
            cursor.execute("DESCRIBE {}.{}".format(self.quote_identifier(database), self.quote_identifier(table)))
            cols = [x[0] for x in cursor.fetchall()]
        if exclude:
            cols = [x for x in cols if x not in exclude]
        return cols

    def invalidate_metadata(self, table=None):
        if table:
            table = self.quote_identifier(table)
        else:
            table = ""
        query = f"INVALIDATE METADATA {table}"
        self.execute(query)

    def refresh(self, table, compute_stats=True):
        table = self.quote_identifier(table)
        queries = "REFRESH {}".format(table)
        try:
            self.execute(queries)
        except Exception as e:
            self.logger.error(f"failed to refresh, err: {e}, use INVALIDATE")
            queries = "INVALIDATE METADATA {}".format(table)
            self.execute(queries)
        if compute_stats:
            queries = "COMPUTE INCREMENTAL STATS {}".format(table)
            self.execute(queries)

    def generate_ddl(self, table, database=None, if_exists=True, file_format="text"):
        file_format = file_format.lower()
        # ORC is not supported in Impala
        # https://www.cloudera.com/documentation/enterprise/5-12-x/topics/impala_file_formats.html
        if file_format == "orc" or file_format not in HIVE_FILE_FORMATS:
            raise ValueError(f"Format {file_format!r} is not supported")
        if database is None:
            database = self.database
        if not self.has_table(table, database):
            raise ValueError(f"Table {table!r} not exists in {database!r}")

        with self.cursor() as cursor:
            cursor.execute(f"USE {self.quote_identifier(database)}")
            cursor.execute(f"SHOW CREATE TABLE {self.quote_identifier(table)}")
            body = re.search(r"\.(.*)\nSTORED", cursor.fetchall()[0][0], flags=re.S).group(1)
        if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
        file_format_stmt = f" STORED AS {HIVE_FILE_FORMATS[file_format]}"
        return f"CREATE TABLE{if_exists_stmt}{body}{file_format_stmt}"

    @toolz.memoize
    def create_hdfs_connector(self):
        hdfs_options = self.kwargs.get("hdfs_options")
        if not hdfs_options:
            return None
        return HDFSConnector(**hdfs_options)
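The hive_conf property above is passed to pyhive as its configuration argument; the sketch below shows the call that HiveConnector.connect_impl effectively assembles. The hostname, username and Spark settings are placeholder values, not defaults shipped with the package.

import pyhive.hive

# Numeric settings must be strings, as the hive_conf docstring above notes.
hive_conf = {
    "spark.yarn.queue": "etl",
    "spark.app.name": "pigeon",
    "spark.executor.instances": "3",
}

conn = pyhive.hive.connect(
    host="hive.example.com",      # placeholder HiveServer2 host
    port=10000,                   # HiveConnector._default_port
    username="etl_user",          # placeholder user
    database="default",
    configuration=hive_conf,      # equivalent to running `SET key=value` for the session
)
cursor = conn.cursor()
cursor.execute("SHOW TABLES")
print(cursor.fetchall())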
recurvedata/pigeon/connector/microsoft_fabric.py
@@ -0,0 +1,95 @@
from collections import OrderedDict
from typing import Any, Optional

import pyodbc

from recurvedata.pigeon.connector._registry import register_connector_class
from recurvedata.pigeon.connector.mssql import BaseAzureSQLConnector
from recurvedata.pigeon.schema import types
from recurvedata.pigeon.utils import safe_int


@register_connector_class("microsoft_fabric")  # type: ignore
class MsFabricConnector(BaseAzureSQLConnector):
    """Connector for Microsoft Fabric.

    This connector extends BaseAzureSQLConnector to support Microsoft Fabric specific features:
    - Azure AD authentication
    - Workspace-level access control
    - Special ODBC driver configuration
    """

    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        database: str | None = None,
        schema: str | None = None,
        tenant_id: str | None = None,
        client_id: str | None = None,
        client_secret: str | None = None,
        authentication: str = "ServicePrincipal",
        odbc_driver: str = "ODBC Driver 18 for SQL Server",
        encrypt: bool = True,
        trust_server_certificate: bool = False,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(host, port, database, schema=schema, *args, **kwargs)
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.authentication = authentication
        self.odbc_driver = odbc_driver
        self.driver = "mssql+pyodbc"
        self.encrypt = encrypt
        self.trust_server_certificate = trust_server_certificate

    def _get_sqlalchemy_uri(self) -> str:
        """Generate SQLAlchemy URI for Microsoft Fabric."""
        return (
            f"{self.driver}://{self.client_id}:{self.client_secret}@{self.host}:{self.port}/"
            f"{self.database}?driver={self.odbc_driver}&encrypt={self.encrypt}&trust_server_certificate={self.trust_server_certificate}"
        )

    def is_fabric(self) -> bool:
        """Check if this is a Microsoft Fabric connector."""
        return True

    @staticmethod
    def to_canonical_type(type_code: Any, size: Optional[int] = None) -> str:
        """Convert Microsoft Fabric type to canonical type."""
        return BaseAzureSQLConnector.to_canonical_type(type_code, size)

    @staticmethod
    def from_canonical_type(canonical_type: str, size: Optional[int] = None) -> str:
        """Convert canonical type to Microsoft Fabric type."""
        if canonical_type == types.STRING:
            if size is None or size == 0:
                return "VARCHAR(max)"
            safe_size = safe_int(size * 4)
            if safe_size > 4000:
                return "VARCHAR(max)"
            return f"VARCHAR({safe_size})"
        return BaseAzureSQLConnector.from_canonical_type(canonical_type, size)

    @property
    def conn_string(self) -> str:
        """Generate connection string for Microsoft Fabric with Azure AD authentication."""
        options = OrderedDict(
            {
                "Driver": f"{{{self.odbc_driver}}}",
                "Server": f"{self.host}",
                "Database": str(self.database),
                "Authentication": "ActiveDirectoryServicePrincipal",
                "Encrypt": "yes" if self.encrypt else "no",
                "TrustServerCertificate": "yes" if self.trust_server_certificate else "no",
                "Uid": self.client_id,
                "Pwd": self.client_secret,
                "Connection Timeout": 30,
            }
        )
        return ";".join([f"{k}={v}" for k, v in options.items()])

    def connect_impl(self, autocommit=False, *args, **kwargs):
        return pyodbc.connect(self.conn_string, autocommit=autocommit)