recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/pigeon/connector/owncloud.py

@@ -0,0 +1,92 @@
+import logging
+import os
+import traceback
+
+import pandas as pd
+from owncloud import Client, HTTPResponseError
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.utils.fs import new_tempfile, remove_files_safely
+
+
+class OwncloudException(Exception):
+    pass
+
+
+class OwncloudDownloadException(OwncloudException):
+    pass
+
+
+class OwncloudUploadException(OwncloudException):
+    pass
+
+
+class NewOwncloudClient(Client):
+    def get_webdav_url(self):
+        return self._webdav_url
+
+
+@register_connector_class('owncloud')
+class OwncloudConnector(object):
+    def __init__(self, url: str = None, user: str = None, password: str = None, **kwargs):
+        self.url = url
+        self.user = user
+        self.password = password
+        self.oc = NewOwncloudClient(url, **kwargs)
+        self.oc.login(user, password)
+
+    def download_file(self, remote_path: str, local_path: str):
+        logging.info(f'Downloading remote file {remote_path} to {local_path}.')
+        try:
+            status = self.oc.get_file(remote_path, local_path)
+            if status:
+                logging.info(f'Successfully download remote file {remote_path} to {local_path}.')
+            else:
+                raise OwncloudDownloadException(f'Failed to download remote file {remote_path}, unknown error.')
+        except HTTPResponseError as e:
+            logging.error(traceback.format_exc())
+            raise OwncloudDownloadException(
+                f'Failed to download remote file {remote_path}, HTTPResponseError {e.res}.'
+            )
+        return status
+
+    def upload_file(self, remote_path: str, local_source_file: str, **kwargs):
+        logging.info(f'Uploading local file {local_source_file} to {remote_path}.')
+        try:
+            res = self.oc.put_file(remote_path, local_source_file, **kwargs)
+            logging.info(f'Successfully upload local file {local_source_file} to remote {remote_path}.')
+        except Exception as e:
+            logging.error(traceback.format_exc())
+            raise OwncloudUploadException(
+                f'Failed to upload local file {local_source_file} to remote {remote_path}, {e.args}.'
+            )
+        return res
+
+    def get_pandas_df(self, remote_path: str) -> pd.DataFrame:
+        temp_file_path = new_tempfile()
+        if self.download_file(remote_path, temp_file_path):
+            file_type = os.path.splitext(remote_path)[-1]
+            try:
+                if file_type and file_type.lower() in ('.xlsx', '.xls'):
+                    df = pd.read_excel(temp_file_path)
+                elif file_type and file_type.lower() in ('.parquet', '.parq'):
+                    df = pd.read_parquet(temp_file_path)
+                elif file_type and file_type.lower() == '.json':
+                    df = pd.read_json(temp_file_path)
+                else:
+                    df = pd.read_csv(temp_file_path)
+            except Exception as e:
+                logging.error(traceback.format_exc())
+                raise ValueError(f'Failed to load remote file {remote_path} to pandas df, {e.args}.')
+            finally:
+                remove_files_safely(temp_file_path)
+            logging.info(f'Successfully load remote file {remote_path} to pandas df, {len(df)} rows.')
+            return df
+
+    @property
+    def webdav_url(self):
+        return self.oc.get_webdav_url()
+
+    @property
+    def http_auth_conf(self):
+        return {'username': f'{self.user}', 'password': f'{self.password}'}
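For orientation, a minimal usage sketch of the connector above. The server URL, credentials, and paths are placeholders, and the import path is assumed from the file listing:

    from recurvedata.pigeon.connector.owncloud import OwncloudConnector

    # Placeholder server and credentials.
    oc = OwncloudConnector(url='https://cloud.example.com', user='alice', password='secret')

    # Arguments are (remote_path, local_source_file).
    oc.upload_file('/reports/sales.csv', '/tmp/sales.csv')

    # Picks a pandas reader from the remote extension; unrecognized
    # extensions fall back to read_csv.
    df = oc.get_pandas_df('/reports/sales.csv')
    print(len(df), 'rows')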
recurvedata/pigeon/connector/postgresql.py

@@ -0,0 +1,267 @@
+import psycopg2
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.connector.dbapi import ClosingCursor, DBAPIConnector, NullCursor
+from recurvedata.pigeon.schema import types
+
+_pg_type_to_canonical_type = {
+    16: types.BOOLEAN,
+    21: types.INT16,
+    23: types.INT32,
+    20: types.INT64,
+    114: types.JSON,
+    700: types.FLOAT32,
+    701: types.FLOAT64,
+    1700: types.FLOAT64,
+    1114: types.DATETIME,
+    1184: types.DATETIME,
+    1082: types.DATE,
+    1043: types.STRING,
+    1014: types.STRING,
+    1015: types.STRING,
+    1008: types.STRING,
+    1009: types.STRING,
+    2951: types.STRING,
+}
+
+canonical_type_to_pg_type = {
+    types.BOOLEAN: "BOOLEAN",
+    types.INT8: "INT2",
+    types.INT16: "INT2",
+    types.INT32: "INT4",
+    types.INT64: "INT8",
+    types.FLOAT32: "FLOAT4",
+    types.FLOAT64: "FLOAT8",
+    types.DATETIME: "TIMESTAMP",
+    types.DATE: "DATE",
+    types.STRING: "TEXT",
+    types.JSON: "JSON",
+}
+
+
+class NamedCursor(ClosingCursor):
+    """NamedCursor is a server side cursor, using DECLARE and FETCH internally
+    http://initd.org/psycopg/docs/usage.html#server-side-cursors
+    """
+
+    def __init__(self, connection, commit_on_close=True, name=None):
+        self.connection = connection
+        self._commit_on_close = commit_on_close
+        if name is not None:
+            self._cursor = connection.cursor(name, withhold=True)
+            self._cursor.itersize = 1000
+        else:
+            self._cursor = connection.cursor()
+
+
+@register_connector_class(["postgres", "postgresql"])
+class PostgresConnector(DBAPIConnector):
+    _sqla_driver = "postgresql+psycopg2"
+    _identifier_start_quote = '"'
+    _identifier_end_quote = '"'
+    _default_port = 5432
+
+    def connect_impl(self, autocommit=False, *args, **kwargs):
+        conn = psycopg2.connect(
+            host=self.host,
+            port=self.port,
+            user=self.user,
+            password=self.password,
+            database=self.database,
+            *args,
+            **kwargs,
+        )
+        conn.autocommit = autocommit
+
+        if self.schema:
+            with conn.cursor() as cursor:
+                cursor.execute(f"SET search_path TO {self.schema}, public")
+
+        return conn
+
+    def cursor(self, autocommit=False, dryrun=False, commit_on_close=True, **kwargs):
+        """Returns a DBAPI cursor"""
+        if dryrun:
+            return NullCursor()
+        cursor_name = kwargs.pop("cursor_name", None)
+        conn = self.connect(autocommit, **kwargs)
+        return NamedCursor(conn, commit_on_close=commit_on_close, name=cursor_name)
+
+    def has_table(self, table, database=None, schema="public", **kwargs):
+        schema, table = self._get_schema_table(table, schema)
+
+        if database is not None and database != self.database:
+            conn = self.clone()
+            conn.database = database
+        else:
+            conn = self
+        with conn.cursor() as cursor:
+            cursor.execute(
+                """
+                SELECT EXISTS (
+                    SELECT 1 FROM information_schema.tables
+                    WHERE table_name = %s AND table_schema = %s
+                )
+                """,
+                (table, schema),
+            )
+            return bool(cursor.fetchone()[0])
+
+    def get_columns(self, table, schema="public", database=None):
+        schema, table = self._get_schema_table(table, schema)
+        if database is None:
+            database = self.database
+        if not self.has_table(table, database, schema=schema):
+            raise ValueError("Table {!r}.{!r} not exists in {!r}".format(schema, table, database))
+        with self.cursor() as cursor:
+            cursor.execute('SELECT * FROM "{}"."{}" LIMIT 0'.format(schema, table))
+            cursor.fetchall()
+            return [x[0] for x in cursor.description]
+
+    def generate_ddl(self, table, schema="public", database=None, field_filter=(), if_exists=True):
+        schema, table = self._get_schema_table(table, schema)
+        if database is None:
+            database = self.database
+        if not self.has_table(table, database, schema=schema):
+            raise ValueError(f"Table {schema!r}.{table!r} not exists in {database!r}")
+
+        with self.cursor() as cursor:
+            # get table comment
+            tbl_comment_sql = f"""
+                SELECT pgd.description AS table_comment
+                FROM pg_catalog.pg_description pgd
+                WHERE pgd.objsubid = 0 AND pgd.objoid = (SELECT c.oid
+                    FROM pg_catalog.pg_class c
+                    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+                    WHERE n.nspname = {schema!r}
+                    AND c.relname = {table!r} AND
+                    c.relkind IN ('r', 'v', 'm', 'f'));
+            """
+            cursor.execute(tbl_comment_sql)
+            t_comment = cursor.fetchall()
+            # get columns
+            col_comment_sql = f"""
+                SELECT
+                    a.attname AS "field",
+                    pg_catalog.format_type(a.atttypid, a.atttypmod) AS "type",
+                    (SELECT pg_catalog.pg_get_expr(d.adbin, d.adrelid)
+                     FROM pg_catalog.pg_attrdef d
+                     WHERE d.adrelid = a.attrelid AND d.adnum = a.attnum
+                     AND a.atthasdef)
+                    AS "default",
+                    a.attnotnull AS "isnull",
+                    pgd.description AS "comment"
+                FROM pg_catalog.pg_attribute a
+                LEFT JOIN pg_catalog.pg_description pgd ON (
+                    pgd.objoid = a.attrelid AND pgd.objsubid = a.attnum)
+                WHERE a.attrelid = (SELECT c.oid
+                    FROM pg_catalog.pg_class c
+                    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+                    WHERE n.nspname = {schema!r}
+                    AND c.relname = {table!r} AND c.relkind IN ('r', 'v', 'm', 'f'))
+                AND a.attnum > 0 AND NOT a.attisdropped
+                ORDER BY a.attnum;
+            """
+            cursor.execute(col_comment_sql)
+            col_info = cursor.fetchall()
+
+        if t_comment:
+            comments = [f"COMMENT ON TABLE {self.quote_identifier(table)} IS {t_comment[0][0]!r};"]
+        else:
+            comments = []
+
+        cols = []
+        for col in col_info:
+            if col in field_filter:
+                continue
+            default = f" DEFAULT {col[2]}" if col[2] else ""
+            isnull = " NOT NULL " if col[3] else ""
+            if "character varying" in col[1]:
+                ctype = col[1].replace("character varying", "varchar")
+                cols.append(f"{self.quote_identifier(col[0])} {ctype}{isnull}{default}")
+            else:
+                cols.append(f"{self.quote_identifier(col[0])} {col[1]}{isnull}{default}")
+            if col[4]:
+                comments.append(
+                    f"COMMENT ON COLUMN {self.quote_identifier(table)}.{self.quote_identifier(col[0])} IS {col[4]!r};"
+                )
+        if_exists_stmt = " IF NOT EXISTS " if if_exists else " "
+        cols_stmt = ", ".join(cols)
+        comments_stmt = " ".join(comments)
+        return f"CREATE TABLE{if_exists_stmt}{self.quote_identifier(table)} ({cols_stmt}); {comments_stmt}"
+
+    def is_postgres(self):
+        return True
+
+    @staticmethod
+    def to_canonical_type(type_code, size):
+        return _pg_type_to_canonical_type.get(type_code, types.STRING)
+
+    @staticmethod
+    def from_canonical_type(canonical_type, size):
+        return canonical_type_to_pg_type.get(canonical_type, "TEXT")
+
+    def load_csv(
+        self,
+        table,
+        filename,
+        schema="public",
+        columns=None,
+        delimiter=",",
+        quotechar='"',
+        lineterminator="\r\n",
+        escapechar=None,
+        skiprows=0,
+        using_insert=True,
+        **kwargs,
+    ):
+        # if using_insert:
+        #     method = self.load_csv_by_inserting
+        # else:
+        #     method = self._copy_csv
+        if not using_insert:
+            self.logger.warning("load file directly is not implemented yet, fallback to using bulk INSERT")
+
+        method = self.load_csv_by_inserting
+        schema, table = self._get_schema_table(table, schema)
+        table = self._format_table_name(table, schema)
+
+        return method(
+            table, filename, columns, delimiter, quotechar, lineterminator, escapechar, skiprows=skiprows, **kwargs
+        )
+
+    def _copy_csv(
+        self,
+        table,
+        filename,
+        columns=None,
+        delimiter=",",
+        quotechar='"',
+        lineterminator="\r\n",
+        escapechar=None,
+        skiprows=0,
+        **kwargs,
+    ):
+        conn = self.connect()
+        cursor = conn.cursor()
+        self.logger.info("copy file %s into %s", filename, table)
+        with open(filename, "r") as f:
+            if skiprows:
+                for _ in range(skiprows):
+                    f.readline()
+            # the copy_from method does not support standard CSV
+            cursor.copy_from(f, table, sep=delimiter, columns=columns)
+            conn.commit()
+        conn.close()
+
+    def _get_schema_table(self, table, schema):
+        if "." in table:
+            schema, table = table.split(".")
+        if not schema:
+            schema = "public"
+        return schema, table
+
+    def _format_table_name(self, table, schema):
+        if schema and "." not in table:
+            table = self.quote_identifier(f"{schema}.{table}")
+        return table
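A hedged sketch of driving the connector above. The constructor arguments are assumptions inferred from the attributes connect_impl() reads; the DBAPIConnector base class that actually accepts them is not part of this hunk:

    from recurvedata.pigeon.connector.postgresql import PostgresConnector

    # Assumed constructor; host/port/user/password/database live on the base class.
    pg = PostgresConnector(host='localhost', port=5432, user='app',
                           password='secret', database='analytics')

    # Passing cursor_name yields a server-side (named) cursor, which streams
    # rows in itersize=1000 batches instead of materializing the result set.
    with pg.cursor(cursor_name='bulk_scan') as cur:
        cur.execute('SELECT id, payload FROM public.events')
        rows = cur.fetchall()

    # CREATE TABLE statement plus COMMENT ON statements for an existing table.
    print(pg.generate_ddl('public.events'))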
recurvedata/pigeon/connector/power_bi.py

@@ -0,0 +1,179 @@
+import copy
+import logging
+import time
+from typing import Dict, List, Union
+
+import msal
+import pandas as pd
+import requests
+
+config = dict(
+    # Can be set to 'MasterUser' or 'ServicePrincipal'
+    AUTHENTICATION_MODE='ServicePrincipal',
+    POWER_BI_TENANT_ID='',
+    POWER_BI_CLIENT_ID='',
+    # Client Secret (App Secret) of the AAD app. Required only for ServicePrincipal authentication mode.
+    POWER_BI_CLIENT_SECRET='',
+    # Scope of AAD app. Use the below configuration to use all the permissions provided in the AAD (Azure Active Directory) app through Azure portal.
+    POWER_BI_SCOPE=['https://analysis.windows.net/powerbi/api/.default'],  # global cloud
+    POWER_BI_SCOPE_CN=['https://analysis.chinacloudapi.cn/powerbi/api/.default'],  # China region
+    # URL used for initiating authorization request
+    POWER_BI_AUTHORITY='https://login.microsoftonline.com/tenant_id',
+    POWER_BI_AUTHORITY_CN='https://login.chinacloudapi.cn/tenant_id',
+    POWER_BI_API_URL_PREFIX='https://api.powerbi.com/v1.0/myorg',
+    POWER_BI_API_URL_PREFIX_CN='https://api.powerbi.cn/v1.0/myorg'
+)
+
+
+class PBIRefreshFailedException(Exception):
+    pass
+
+
+class PBIRefreshTimeoutException(Exception):
+    pass
+
+
+class PowerBI:
+
+    def __init__(self, tenant_id: str, client_id: str, client_secret: str, **kwargs):
+        self.config = copy.deepcopy(config)
+        self.config["POWER_BI_TENANT_ID"] = tenant_id
+        self.config["POWER_BI_CLIENT_ID"] = client_id
+        self.config["POWER_BI_CLIENT_SECRET"] = client_secret
+        for k, v in kwargs.items():
+            if k in self.config:
+                self.config[k] = v
+        self.access_token = None
+        self.token_abort_time = None
+
+    def get_access_token(self):
+        if self.access_token is not None and time.time() < self.token_abort_time:
+            return self.access_token
+        try:
+            # Service Principal auth is recommended by Microsoft for App Owns Data Power BI embedding
+            authority = self.config['POWER_BI_AUTHORITY_CN'].replace('tenant_id', self.config['POWER_BI_TENANT_ID'])
+            client_app = msal.ConfidentialClientApplication(
+                client_id=self.config['POWER_BI_CLIENT_ID'],
+                client_credential=self.config['POWER_BI_CLIENT_SECRET'],
+                authority=authority
+            )
+            # Make a client call if Access token is not available in cache
+            response = client_app.acquire_token_for_client(scopes=self.config['POWER_BI_SCOPE_CN'])
+            self.access_token = response
+            self.token_abort_time = time.time() + (response["expires_in"] - 60)
+
+            return response
+
+        except Exception as ex:
+            raise Exception('Error retrieving Access token\n' + str(ex))
+
+    @property
+    def request_header(self):
+        """
+        Get Power BI API request header
+        """
+        access_token = self.get_access_token()
+        return {
+            'Content-Type': 'application/json',
+            'Authorization': f"{access_token['token_type']} {access_token['access_token']}"
+        }
+
+    def get_refresh_job_info(self, group_id: str, dataset_id: str, request_id: str = None, limit: int = 10) -> Union[Dict, List[Dict]]:
+        """
+        Parameters:
+            group_id: The workspace ID
+            dataset_id: The dataset ID
+            request_id: if given, return the matching request Dict; otherwise return a List[Dict]
+            limit: number of recent requests (descending), default 10
+        """
+        url = f"{self.config['POWER_BI_API_URL_PREFIX_CN']}/groups/{group_id}/datasets/{dataset_id}/refreshes/?$top={limit}"
+        r = requests.get(url, headers=self.request_header)
+        r.raise_for_status()
+        ret = r.json()["value"]
+        if request_id:
+            return next(filter(lambda x: x["requestId"] == request_id, ret), None)
+        return ret
+
+    def refresh_dataset_in_group(self, group_id: str, dataset_id: str, is_wait: bool = True, timeout: int = 300, check_interval: int = 20, limit: int = 10):
+        """
+        Prefer refresh_datasets().
+
+        https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/refresh-dataset-in-group
+        Limitation:
+            For Shared capacities, a maximum of 8 requests per day, including refreshes executed by using scheduled refresh, can be initiated.
+
+        Parameters:
+            group_id: The workspace ID
+            dataset_id: The dataset ID
+            is_wait: wait until the refresh finishes; defaults to waiting
+            timeout: Default timeout is 5 minutes if is_wait is True
+            check_interval: Default 20 seconds
+            limit: number of recent requests (descending)
+
+        Returns:
+            The refresh job information
+        """
+
+        logging.info(f"Start refreshing dataset {dataset_id} in group {group_id}")
+        url = f"{self.config['POWER_BI_API_URL_PREFIX_CN']}/groups/{group_id}/datasets/{dataset_id}/refreshes"
+        r = requests.post(url, headers=self.request_header)
+        r.raise_for_status()
+        request_id = r.headers.get("RequestId")
+        # get refresh job information
+        job_info = self.get_refresh_job_info(group_id, dataset_id, request_id, limit)
+        if not job_info:  # if the specific job is not returned yet, wait and retry
+            time.sleep(5)
+            job_info = self.get_refresh_job_info(group_id, dataset_id, request_id, limit)
+        logging.info(f"Refresh detail: request_id -> {request_id}, job_info -> {job_info}")
+        if not is_wait:
+            if job_info["status"] == "Failed":
+                logging.info(f"Refresh failed: {dataset_id}")
+                raise PBIRefreshFailedException(job_info)
+            return job_info
+        abort_time = time.time() + timeout
+        while job_info["status"] != "Completed":
+            if job_info["status"] == "Failed":
+                logging.info(f"Refresh failed: {dataset_id}")
+                raise PBIRefreshFailedException(job_info)
+            time.sleep(check_interval)
+            if time.time() > abort_time:
+                logging.info(f"Refresh timeout: {dataset_id}")
+                raise PBIRefreshTimeoutException(job_info)
+            job_info = self.get_refresh_job_info(group_id, dataset_id, request_id, limit)
+            logging.info(f"Retry: {job_info}")
+        logging.info(f"Refresh completed: {dataset_id}")
+        return job_info
+
+    def refresh_datasets(self, refresh_list: pd.DataFrame, is_wait: bool = True, timeout: int = 300, check_interval: int = 20, limit: int = 10):
+        """
+        Pass a DataFrame with group_id and dataset_id columns; returns a per-status summary when refreshing is done.
+        """
+        summary = {
+            "Completed": [], "Failed": [], "Timeout": [], "Error": []
+        }
+        if not {"group_id", "dataset_id"}.issubset(refresh_list.columns):
+            raise Exception(f"Wrong columns: input must include group_id and dataset_id, but the given dataframe has {refresh_list.columns.to_list()} columns.")
+        for _, row in refresh_list.iterrows():
+            group_id, dataset_id = row["group_id"], row["dataset_id"]
+            try:
+                job_info = self.refresh_dataset_in_group(group_id, dataset_id, is_wait, timeout, check_interval, limit)
+                summary["Completed"].append({"group_id": group_id, "dataset_id": dataset_id, "job_info": job_info})
+            except PBIRefreshFailedException as e:
+                summary["Failed"].append({"group_id": group_id, "dataset_id": dataset_id, "job_info": e.args[0]})
+            except PBIRefreshTimeoutException as e:
+                summary["Timeout"].append({"group_id": group_id, "dataset_id": dataset_id, "job_info": e.args[0]})
+            except Exception as e:
+                summary["Error"].append({"group_id": group_id, "dataset_id": dataset_id, "reason": repr(e)})
+        return summary
+
+    def get_datasets_in_group(self, group_id: str) -> pd.DataFrame:
+        """
+        https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/get-datasets-in-group
+        Returns a list of datasets from the specified workspace.
+        """
+        url = f"{self.config['POWER_BI_API_URL_PREFIX_CN']}/groups/{group_id}/datasets"
+        r = requests.get(url, headers=self.request_header)
+        r.raise_for_status()
+        df = pd.DataFrame(r.json()["value"]).rename(columns={"id": "dataset_id"})
+        df["group_id"] = group_id
+        return df
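A minimal sketch of a refresh run, with hypothetical tenant, workspace, and dataset IDs. Note that the class as published always uses the *_CN (China cloud) authority, scope, and API prefix:

    import pandas as pd
    from recurvedata.pigeon.connector.power_bi import PowerBI

    pbi = PowerBI(tenant_id='<tenant-id>', client_id='<aad-app-id>',
                  client_secret='<aad-app-secret>')

    # One row per dataset to refresh; the columns must be group_id and dataset_id.
    refresh_list = pd.DataFrame([
        {'group_id': '<workspace-id>', 'dataset_id': '<dataset-id>'},
    ])
    summary = pbi.refresh_datasets(refresh_list, timeout=600)
    print(summary['Completed'], summary['Failed'], summary['Timeout'], summary['Error'])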
recurvedata/pigeon/connector/qcloud_cos.py

@@ -0,0 +1,79 @@
+import os
+
+import qcloud_cos
+
+from recurvedata.pigeon.connector._registry import register_connector_class
+from recurvedata.pigeon.utils.timing import ProgressCallback
+
+
+@register_connector_class("cos")
+class COSConnector(object):
+    def __init__(self, secret_id, secret_key, region, proxies=None, endpoint=None, **kwargs):
+        self.secret_id = secret_id
+        self.secret_key = secret_key
+        self.region = region
+
+        self.config = qcloud_cos.CosConfig(
+            Region=region, SecretId=secret_id, SecretKey=secret_key, Endpoint=endpoint, Proxies=proxies
+        )
+        self.cos = qcloud_cos.CosS3Client(self.config)
+
+    def has_bucket(self, bucket_name):
+        return self.cos.bucket_exists(bucket_name)
+
+    def create_bucket(self, bucket_name):
+        if not self.has_bucket(bucket_name):
+            self.cos.create_bucket(bucket_name)
+
+    def delete_bucket(self, bucket_name):
+        if self.has_bucket(bucket_name):
+            self.cos.delete_bucket(bucket_name)
+
+    def has_object(self, bucket_name, key):
+        return self.cos.object_exists(bucket_name, key)
+
+    def delete_object(self, bucket_name, key):
+        self.cos.delete_object(bucket_name, key)
+
+    def list_objects(self, bucket_name, prefix=""):
+        res = self.cos.list_objects(Bucket=bucket_name, Prefix=prefix)
+        return [x["Key"] for x in res.get("Contents", [])]
+
+    def delete_keys_by_prefix(self, bucket_name, prefix):
+        keys = self.list_objects(bucket_name, prefix)
+        for key in keys:
+            self.delete_object(bucket_name, key)
+
+    def upload(self, bucket_name, filename, key=None, folder=None, overwrite=True, num_threads=4, **kwargs):
+        if not key:
+            key = os.path.basename(filename)
+        if folder:
+            key = os.path.join(folder, key)
+
+        if not overwrite:
+            if self.has_object(bucket_name=bucket_name, key=key):
+                return key
+
+        self.cos.upload_file(
+            Bucket=bucket_name,
+            LocalFilePath=filename,
+            Key=key,
+            MAXThread=num_threads,
+            progress_callback=ProgressCallback(),
+        )
+        return key
+
+    def download(self, bucket_name, key, folder=None, filename=None, overwrite=True, num_threads=4, **kwargs):
+        if not self.has_object(bucket_name, key):
+            raise ValueError(f"{key} not exists in {bucket_name}")
+
+        if not filename:
+            filename = os.path.basename(key)
+        if folder:
+            filename = os.path.join(folder, filename)
+
+        if not overwrite and os.path.exists(filename):
+            return filename
+
+        self.cos.download_file(Bucket=bucket_name, Key=key, DestFilePath=filename, MAXThread=num_threads)
+        return filename