recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic. Click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os.path
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
from slugify import slugify
|
|
6
|
+
|
|
7
|
+
from recurvedata.pigeon.connector import get_connector, get_connector_class
|
|
8
|
+
from recurvedata.pigeon.const import LOAD_APPEND, LOAD_MERGE
|
|
9
|
+
from recurvedata.pigeon.handler.csv_handler import create_csv_file_handler_factory
|
|
10
|
+
from recurvedata.pigeon.loader import CSVToHiveLoader, CSVToMySQLLoader, CSVToRedshiftLoader
|
|
11
|
+
from recurvedata.pigeon.utils import ensure_list, fs
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_db_table(db_table):
|
|
17
|
+
t_db_table = db_table.split(".")
|
|
18
|
+
if len(t_db_table) == 2:
|
|
19
|
+
db, table = t_db_table
|
|
20
|
+
schema = None
|
|
21
|
+
elif len(t_db_table) == 3:
|
|
22
|
+
db, schema, table = t_db_table
|
|
23
|
+
else:
|
|
24
|
+
raise ValueError(f"Invalid database and table {db_table!r}")
|
|
25
|
+
return {
|
|
26
|
+
"database": db,
|
|
27
|
+
"schema": schema,
|
|
28
|
+
"table": table,
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Location:
    """A dump/load endpoint parsed from a spec string or keyword arguments.

    Syntax:
        Full URL form:     {protocol}://{user}:{password}@{host}:{port}/{database}.{table}
        Short form:        {dbconf}:{database}.{table}
        Local file:        file://{path}
        Local file short:  file:{path}
    Example:
        mysql://dev:pass@172.16.24.93:3306/testdb.test
        tidb:testdb.test
        file:///tmp/result.csv
        file:/tmp/result.csv
    """

    ATTRS = ["protocol", "user", "password", "host", "port", "database", "schema", "table", "dbconf", "path"]

    def __init__(self, spec=None, **kwargs):
        if not kwargs:
            if not spec:
                raise ValueError("Location spec is required")
            if spec.startswith("file:"):
                self._from_file(spec)
            elif "://" in spec:
                self._from_url(spec)
            else:
                self._from_simple(spec)
        else:
            # When kwargs are given, a bare spec is treated as the dbconf name.
            if spec is not None:
                kwargs["dbconf"] = spec
            self._from_dict(kwargs)

    def __str__(self):
        if self.protocol == "file":
            return self._format_file()
        if self.dbconf:
            return self._format_simple()
        else:
            return self._format_url()

    def _format_file(self):
        return f"{self.protocol}:{self.path}"

    def _format_url(self, hide_password=True):
        """Render the full-URL form; the password is masked unless asked otherwise."""
        ret = [f"{self.protocol}://"]
        if self.user:
            ret.append(str(self.user))
            if self.password:
                # BUG FIX: was `self.passsword` (typo), which raised
                # AttributeError whenever hide_password=False.
                password = "***" if hide_password else self.password
                ret.append(f":{password}")
            ret.append("@")
        if self.host:
            if ":" in self.host:
                # IPv6 literals need brackets in URLs
                ret.append(f"[{self.host}]")
            else:
                ret.append(str(self.host))
        if self.port:
            ret.append(f":{self.port}")
        if self.database:
            ret.append(f"/{self.database}")
        if self.schema:
            ret.append(f".{self.schema}")
        if self.table:
            ret.append(f".{self.table}")
        return "".join(ret)

    def _format_simple(self):
        ret = [f"{self.dbconf}:"]
        if self.database:
            ret.append(f"{self.database}")
        if self.schema:
            ret.append(f".{self.schema}")
        if self.table:
            ret.append(f".{self.table}")
        return "".join(ret)

    def __repr__(self):
        return f"<{type(self).__name__} {str(self)}>"

    def _from_file(self, url):
        """
        Syntax:
            file://{path}
            file:{path}
        """
        protocol, path = url.split(":", maxsplit=1)
        if path.startswith("//"):
            path = path[2:]
        params = {"protocol": protocol, "path": path}
        self._from_dict(params)

    def _from_url(self, url):
        """
        Syntax:
            {protocol}://{user}:{password}@{host}:{port}/{database}.{table}
        """
        parsed = urlparse(url)
        params = {
            "protocol": parsed.scheme,
            "user": parsed.username,
            "password": parsed.password,
            "host": parsed.hostname,
            "port": parsed.port,
        }
        params.update(_parse_db_table(parsed.path.strip("/")))
        self._from_dict(params)

    def _from_simple(self, spec):
        """
        Syntax:
            {dbconf}:{database}.{table}
        """
        dbconf, db_table = spec.strip().split(":")
        params = {"dbconf": dbconf}
        params.update(_parse_db_table(db_table))
        self._from_dict(params)

    def _from_dict(self, params):
        """Validate params and assign every ATTRS attribute (missing -> None)."""
        unknown_params = set(params) - set(self.ATTRS)
        if unknown_params:
            raise ValueError(f"Unknown params {unknown_params}")
        if not params.get("protocol") and not params.get("dbconf"):
            raise ValueError("protocol or dbconf is required")
        if params.get("protocol") == "file" and not params.get("path"):
            raise ValueError("path is required")
        for k in self.ATTRS:
            setattr(self, k, params.get(k))

    def to_dict(self):
        """Return the non-empty attributes as a plain dict."""
        ret = {}
        for k in self.ATTRS:
            v = getattr(self, k, None)
            if v is not None and v != "":
                ret[k] = v
        return ret

    @property
    def is_local(self):
        """True when this location points at the local filesystem."""
        return self.dbconf == "file" or self.protocol == "file"
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _get_dumper_cls(dtype):
    """Pick the dumper class for a source type.

    Cassandra gets its dedicated dumper; every other type goes through the
    generic DBAPI dumper. Imports are deferred to keep optional drivers lazy.
    """
    # XXX: so ugly here
    if dtype == "cassandra":
        from recurvedata.pigeon.dumper.cass import CassandraDumper

        return CassandraDumper

    from recurvedata.pigeon.dumper.dbapi import DBAPIDumper

    return DBAPIDumper
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _get_connector(location):
    """Build a pigeon connector for the given Location.

    URL-form locations (with a protocol) are instantiated directly from the
    embedded connection parameters; short-form ones are resolved by their
    dbconf name.
    """
    if location.protocol:
        # only support dbapi
        _conn_cls = get_connector_class(location.protocol)
        connector = _conn_cls(
            host=location.host,
            port=location.port,
            user=location.user,
            # BUG FIX: was `location.passsword` (typo) -> AttributeError for
            # every URL-form location.
            password=location.password,
            database=location.database,
        )
    else:
        connector = get_connector(location.dbconf, database=location.database)
    return connector
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _dump(src, handler_factory):
    """Dump the source table through the matching dumper and handler factory."""
    connector = _get_connector(src)
    # qualify the table with its schema when one is present
    table = f"{src.schema}.{src.table}" if src.schema else src.table
    dumper_cls = _get_dumper_cls(src.protocol or src.dbconf)
    dumper = dumper_cls(
        connector,
        table=table,
        handler_factories=[handler_factory],
    )
    logger.info("Dump start".center(40, "="))
    dumper.execute()
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Maps a destination type (protocol or dbconf name) to the CSV loader class
# and the keyword-argument name under which that loader expects its connector.
_loader_config = {
    "hive": {
        "cls": CSVToHiveLoader,
        "connector": "hive_connector",
    },
    "redshift": {
        "cls": CSVToRedshiftLoader,
        "connector": "redshift_connector",
    },
    "mysql": {
        "cls": CSVToMySQLLoader,
        "connector": "connector",
    },
}
# TiDB speaks the MySQL protocol, so it reuses the MySQL loader configuration.
_loader_config["tidb"] = _loader_config["mysql"]
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _load(dst, filename, mode, merge_keys=()):
    """Load a staged CSV file into the destination table.

    :param dst: destination Location
    :param filename: path of the staged file to load
    :param mode: load mode name, compared case-insensitively
    :param merge_keys: primary keys used when mode is merge
    """
    connector = _get_connector(dst)

    cfg = _loader_config[dst.protocol or dst.dbconf]
    loader_cls = cfg["cls"]
    connector_kwarg = cfg["connector"]
    kwargs = {
        "database": dst.database,
        "table": dst.table,
        "filename": filename,
        connector_kwarg: connector,
    }
    normalized_mode = mode.upper()
    if normalized_mode == LOAD_MERGE:
        logger.info(f"Primary keys: {merge_keys} in {dst}")
        kwargs["mode"] = LOAD_MERGE
        kwargs["primary_keys"] = ensure_list(merge_keys)
    elif normalized_mode == LOAD_APPEND:
        kwargs["mode"] = LOAD_APPEND
    loader = loader_cls(**kwargs)
    logger.info("Load start".center(40, "="))
    loader.execute()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _get_stage_filename(src, dst):
|
|
254
|
+
if src.is_local:
|
|
255
|
+
return src.path
|
|
256
|
+
if dst.is_local:
|
|
257
|
+
if not os.path.isabs(dst.path):
|
|
258
|
+
return os.path.abspath(dst.path)
|
|
259
|
+
return dst.path
|
|
260
|
+
tmpdir = f"{src.protocol or src.dbconf}_to_{dst.protocol or dst.dbconf}"
|
|
261
|
+
new_stagefile = fs.new_stagefile_factory(tmpdir)
|
|
262
|
+
return new_stagefile(slugify(f"{src}_to_{dst}") + ".txt")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def sync(src, dst, mode, merge_keys=()):
    """Sync a single table from src to dst via a staged CSV file.

    :param src: source Location or spec string
    :param dst: destination Location or spec string
    :param mode: load mode (case-insensitive), e.g. append or merge
    :param merge_keys: primary keys used when mode is merge
    :raises NotImplementedError: URL-form (non-file) destinations are not
        supported yet
    """
    if not isinstance(src, Location):
        src = Location(src)
    if not isinstance(dst, Location):
        dst = Location(dst)

    if dst.protocol and dst.protocol != "file":
        raise NotImplementedError("暂不支持URL形式的目标")

    filename = _get_stage_filename(src, dst)
    # BUG FIX: the f-string never interpolated the staged filename, so the
    # log line carried no useful information.
    logger.info(f"Dump to file: {filename}")

    if not src.is_local:
        # Hive/Impala destinations need Hive-flavored CSV escaping.
        for_hive = (dst.protocol or dst.dbconf) in ["impala", "hive"]
        handler_factory = create_csv_file_handler_factory(filename=filename, hive=for_hive)
        _dump(src, handler_factory)
    if not dst.is_local:
        _load(dst, filename, mode=mode, merge_keys=merge_keys)
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import struct
|
|
2
|
+
import zlib
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
# use ujson for better performance
|
|
7
|
+
import ujson as json
|
|
8
|
+
except ImportError:
|
|
9
|
+
import json
|
|
10
|
+
|
|
11
|
+
from recurvedata.pigeon import const
|
|
12
|
+
from recurvedata.pigeon.schema import Schema
|
|
13
|
+
|
|
14
|
+
_Row = Union[Tuple, Dict[str, Any]]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Transformer:
|
|
18
|
+
_input_schema: Optional[Schema] = None
|
|
19
|
+
_use_input_schema_as_output: bool = False
|
|
20
|
+
|
|
21
|
+
@property
|
|
22
|
+
def input_schema(self) -> Optional[Schema]:
|
|
23
|
+
"""Returns the schema of input data"""
|
|
24
|
+
return self._input_schema
|
|
25
|
+
|
|
26
|
+
@input_schema.setter
|
|
27
|
+
def input_schema(self, schema: Schema):
|
|
28
|
+
"""Should be called by the handler"""
|
|
29
|
+
assert isinstance(schema, Schema)
|
|
30
|
+
self._input_schema = schema
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
def output_schema(self) -> Optional[Schema]:
|
|
34
|
+
"""Subclasses that change the rows schema should provide the output schema.
|
|
35
|
+
|
|
36
|
+
These operations will change the output schema:
|
|
37
|
+
- Add or remove fields
|
|
38
|
+
- Change the name of fields
|
|
39
|
+
- Change the type of fields
|
|
40
|
+
|
|
41
|
+
An example of valid schema:
|
|
42
|
+
|
|
43
|
+
from recurvedata.pigeon.schema import Schema, Field, types
|
|
44
|
+
|
|
45
|
+
Schema([
|
|
46
|
+
Field(name='id', type=types.INT32),
|
|
47
|
+
Field(name='name', type=types.STRING, size=64),
|
|
48
|
+
Field(name='snapshot_time', type=types.DATETIME, comment='snapshot_time in UTC'),
|
|
49
|
+
Field(name='is_active', type=types.BOOLEAN)
|
|
50
|
+
])
|
|
51
|
+
|
|
52
|
+
Allowed types:
|
|
53
|
+
|
|
54
|
+
- INT8 = 'INT8' # 1-byte (8-bit) signed integers
|
|
55
|
+
- INT16 = 'INT16' # 2-byte (16-bit) signed integers
|
|
56
|
+
- INT32 = 'INT32' # 4-byte (32-bit) signed integers
|
|
57
|
+
- INT64 = 'INT64' # 8-byte (64-bit) signed integers
|
|
58
|
+
- FLOAT32 = 'FLOAT32' # 4-byte (32-bit) single-precision floating
|
|
59
|
+
- FLOAT64 = 'FLOAT64' # 8-byte (64-bit) double-precision floating
|
|
60
|
+
- BOOLEAN = 'BOOLEAN'
|
|
61
|
+
- DATETIME = 'DATETIME'
|
|
62
|
+
- DATE = 'DATE'
|
|
63
|
+
- STRING = 'STRING'
|
|
64
|
+
"""
|
|
65
|
+
if self._use_input_schema_as_output:
|
|
66
|
+
return self._input_schema
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
def transform(self, row: _Row, *args, **kwargs) -> Union[_Row, List[_Row]]:
|
|
70
|
+
"""This is the method called by Handler.
|
|
71
|
+
|
|
72
|
+
It internally calls `transform_impl` to do the real transform logic.
|
|
73
|
+
Subclasses should implement `transform_impl` but not this method.
|
|
74
|
+
|
|
75
|
+
:param row: a Row (namedtuple) object contains a row record fetched from database
|
|
76
|
+
:returns: returns one (tuple) or multiple (list of tuple) rows
|
|
77
|
+
"""
|
|
78
|
+
return self.transform_impl(row, *args, **kwargs)
|
|
79
|
+
|
|
80
|
+
def transform_impl(self, row: _Row, *args, **kwargs) -> Union[_Row, List[_Row]]:
|
|
81
|
+
return row
|
|
82
|
+
|
|
83
|
+
@staticmethod
|
|
84
|
+
def convert_json_to_hive_map(data: Union[str, bytes]) -> str:
|
|
85
|
+
if not data:
|
|
86
|
+
return const.HIVE_NULL
|
|
87
|
+
|
|
88
|
+
d = json.loads(data)
|
|
89
|
+
items = []
|
|
90
|
+
for key, value in d.items():
|
|
91
|
+
key = str(key).strip()
|
|
92
|
+
value = str(value).strip()
|
|
93
|
+
item = '{0}{1}{2}'.format(key, const.HIVE_MAP_KV_DELIMITER, value)
|
|
94
|
+
items.append(item)
|
|
95
|
+
return const.HIVE_MAP_ITEM_DELIMITER.join(items)
|
|
96
|
+
|
|
97
|
+
@staticmethod
|
|
98
|
+
def convert_json_to_hive_array(data: Union[str, bytes]) -> str:
|
|
99
|
+
if not data:
|
|
100
|
+
return const.HIVE_NULL
|
|
101
|
+
|
|
102
|
+
items = json.loads(data)
|
|
103
|
+
return const.HIVE_ARRAY_DELIMITER.join(items)
|
|
104
|
+
|
|
105
|
+
@staticmethod
|
|
106
|
+
def mysql_uncompress(value: bytes, return_str: bool = False) -> Union[bytes, str]:
|
|
107
|
+
"""A Python implementation of UNCOMPRESS function of MySQL.
|
|
108
|
+
|
|
109
|
+
Used to decompress result of COMPRESS function.
|
|
110
|
+
|
|
111
|
+
https://dev.mysql.com/doc/refman/5.7/en/encryption-functions.html#function_compress
|
|
112
|
+
|
|
113
|
+
:param value: the compressed data in bytes
|
|
114
|
+
:param return_str: the return value should be unicode
|
|
115
|
+
:type return_str: bool
|
|
116
|
+
:rtype: bytes | str
|
|
117
|
+
"""
|
|
118
|
+
|
|
119
|
+
# Empty strings are stored as empty strings.
|
|
120
|
+
# Nonempty strings are stored as a 4-byte length of the uncompressed string
|
|
121
|
+
if not value or len(value) < 4:
|
|
122
|
+
return value
|
|
123
|
+
|
|
124
|
+
rv = zlib.decompress(value[4:])
|
|
125
|
+
|
|
126
|
+
if return_str:
|
|
127
|
+
rv = rv.decode()
|
|
128
|
+
return rv
|
|
129
|
+
|
|
130
|
+
@staticmethod
|
|
131
|
+
def mysql_compress(value: Optional[str]) -> Optional[bytes]:
|
|
132
|
+
if value is None:
|
|
133
|
+
return None
|
|
134
|
+
if value == '':
|
|
135
|
+
return b''
|
|
136
|
+
size = struct.pack('I', len(value))
|
|
137
|
+
data = zlib.compress(value.encode())
|
|
138
|
+
return size + data
|
|
139
|
+
|
|
140
|
+
@staticmethod
def json_loads(*args, **kwargs) -> Any:
    """Deserialize JSON — thin passthrough to ``json.loads``.

    Exposed as a static method presumably so subclasses can swap the
    JSON decoder used by transforms — verify against callers.
    """
    return json.loads(*args, **kwargs)
|
|
143
|
+
|
|
144
|
+
@staticmethod
def json_dumps(*args, **kwargs) -> str:
    """Serialize to JSON — thin passthrough to ``json.dumps``.

    Exposed as a static method presumably so subclasses can swap the
    JSON encoder used by transforms — verify against callers.
    """
    return json.dumps(*args, **kwargs)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Dict, List, Set, Tuple, TypeVar, Union
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
import cytoolz as toolz
|
|
9
|
+
|
|
10
|
+
# Module-level logger for this utils module.
logger = logging.getLogger(__name__)

# Generic element type, used by ensure_list() below.
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def safe_int(v: Union[str, int, float], default: int = 0) -> int:
    """Coerce *v* to ``int``, returning *default* on any conversion failure."""
    try:
        result = int(v)
    except Exception:
        # int() may raise ValueError/TypeError; any failure means "use default"
        result = default
    return result
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def trim_prefix(s: str, sub: str) -> str:
    """Return *s* with a leading *sub* removed; unchanged if absent."""
    return s[len(sub):] if s.startswith(sub) else s
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def trim_suffix(s: str, sub: str) -> str:
    """Return *s* with a trailing *sub* removed; unchanged if absent.

    Fix: guard against ``sub == ""`` — the previous ``s[:-len(sub)]``
    became ``s[:0]`` and wrongly returned ``""`` for every input.
    """
    if not sub or not s.endswith(sub):
        return s
    return s[: -len(sub)]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LoggingMixin(object):
    """Mixin exposing a lazily created, per-class ``logger`` property."""

    @property
    def logger(self) -> logging.Logger:
        # Create the logger on first access and cache it on the instance;
        # the name is "<module>.<ClassName>" of the concrete subclass.
        if not hasattr(self, "_logger"):
            qualified = "{0}.{1}".format(self.__class__.__module__, self.__class__.__name__)
            self._logger = logging.root.getChild(qualified)
        return self._logger
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def init_logging(
    level_name="info",
    fmt="%(asctime)s - %(levelname)s - %(name)s - %(filename)s:%(lineno)d - [%(process)d:%(threadName)s] - %(message)s",
    silent_cassandra=True,
):
    """Configure root logging with the given level name and format.

    Unknown level names fall back to INFO. When *silent_cassandra* is
    true, the chatty cassandra driver logger is raised to WARNING.
    """
    name_to_level = {
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "debug": logging.DEBUG,
    }
    logging.basicConfig(level=name_to_level.get(level_name, logging.INFO), format=fmt)

    if silent_cassandra:
        # cassandra is too noisy
        logging.getLogger("cassandra.cluster").setLevel(logging.WARNING)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def ensure_list(v: Union[T, Tuple[T], List[T], Set[T]]) -> List[T]:
    """Return *v* as a list: convert tuple/set/list, wrap anything else."""
    return list(v) if isinstance(v, (tuple, set, list)) else [v]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def ensure_str_list(
    v: Union[str, List[str], Tuple[str, ...], Set[str], None], sep: str = ",", strip: bool = True
) -> List[str]:
    """Normalize *v* into a list of strings.

    ``None`` and ``""`` yield ``[]``; a string is split on *sep*
    (optionally stripping each part); tuple/set/list are converted as-is.

    Fix: the annotation previously claimed ``v: str`` even though the
    body explicitly handles ``None`` and tuple/set/list inputs.

    :raises TypeError: for any other input type.
    """
    if v is None:
        return []

    if isinstance(v, str):
        if not v:
            return []
        parts = v.split(sep)
        return [p.strip() for p in parts] if strip else parts

    if isinstance(v, (tuple, set, list)):
        return list(v)
    raise TypeError(f'unsupported type "{type(v)}"')
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def ensure_query_list(v: Union[str, List[str]]) -> List[str]:
    """Split a semicolon-separated query string into non-empty statements.

    A list passes through unchanged; falsy input yields ``[]``.
    """
    if not v:
        return []
    if isinstance(v, list):
        return v
    stripped = (part.strip() for part in v.split(";"))
    return [stmt for stmt in stripped if stmt]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def extract_dict(d: Dict, keys: List) -> Dict:
    """Return the subset of *d* whose keys appear in *keys* (order of *d* kept)."""
    subset = {}
    for key, value in d.items():
        if key in keys:
            subset[key] = value
    return subset
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@contextmanager
def silent(*_excs, excs=None):
    """Context manager that swallows (and logs) the given exception types.

    Types may be passed positionally or via ``excs=``; with neither,
    every ``Exception`` is suppressed.
    """
    if excs:
        caught = excs
    elif _excs:
        caught = _excs
    else:
        caught = (Exception,)
    try:
        yield
    except caught as exc:
        logging.exception("silent %s", type(exc).__name__)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def replace_null_values(row: Union[List, Tuple, Dict], null_values: List, replacer=None):
    """Replace any value listed in *null_values* with *replacer*.

    The container type of *row* (list / tuple / dict) is preserved; for a
    dict only the values are rewritten.

    :raises TypeError: for any other container type.
    """

    def _swap(value):
        return replacer if value in null_values else value

    if isinstance(row, list):
        return [_swap(value) for value in row]
    if isinstance(row, tuple):
        return tuple(_swap(value) for value in row)
    if isinstance(row, dict):
        # equivalent to toolz.valmap(_swap, row): keys untouched, plain dict out
        return {key: _swap(value) for key, value in row.items()}
    raise TypeError(f"only list, tuple or dict type is supported, got {repr(type(row))}")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def md5hash(v: Union[str, bytes]) -> str:
    """Return the hex MD5 digest of *v*; non-str/bytes values are stringified."""
    if isinstance(v, str):
        payload = v.encode()
    elif isinstance(v, bytes):
        payload = v
    else:
        # tolerate arbitrary objects by hashing their str() form
        payload = str(v).encode()
    return hashlib.md5(payload).hexdigest()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def randomized_suffix() -> str:
    """Return a 7-char pseudo-random suffix: one timestamp digit + 6 uuid hex chars.

    NOTE(review): ``[-1]`` keeps only the final digit of the timestamp —
    presumably ``[-6:]`` (or similar) was intended for more entropy; confirm
    with the original author before changing.
    """
    timestamp_digits = str(time.time()).replace(".", "")
    return timestamp_digits[-1] + uuid4().hex[:6]
|