recurvedata_lib-0.1.487-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/dbt/service.py
ADDED
@@ -0,0 +1,451 @@
import contextlib
import io
import logging
import os
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING

from recurvedata.core.templating import Renderer
from recurvedata.core.tracing import Tracing
from recurvedata.dbt.client import DbtClient
from recurvedata.dbt.consts import OVERWRITE_DIRECTORIES, OVERWRITE_FILES, DbtPath
from recurvedata.dbt.cosmos_utils import extract_graph
from recurvedata.dbt.error_codes import ERR
from recurvedata.dbt.schemas import CompileResult, DbtGraph, PreviewResult, RunModelResult
from recurvedata.dbt.utils import change_directory, format_var, run_deps_if_necessary
from recurvedata.exceptions import WrapRecurveException, wrap_error
from recurvedata.utils.compression import tar_gzip_uncompress
from recurvedata.utils.date_time import now
from recurvedata.utils.files import calculate_md5
from recurvedata.utils.helpers import get_env_id

tracer = Tracing()

if TYPE_CHECKING:
    from recurvedata.connectors.service import DataSourceWrapper

try:
    import yaml
    from dbt.cli.main import dbtRunnerResult
    from dbt.contracts.results import RunResult
except ImportError:
    dbtRunnerResult = RunResult = None

logger = logging.getLogger(__name__)


@dataclass
class DbtService:
    project_id: int
    project_connection_name: str = None
    force_regenerate_dir: bool = False
    ds: "DataSourceWrapper" = None
    need_fetch_variable: bool = False  # when compiling/previewing, also fetch variables
    variables: dict = None  # used in compile/preview

    @cached_property
    def client(self):
        return DbtClient()

    @cached_property
    def path(self):
        return DbtPath(project_id=self.project_id, env_id=get_env_id())

    @wrap_error(ERR.DP_FETCH_PROJECT_FAILED)
    @tracer.create_span()
    def fetch_project(self):
        def _is_the_same_file(file1: str, file2: str) -> bool:
            def __read_file(filename: str) -> bytes:
                with open(filename, "rb") as f:
                    return f.read()

            if os.path.exists(file1) != os.path.exists(file2):
                return False
            if not os.path.exists(file1):  # both missing: treat as identical
                return True

            return __read_file(file1) == __read_file(file2)

        def _overwrite_from_gzip_dir(src_dir: str, dst_dir: str):
            for sub_dir in OVERWRITE_DIRECTORIES:
                src_sub_dir = os.path.join(src_dir, sub_dir)
                for root, dirs, files in os.walk(src_sub_dir):
                    dst_root = os.path.join(dst_dir, sub_dir, os.path.relpath(root, src_sub_dir))
                    os.makedirs(dst_root, exist_ok=True)
                    for tmp_file in files:
                        src_file = os.path.join(root, tmp_file)
                        dst_file = os.path.join(dst_root, tmp_file)
                        if _is_the_same_file(src_file, dst_file):
                            logger.info(f"skip {dst_file}")
                            continue
                        shutil.copy2(src_file, dst_file)
                    for dst_file_dir in os.listdir(dst_root):
                        if dst_file_dir not in dirs + files:
                            dst_file_dir = os.path.join(dst_root, dst_file_dir)
                            logger.info(f"remove {dst_file_dir}")
                            if os.path.isdir(dst_file_dir):
                                shutil.rmtree(dst_file_dir, ignore_errors=True)
                            else:
                                try:
                                    os.remove(dst_file_dir)
                                except FileNotFoundError:
                                    pass
            for tmp_file in OVERWRITE_FILES:
                src_file = os.path.join(src_dir, tmp_file)
                dst_file = os.path.join(dst_dir, tmp_file)
                if _is_the_same_file(src_file, dst_file):
                    logger.info(f"skip {dst_file}")
                    continue
                shutil.copy2(src_file, dst_file)

        logger.info(f"fetch dbt project: {self.project_id} -> {self.path.project_dir}")

        os.makedirs(self.path.base_path, exist_ok=True)
        logger.info(f"fetch dbt project: preparing 1 - base_path: {self.path.base_path}")

        gzip_temp_dir: str = tempfile.mkdtemp(dir=self.path.base_path, prefix=f"_tmp_{self.path.simple_project_dir}")
        gzip_file = f"{gzip_temp_dir}.tar.gz"
        logger.info(
            f"fetch dbt project: preparing 2 - simple_project_dir: {self.path.simple_project_dir} - gzip_file: {gzip_file}"
        )

        local_md5 = ""
        if os.path.exists(self.path.project_dir):
            if os.path.isfile(self.path.project_gzip_file):
                # the project exists -> calculate the MD5 of project_gzip_file = {self.project_dir}.tar.gz
                local_md5 = calculate_md5(self.path.project_gzip_file)
                logger.info(
                    f"fetch dbt project: preparing 3 - current project_gzip_file: {self.path.project_gzip_file}, local_md5: {local_md5}"
                )
            else:
                logger.info(
                    f"fetch dbt project: preparing 3 - current project_gzip_file: {self.path.project_gzip_file} not exists"
                )
        else:
            logger.info(f"fetch dbt project: preparing 3 - current project_dir: {self.path.project_dir} not exists")

        fetch_gzip_result = self.client.fetch_project_gzip(self.project_id, gzip_file, client_md5=local_md5)
        logger.info(f"fetch dbt project: fetching - fetch_gzip_result: {fetch_gzip_result}")

        if not fetch_gzip_result:
            logger.info("fetch dbt project: md5 is the same, skip fetch project")
            # delete the unused empty temp dir
            shutil.rmtree(gzip_temp_dir, ignore_errors=True)
            return

        tar_gzip_uncompress(gzip_file, gzip_temp_dir)

        logger.info(f"uncompress {gzip_file} to {gzip_temp_dir} success")

        os.makedirs(self.path.project_dir, exist_ok=True)
        _overwrite_from_gzip_dir(gzip_temp_dir, self.path.project_dir)
        shutil.move(gzip_file, self.path.project_gzip_file)
        shutil.rmtree(gzip_temp_dir, ignore_errors=True)

    @wrap_error(ERR.DP_FETCH_CONNECTION_FAILED)
    def fetch_connection(self):
        from recurvedata.connectors.service import get_datasource_by_config

        logger.info("start fetch connection")
        con_item = self.client.get_connection(self.project_id)
        self.ds = get_datasource_by_config(
            con_item.type, config=con_item.data, database=con_item.database, schema=con_item.database_schema
        )
        self.ds.recurve_connector.set_env_when_get_dbt_connection()

    @wrap_error(ERR.DP_FETCH_CONNECTION_FAILED)
    def fetch_connection_and_variables(self):
        from recurvedata.connectors.service import get_datasource_by_config

        logger.info("start fetch connection and variables")
        item = self.client.get_connection_and_variables(self.project_id)
        con_item = item.connection
        logger.info("after fetch connection and variables")
        self.ds = get_datasource_by_config(
            con_item.type, config=con_item.data, database=con_item.database, schema=con_item.database_schema
        )
        os.environ["DBT_USER"] = self.ds.user or ""
        os.environ["DBT_PASSWORD"] = self.ds.password or ""
        logger.info("start process variables")
        self.variables = self.prepare_variables(item.variables)
        logger.info("after process variables")

    def prepare_variables(self, variables: dict | None) -> dict:
        from recurvedata.executors.executor import Executor

        execution_date, schedule_interval = now(), "@daily"
        processed_variables = Executor.process_variables(variables or {}, {}, execution_date, schedule_interval)
        result_variables = Renderer().init_context(execution_date, schedule_interval)
        result_variables.update(processed_variables)
        return result_variables

    @tracer.create_span()
    def compile(self, model_name: str = None, inline_sql: str = None, validate_sql: bool = False) -> CompileResult:
        logger.info(f"prepare to compile: model_name: {model_name}, inline_sql: {inline_sql}")
        self.prepare()
        compiled_sql = self._run_compile(model_name, inline_sql)
        logger.info(f"compiled_sql is: {compiled_sql}")
        if validate_sql:
            self._run_preview(compiled_sql, limit=0)
        return CompileResult(compiled_sql=compiled_sql)

    def should_fetch_project(self) -> bool:
        if self.force_regenerate_dir or not os.path.exists(self.path.project_dir):
            return True
        remote_md5 = self.client.fetch_project_gzip_md5(self.project_id).md5
        local_md5 = calculate_md5(self.path.project_gzip_file)
        if remote_md5 == local_md5:
            logger.info("md5 is the same, skip fetch project")
            return False
        logger.info(f"remote md5 {remote_md5} vs local md5 {local_md5}")
        return True

    @tracer.create_span()
    def prepare(self):
        self.fetch_project()

        if self.need_fetch_variable:
            try:
                self.fetch_connection_and_variables()
            except Exception:  # backward compatible
                logger.exception("fetch_connection_and_variables fail")
                self.fetch_connection()
        else:
            self.fetch_connection()

        self.run_dependency()

    @wrap_error(ERR.DEPS_FAILED)
    @tracer.create_span()
    def run_dependency(self):
        run_deps_if_necessary(self.path.project_dir)

    @wrap_error(ERR.MODEL_COMPILE_FAILED)
    @tracer.create_span()
    def _run_compile(self, model_name: str = None, inline_sql: str = None) -> str:
        if model_name:
            cmds = ["compile", "--select", model_name]
        elif inline_sql:
            cmds = ["compile", "-d", "--inline", inline_sql]
        else:
            raise ValueError("model_name or inline_sql must be specified")

        if self.variables:
            dbt_vars = format_var(self, self.variables)
            if dbt_vars:  # format_var returns None when nothing overrides the project defaults
                cmds += ["--vars", dbt_vars]

        result, _ = self._run_dbt_cmds(cmds)
        if result.success:
            compiled_code = result.result.results[0].node.compiled_code
            return compiled_code.strip()

    def _run_dbt_cmds(self, cmds: list, raise_when_failed: bool = True) -> tuple["dbtRunnerResult", str]:
        from dbt.cli.main import dbtRunner

        logger.info(f"prepare run dbt cmds: {cmds}")
        dbt = dbtRunner()

        with change_directory(self.path.project_dir):
            log_buffer = io.StringIO()
            # Redirect stdout and stderr to the buffer
            with contextlib.redirect_stdout(log_buffer), contextlib.redirect_stderr(log_buffer):
                result: "dbtRunnerResult" = dbt.invoke(cmds)

        if raise_when_failed and not result.success:
            if isinstance(result.exception, BaseException):
                raise result.exception
            raise ValueError(str(result.exception))
        logger.info(f"run dbt cmds finished: {cmds}")
        return result, log_buffer.getvalue()
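    # dbt.invoke() above is dbt-core's programmatic invocation API (dbt >= 1.5):
    # it returns a dbtRunnerResult whose .success / .result / .exception fields
    # stand in for a process exit code, which is why stdout/stderr can be
    # captured into an in-memory buffer instead of being lost to the console.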
    @tracer.create_span()
    def preview(
        self,
        model_name: str = None,
        inline_sql: str = None,
        limit: int = 100,
        no_data: bool = False,
        is_compiled: bool = False,
    ) -> "PreviewResult":
        self.prepare()

        if is_compiled:
            compiled_sql = inline_sql
            logger.info(f"sql is compiled: {compiled_sql}")
        else:
            compiled_sql = self._run_compile(model_name, inline_sql)
            logger.info(f"compiled_sql is: {compiled_sql}")

        if no_data:
            limit = 0
        return self._run_preview(compiled_sql, limit)

    @wrap_error(ERR.MODEL_PREVIEW_FAILED)
    def _run_preview(self, compiled_sql: str, limit: int = 100) -> "PreviewResult":
        from recurvedata.executors.cli.connector import ConnectionService

        con_service = ConnectionService()
        try:
            return con_service.preview_sql(self.ds, compiled_sql, limit)
        except Exception as e:
            raise WrapRecurveException(ERR.MODEL_PREVIEW_FAILED, e, data={"compiled_sql": compiled_sql})

    def get_test_cases(self, model_name: str) -> list[str]:
        cmds = ["ls", "--resource-type", "test", "--select", model_name]
        result, _ = self._run_dbt_cmds(cmds)
        if result.success:
            return result.result

    @wrap_error(ERR.MODEL_RUN_FAILED)
    @tracer.create_span()
    def _run_model(
        self, model_name: str, dbt_vars: str = None, full_refresh: bool = False
    ) -> tuple[str, "dbtRunnerResult"]:
        run_model_result = self.run_model(model_name, dbt_vars, full_refresh)
        compiled_sql = run_model_result.compiled_sql
        res = run_model_result.result
        if not res.success:
            error_message = None
            if res.result and res.result.results:
                # Case 1: has results with error messages
                errors = [r.message for r in res.result.results if r.message]
                if errors:
                    error_message = "\n".join(errors)
            elif res.exception:
                # Case 2: has an exception
                error_message = str(res.exception)
            else:
                # Case 3: no results and no exception
                error_message = "Unknown error occurred during model run"

            raise WrapRecurveException(
                ERR.MODEL_RUN_FAILED,
                Exception(error_message),
                data={
                    "compiled_sql": compiled_sql,
                },
            )
        return compiled_sql, res

    @tracer.create_span()
    def run_model(
        self, model_name: str, dbt_vars: str = None, full_refresh: bool = False, include_run_log: bool = False
    ) -> RunModelResult:
        cmds = ["run", "--select", model_name]
        if dbt_vars:
            cmds.extend(["--vars", dbt_vars])
        if full_refresh:
            cmds.append("--full-refresh")

        if include_run_log:
            cmds.append("--debug")
            cmds.extend(["--log-format", "json"])

        res, run_log = self._run_dbt_cmds(cmds, raise_when_failed=False)

        compiled_code = self._extract_compiled_code(model_name, res)
        run_sql = self._get_model_run_sql(model_name)
        return RunModelResult(compiled_sql=compiled_code, result=res, run_sql=run_sql, run_log=run_log)

    def _extract_compiled_code(self, model_name: str, materialized_result: "dbtRunnerResult") -> str | None:
        # a partial compile will not have compiled_sql in materialized_result
        return self._extract_compiled_code_from_run_result(materialized_result) or self._get_model_compiled_sql(
            model_name
        )

    @classmethod
    def _extract_compiled_code_from_run_result(cls, materialized_result: "dbtRunnerResult") -> str | None:
        if not materialized_result.result:
            return None
        results = materialized_result.result.results
        if not results:  # nothing matched the selector
            return None
        run_result: "RunResult" = results[0]
        compiled_code = run_result.node.compiled_code
        if compiled_code:
            return compiled_code.strip()

        return None

    def _get_model_compiled_sql(self, model_name: str) -> str | None:
        compiled_sql_path = Path(self.path.get_model_compiled_sql_path(model_name))
        if compiled_sql_path.exists():
            return compiled_sql_path.read_text()

    def _get_model_run_sql(self, model_name: str) -> str | None:
        run_sql_path = Path(self.path.get_model_run_sql_path(model_name))
        if run_sql_path.exists():
            return run_sql_path.read_text()

    @tracer.create_span()
    def run_test(self, model_id: int, dbt_vars: str = None) -> "dbtRunnerResult":
        cmds = [
            "test",
            "--select",
            f"tag:model_{model_id}",
        ]
        if dbt_vars:
            cmds.extend(["--vars", dbt_vars])

        res, _ = self._run_dbt_cmds(cmds, raise_when_failed=False)
        return res

    def extract_model_graph(self, models: list[str] = None, model_cmd: str = None) -> DbtGraph:
        """
        Extract the models and model graph from model pipeline settings.
        :param models: the models selected in the drop-down list
        :param model_cmd: the command from the advanced mode
        """
        return extract_graph(self.path.project_dir, models, model_cmd)

    def extract_var_from_dbt_project(self) -> dict:
        with open(self.path.dbt_project_yml_path, "r") as file:
            dbt_project_dct = yaml.safe_load(file)
        return dbt_project_dct.get("vars", {})

    def read_model_sql(self, model_name: str) -> str | None:
        model_path = Path(self.path.get_model_sql_path(model_name))
        if not model_path.exists():
            return None
        return model_path.read_text()

    @tracer.create_span()
    def run_test_sample_data(self, dbt_test_result: "dbtRunnerResult") -> dict[str, PreviewResult]:
        # todo: use dbt store-failures
        from recurvedata.executors.cli.connector import ConnectionService

        if not dbt_test_result.result:
            return {}

        result: dict[str, PreviewResult] = {}

        con_service = ConnectionService()

        def _run_single_test_case_sample_data(unique_id: str, sql: str):
            try:
                data: PreviewResult = con_service.preview_sql(self.ds, sql, limit=100)
            except Exception as e:
                logger.exception(f"run single test case {unique_id} fail: {e}, sql: {sql}")
                return
            result[unique_id] = data

        unique_id_2_sql = {
            dbt_result.node.unique_id: dbt_result.node.compiled_code
            for dbt_result in dbt_test_result.result.results
            if dbt_result.node.compiled_code
            # todo: if there is no failure, skip fetching sample data
        }
        logger.debug(f"unique_id_2_sql: {unique_id_2_sql}")

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(_run_single_test_case_sample_data, unique_id, sql): unique_id
                for unique_id, sql in unique_id_2_sql.items()
            }

            for future in futures:
                future.result()

        return result
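A minimal usage sketch of the service above (illustrative values: the project id is hypothetical, and a reachable Recurve backend plus an installed dbt adapter are assumed):

    from recurvedata.dbt.service import DbtService

    # need_fetch_variable=True makes prepare() pull project variables alongside
    # the warehouse connection before compiling or previewing.
    service = DbtService(project_id=42, need_fetch_variable=True)

    # Compile a named model to SQL without executing it against the warehouse.
    compile_result = service.compile(model_name="my_model")
    print(compile_result.compiled_sql)

    # Or preview up to 10 rows of an ad-hoc query instead of a named model.
    preview_result = service.preview(inline_sql="select 1 as id", limit=10)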
recurvedata/dbt/utils.py
ADDED
@@ -0,0 +1,246 @@
import datetime
import json
import logging
import os
import re
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Any

from recurvedata.dbt.consts import (
    DbtFileNames,
    format_installed_packages_path,
    format_package_lock_path,
    format_packages_yml_path,
)
from recurvedata.utils.files import FileLock

try:
    import yaml
    from dbt.cli.main import dbtRunnerResult
    from dbt.contracts.results import RunExecutionResult, RunResultsArtifact
    from dbt.exceptions import DbtRuntimeError
except ImportError:
    dbtRunnerResult = None
    DbtRuntimeError = None
    RunExecutionResult = RunResultsArtifact = None

if TYPE_CHECKING:
    from recurvedata.dbt.service import DbtService

logger = logging.getLogger(__name__)


@contextmanager
def change_directory(new_dir):  # todo(chenjingmeng): use dbt api instead of cli
    """Context manager to change the current working directory temporarily."""
    original_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(original_dir)


def extract_project_name(project_yml: str) -> str:
    with open(project_yml, "r") as file:
        dbt_project = yaml.safe_load(file)
    project_name = dbt_project.get("name")
    return project_name


def run_dbt_cmds(project_dir: str, cmds: list) -> dbtRunnerResult:
    from dbt.cli.main import dbtRunner

    def _set_default_os_env():
        os.environ.setdefault("DBT_USER", "")
        os.environ.setdefault("DBT_PASSWORD", "")

    _set_default_os_env()
    dbt = dbtRunner()
    logger.info(f"prepare run dbt cmds: {cmds}")
    with change_directory(project_dir):
        result: dbtRunnerResult = dbt.invoke(cmds)
        return result


def dbt_runner_result_to_dict(result: dbtRunnerResult) -> dict:
    def _exception_to_dict(exception: DbtRuntimeError | BaseException | None) -> dict | None:
        if exception is None:
            return None
        if isinstance(exception, DbtRuntimeError):
            return exception.data()
        return {
            "type": type(exception).__name__,
            "message": str(exception),
        }

    def _result_to_dict(sub_result: RunExecutionResult | None) -> dict | None:
        if isinstance(sub_result, RunExecutionResult):
            res_dct = sub_result.to_dict(omit_none=False)
            return _format_cp_result_dct(res_dct)

    def _format_cp_result_dct(dbt_result_dct: dict) -> dict:
        if not dbt_result_dct.get("results"):
            return dbt_result_dct
        results: list[dict] = dbt_result_dct["results"]
        if results:
            for sub_result in results:
                node_dct = sub_result.get("node", {})
                sub_result.update(node_dct)  # on CP, DBTTestResultDetails needs node data like unique_id to validate
                # todo: better to adjust cp pydantic schema
        return dbt_result_dct

    return {
        "success": result.success,
        "exception": _exception_to_dict(result.exception),
        "result": _result_to_dict(result.result),
    }


class VariableJSONEncoder(json.JSONEncoder):
    def default(self, obj: Any):
        return self.format_var(obj)

    @classmethod
    def format_var(cls, value: Any):
        if value is None or isinstance(value, (int, bool, float)):
            return value
        elif isinstance(value, datetime.datetime):
            return value.isoformat()
        return str(value)


def format_var(service: "DbtService", variables: dict) -> str | None:
    default_var_dct: dict = service.extract_var_from_dbt_project()
    override_variables: dict = {
        k: v for (k, v) in variables.items() if k not in default_var_dct or v != default_var_dct[k]
    }
    if not override_variables:
        return
    vars_string = json.dumps(override_variables, cls=VariableJSONEncoder)
    return vars_string
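# Example: json.dumps passes native JSON scalars through, and VariableJSONEncoder
# serializes datetimes to ISO-8601 strings and stringifies everything else, so
# the --vars payload produced by format_var stays valid JSON (illustrative values):
#
#   >>> json.dumps({"run_date": datetime.datetime(2024, 1, 1, 12, 0), "retries": 3},
#   ...            cls=VariableJSONEncoder)
#   '{"run_date": "2024-01-01T12:00:00", "retries": 3}'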
def should_run_dependency(project_dir: str) -> bool:
    packages_yml = Path(format_packages_yml_path(project_dir))
    if not packages_yml.exists():
        return False
    if packages_yml.stat().st_size == 0:
        return False
    data: dict = read_yaml_file(str(packages_yml))
    if not data.get("packages"):
        return False
    package_lock = Path(format_package_lock_path(project_dir))
    if not package_lock.exists():
        return True
    packages_dir = Path(format_installed_packages_path(project_dir))
    if not packages_dir.exists():
        # maybe a concurrency issue caused the dbt_packages dir to go missing
        return True
    data = read_yaml_file(str(package_lock))
    pack_cnt = len(data.get("packages", []))
    if pack_cnt > len(os.listdir(str(packages_dir))):
        # a previous concurrency issue
        return True
    if packages_yml.stat().st_mtime > package_lock.stat().st_mtime:
        return True
    return False


def read_yaml_file(filename: str) -> dict:
    with open(filename, "r") as file:
        return yaml.safe_load(file)


def run_deps_if_necessary(project_dir: str):
    if not should_run_dependency(project_dir):
        logger.info(f"skip deps on {project_dir}")
        return

    lock = FileLock(lock_file_path=Path(project_dir).with_suffix(".deps_lock"))
    with lock:
        # re-check under the lock: another process may have installed deps already
        if not should_run_dependency(project_dir):
            logger.info(f"skip deps on {project_dir}")
            return
        res = run_dbt_cmds(
            project_dir,
            [
                "deps",
            ],
        )
        if not res.success:
            raise DbtRuntimeError(f"run deps failed on {project_dir}, {res.exception}")

        lock_file = Path(format_package_lock_path(project_dir))
        if lock_file.exists():
            lock_file.touch()  # refresh mtime, used in should_run_dependency
        logger.info(f"deps on {project_dir} finish")


def ensure_manifest_json_exists(project_dir: str):
    manifest_path = Path(project_dir) / "target" / DbtFileNames.MANIFEST_FILE.value
    if manifest_path.exists():
        return
    run_dbt_cmds(project_dir, ["parse"])


def _has_error_log(log: dict) -> bool:
    log_data = log.get("data")
    if log_data:
        return log_data.get("status") == "error" or "error" in log_data.get("base_msg", "")
    return False


def _create_success_log(sql: str) -> dict:
    return {
        "sql": sql,
        "status": "success",
    }


def _create_failed_log(sql: str) -> dict:
    return {
        "sql": sql,
        "status": "failed",
    }


def parse_run_model_log(run_log: str) -> list[dict]:
    if not run_log:
        return []

    run_sql_log = []
    sql = None
    for line in run_log.splitlines():
        try:
            log = json.loads(line)
            log_data = log.get("data")

            if log_data and "sql" in log_data:
                # no error was seen before the next sql event arrived, so the previous sql succeeded
                if sql is not None:
                    run_sql_log.append(_create_success_log(sql))

                sql = log_data["sql"]
                # remove /* ... */ comments using a regex
                cleaned = re.sub(r"/\*.*?\*/", "", sql, flags=re.DOTALL)
                # strip whitespace to get the bare SQL
                sql = cleaned.strip()

            elif sql:
                # failed if the status is error or base_msg contains the error keyword;
                # if the log has neither status nor base_msg, skip it
                if _has_error_log(log):
                    run_sql_log.append(_create_failed_log(sql))
                    sql = None

        except json.JSONDecodeError:
            logger.error("Skipping non-JSON line: %s", line)

    # mark the trailing sql as success
    if sql is not None:
        run_sql_log.append(_create_success_log(sql))

    return run_sql_log
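A quick illustration of parse_run_model_log above; the input mimics the shape of dbt's --log-format json lines that _has_error_log inspects, and the field values are made up:

    import json

    from recurvedata.dbt.utils import parse_run_model_log

    # Two SQL debug events, then an error event following the second statement.
    lines = [
        {"data": {"sql": "/* header */ create table t1 as select 1"}},
        {"data": {"sql": "insert into t1 select 2"}},
        {"data": {"status": "error", "base_msg": "error while executing"}},
    ]
    run_log = "\n".join(json.dumps(line) for line in lines)

    print(parse_run_model_log(run_log))
    # [{'sql': 'create table t1 as select 1', 'status': 'success'},
    #  {'sql': 'insert into t1 select 2', 'status': 'failed'}]

The first statement is marked success once the next SQL event arrives; the second is marked failed by the error event that follows it; the leading /* ... */ comment is stripped before the SQL is recorded.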