dvt_core-0.59.0a51-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2660 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +60 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.py +273 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.py +1252 -0
- dbt/compute/metadata/__init__.py +63 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/catalog_store.py +1036 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.py +1020 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/spark_logger.py +272 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +472 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.py +408 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +888 -0
- dbt/config/project_utils.py +48 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +564 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +419 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_comprehensive_registry.py +1254 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/dvt_starter_project/README.md +15 -0
- dbt/include/dvt_starter_project/__init__.py +3 -0
- dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/dvt_project.yml +39 -0
- dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
- dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
- dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +122 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2208 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +506 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +513 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +1002 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +509 -0
- dbt/task/dvt_run.py +282 -0
- dbt/task/dvt_seed.py +806 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +1022 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.py +804 -0
- dbt/task/migrate.py +714 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.py +1489 -0
- dbt/task/profile_serve.py +662 -0
- dbt/task/retract.py +441 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1647 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +814 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +271 -0
- dvt_cli/__init__.py +158 -0
- dvt_core-0.59.0a51.dist-info/METADATA +288 -0
- dvt_core-0.59.0a51.dist-info/RECORD +299 -0
- dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
- dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
- dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/task/dvt_seed.py
ADDED
@@ -0,0 +1,806 @@
"""
DVT Seed Task - Spark-powered seed loading with pattern-based transformations.

DVT v0.59.0a34: Hybrid approach - file databases use native, network use JDBC.
Uses DVT's virtualization infrastructure for consistent behavior across all targets.

Features:
1. Read CSV files with Spark
2. Match column values against patterns in value_transformations table
3. Apply Spark SQL transformations automatically
4. Write to target using best method:
   - File-based databases (DuckDB): Native connection via Pandas/Arrow (avoids locking issues)
   - Network databases (Postgres, etc.): Spark JDBC with DROP CASCADE support
5. Rich UI output with progress tracking

This ensures consistent behavior whether writing to DuckDB, Postgres, Databricks,
or any other supported adapter.
"""

import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import duckdb

from dbt.adapters.factory import get_adapter
from dbt.artifacts.schemas.results import NodeStatus, RunStatus
from dbt.artifacts.schemas.run import RunResult
from dbt.config.runtime import RuntimeConfig
from dbt.contracts.graph.nodes import SeedNode
from dbt.events.types import LogSeedResult, LogStartLine
from dbt.graph import ResourceTypeSelector
from dbt.node_types import NodeType
from dbt.task import group_lookup
from dbt.task.base import BaseRunner
from dbt.task.dvt_output import DVTMultiBarDisplay, HAS_RICH
from dbt.task.printer import print_run_end_messages
from dbt.task.run import RunTask
from dbt_common.events.base_types import EventLevel
from dbt_common.events.functions import fire_event
from dbt_common.exceptions import DbtInternalError


class ValueTransformationRegistry:
    """Registry for value transformation patterns from MDM."""

    _patterns: Optional[List[Tuple[str, str, str, int]]] = None

    @classmethod
    def get_patterns(cls) -> List[Tuple[str, str, str, int]]:
        """Load transformation patterns from MDM database."""
        if cls._patterns is not None:
            return cls._patterns

        # Try packaged registry first, then user MDM
        registry_paths = [
            Path(__file__).parent.parent / "include" / "data" / "adapters_registry.duckdb",
            Path.home() / ".dvt" / ".data" / "mdm.duckdb",
        ]

        cls._patterns = []
        for path in registry_paths:
            if path.exists():
                try:
                    conn = duckdb.connect(str(path), read_only=True)
                    # Check if table exists
                    tables = conn.execute(
                        "SELECT table_name FROM information_schema.tables WHERE table_name = 'value_transformations'"
                    ).fetchall()
                    if tables:
                        result = conn.execute("""
                            SELECT pattern, target_type, transform_expr, priority
                            FROM value_transformations
                            ORDER BY priority DESC
                        """).fetchall()
                        cls._patterns = [(r[0], r[1], r[2], r[3]) for r in result]
                    conn.close()
                    if cls._patterns:
                        break
                except Exception:
                    pass

        return cls._patterns

    @classmethod
    def match_pattern(cls, sample_values: List[str]) -> Optional[Tuple[str, str]]:
        """
        Match sample values against transformation patterns.

        Returns (target_type, transform_expr) if a pattern matches majority of values.
        """
        patterns = cls.get_patterns()
        if not patterns or not sample_values:
            return None

        # Filter out None/empty values
        valid_values = [v for v in sample_values if v is not None and str(v).strip()]
        if not valid_values:
            return None

        for pattern, target_type, transform_expr, _ in patterns:
            try:
                regex = re.compile(pattern, re.IGNORECASE)
                matches = sum(1 for v in valid_values if regex.match(str(v).strip()))
                # If 80%+ of values match, use this pattern
                if matches / len(valid_values) >= 0.8:
                    return (target_type, transform_expr)
            except re.error:
                continue

        return None


class DVTSeedRunner(BaseRunner):
    """DVT Seed Runner using Spark for ETL-grade seed loading.

    Uses unified Spark JDBC for ALL adapters (32+) - same infrastructure
    as dvt run federation path. No adapter-specific code paths.
    """

    def __init__(self, config: RuntimeConfig, adapter, node: SeedNode, node_index: int, num_nodes: int):
        super().__init__(config, adapter, node, node_index, num_nodes)
        self._spark = None
        self._spark_engine = None

    def describe_node(self) -> str:
        return f"seed file {self.get_node_representation()}"

    def before_execute(self) -> None:
        fire_event(
            LogStartLine(
                description=self.describe_node(),
                index=self.node_index,
                total=self.num_nodes,
                node_info=self.node.node_info,
            )
        )

    def after_execute(self, result) -> None:
        """Print result line after seed execution completes."""
        self.print_result_line(result)

    def get_node_representation(self) -> str:
        return f"{self.node.schema}.{self.node.alias}"

    def _get_spark_session(self):
        """Get or create Spark session using configured compute from computes.yml.

        Compute selection hierarchy (highest to lowest priority):
        1. CLI --target-compute flag (MUST exist if specified)
        2. computes.yml target_compute default (MUST exist if specified)
        3. Fallback to local Spark ONLY if no compute is configured

        DVT Rule: Invalid compute → Compilation Error (NO fallback)
        """
        if self._spark is not None:
            return self._spark

        from dbt.compute.engines.spark_engine import SparkEngine
        from dbt.compute.jdbc_utils import set_docker_mode
        from dbt.config.compute import ComputeRegistry

        # Load compute configuration from project's computes.yml
        project_dir = self.config.project_root
        registry = ComputeRegistry(project_dir=project_dir)

        # Check for CLI --target-compute override (highest priority)
        cli_target_compute = getattr(self.config.args, 'TARGET_COMPUTE', None)

        # Determine which compute to use
        target_compute = cli_target_compute or registry.target_compute

        if target_compute:
            # A compute is specified - it MUST exist (no fallback)
            compute_cluster = registry.get(target_compute)
            if not compute_cluster or not compute_cluster.config:
                available = [c.name for c in registry.list()]
                raise DbtInternalError(
                    f"Compute '{target_compute}' not found in computes.yml. "
                    f"Available computes: {available}"
                )

            # DVT v0.59.0a40: Enable Docker mode for standalone clusters with localhost master
            # This rewrites localhost -> host.docker.internal in JDBC URLs
            cluster_config = compute_cluster.config
            master = cluster_config.get("master", "")
            if master.startswith("spark://") and ("localhost" in master or "127.0.0.1" in master):
                set_docker_mode(True)
            else:
                set_docker_mode(False)

            # Use configured Spark settings (SparkEngine auto-detects platform from config)
            self._spark_engine = SparkEngine(
                spark_config=cluster_config,
                app_name="DVT-Seed",
            )
        else:
            # No compute specified anywhere - fallback to local Spark
            set_docker_mode(False)
            self._spark_engine = SparkEngine(
                mode="embedded",
                app_name="DVT-Seed",
            )

        # Connect to Spark (creates the session)
        self._spark_engine.connect()
        self._spark = self._spark_engine.spark
        return self._spark

    def _get_seed_path(self) -> Path:
        """Get the path to the seed CSV file."""
        # Seeds are in the project's seed-paths directory
        seed_paths = self.config.seed_paths
        for seed_path in seed_paths:
            full_path = Path(self.config.project_root) / seed_path / f"{self.node.name}.csv"
            if full_path.exists():
                return full_path

        # Try original_file_path
        if hasattr(self.node, 'original_file_path') and self.node.original_file_path:
            original = Path(self.config.project_root) / self.node.original_file_path
            if original.exists():
                return original

        raise FileNotFoundError(f"Seed file not found for {self.node.name}")

    def _detect_transformations(self, spark_df) -> Dict[str, Tuple[str, str]]:
        """
        Analyze DataFrame columns and detect needed transformations.

        Returns dict of column_name -> (target_type, transform_expr)
        """
        transformations = {}

        # Sample first 100 rows for pattern matching
        try:
            sample_rows = spark_df.limit(100).collect()
        except Exception:
            return transformations

        if not sample_rows:
            return transformations

        # Check each string column
        for col_name in spark_df.columns:
            # Get sample values for this column
            sample_values = []
            for row in sample_rows:
                try:
                    val = row[col_name]
                    if val is not None:
                        sample_values.append(str(val))
                except Exception:
                    pass

            # Try to match a transformation pattern
            match = ValueTransformationRegistry.match_pattern(sample_values)
            if match:
                transformations[col_name] = match

        return transformations

    def _apply_transformations(self, spark_df, transformations: Dict[str, Tuple[str, str]]):
        """Apply Spark SQL transformations to columns."""
        from pyspark.sql import functions as F

        if not transformations:
            return spark_df

        # Build select expressions
        select_exprs = []
        for col_name in spark_df.columns:
            if col_name in transformations:
                target_type, transform_expr = transformations[col_name]
                # Replace {col} placeholder with actual column reference
                expr = transform_expr.replace("{col}", f"`{col_name}`")
                select_exprs.append(F.expr(expr).alias(col_name))
            else:
                select_exprs.append(F.col(f"`{col_name}`"))

        return spark_df.select(*select_exprs)

    # File-based database types that need native connection (not JDBC)
    FILE_BASED_ADAPTERS = {'duckdb', 'sqlite'}

    def _write_to_file_database(self, spark_df, adapter) -> int:
        """Write DataFrame to file-based database using native connection.

        File-based databases (DuckDB, SQLite) don't handle JDBC writes well
        due to file locking issues. We use native connections via Pandas/Arrow.

        Args:
            spark_df: Spark DataFrame to write
            adapter: The dbt adapter (used for credentials and relation naming)

        Returns:
            Row count written
        """
        adapter_type = adapter.type()
        credentials = adapter.config.credentials

        # Convert Spark DataFrame to Pandas
        pdf = spark_df.toPandas()
        row_count = len(pdf)

        if adapter_type == 'duckdb':
            # Get DuckDB path from credentials
            db_path = getattr(credentials, 'path', None) or getattr(credentials, 'database', None)
            if not db_path:
                raise ValueError("DuckDB path not found in credentials")

            # Expand user path
            db_path = str(Path(db_path).expanduser())

            # Get schema and table name
            schema = self.node.schema or 'main'
            table_name = self.node.alias or self.node.name

            # Connect and write using DuckDB's native connection
            conn = duckdb.connect(db_path)
            try:
                # Create schema if needed
                conn.execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")
                # Write DataFrame (replace if exists)
                full_table = f"{schema}.{table_name}"
                conn.execute(f"DROP TABLE IF EXISTS {full_table}")
                # Register the Pandas DataFrame as a virtual table, then create from it
                conn.register('_dvt_seed_data', pdf)
                conn.execute(f"CREATE TABLE {full_table} AS SELECT * FROM _dvt_seed_data")
                conn.unregister('_dvt_seed_data')
                conn.commit()
            finally:
                conn.close()

        elif adapter_type == 'sqlite':
            import sqlite3

            db_path = getattr(credentials, 'database', None)
            if not db_path:
                raise ValueError("SQLite database path not found in credentials")

            db_path = str(Path(db_path).expanduser())
            table_name = self.node.alias or self.node.name

            conn = sqlite3.connect(db_path)
            try:
                pdf.to_sql(table_name, conn, if_exists='replace', index=False)
            finally:
                conn.close()

        return row_count

    def _drop_table_cascade(self, adapter, relation) -> None:
        """Drop a table/view with CASCADE to handle dependent objects.

        Network databases like PostgreSQL may have views depending on tables.
        We need to drop with CASCADE before overwriting.

        Args:
            adapter: The dbt adapter
            relation: The relation to drop
        """
        adapter_type = adapter.type()
        target_table = relation.render()

        # Get a raw connection from the adapter
        with adapter.connection_named('drop_cascade'):
            conn = adapter.connections.get_thread_connection()
            if conn and conn.handle:
                try:
                    # Use raw connection to execute DROP CASCADE
                    cursor = conn.handle.cursor()
                    # Try both TABLE and VIEW
                    for obj_type in ['TABLE', 'VIEW']:
                        try:
                            drop_sql = f"DROP {obj_type} IF EXISTS {target_table} CASCADE"
                            cursor.execute(drop_sql)
                        except Exception:
                            pass
                    conn.handle.commit()
                except Exception:
                    # Ignore errors - table may not exist
                    pass

    def _write_to_target(self, spark_df, adapter) -> int:
        """Write DataFrame to target database using the appropriate method.

        DVT v0.59.0a47: Hybrid approach:
        - File-based databases (DuckDB): Native connection via Pandas/Arrow
        - Network databases (Postgres, etc.): Spark JDBC with DROP CASCADE

        Args:
            spark_df: Spark DataFrame to write
            adapter: The dbt adapter (used for credentials and relation naming)

        Returns:
            Row count written
        """
        adapter_type = adapter.type()

        # Check if this is a file-based database
        if adapter_type in self.FILE_BASED_ADAPTERS:
            return self._write_to_file_database(spark_df, adapter)

        # Network database - use Spark JDBC
        from dbt.compute.jdbc_utils import build_jdbc_config

        credentials = adapter.config.credentials

        # Use adapter's Relation class for DROP CASCADE (needs proper quoting)
        relation = adapter.Relation.create_from(self.config, self.node)

        # Drop with CASCADE before write (handles dependent views)
        self._drop_table_cascade(adapter, relation)

        # Build JDBC config using DVT's unified infrastructure
        jdbc_url, jdbc_properties = build_jdbc_config(credentials)

        # DVT v0.59.0a48: Build table name WITHOUT adapter quoting for Spark JDBC
        # Spark JDBC expects unquoted table names (it handles quoting internally)
        # relation.render() returns quoted names like "public"."table" which breaks JDBC
        schema = self.node.schema or 'public'
        table_name = self.node.alias or self.node.name
        target_table = f"{schema}.{table_name}"

        # Count rows before write
        row_count = spark_df.count()

        # Write to target via Spark JDBC
        spark_df.write \
            .mode("overwrite") \
            .jdbc(jdbc_url, target_table, properties=jdbc_properties)

        return row_count

    # Adapters that require strict column name sanitization
    # These don't support special characters (spaces, semicolons, etc.) in column names
    # even with quoting. Most traditional databases (PostgreSQL, MySQL, etc.) support
    # quoted identifiers with special characters.
    STRICT_COLUMN_NAME_ADAPTERS = {
        'databricks',  # Delta tables reject special chars
        'spark',  # Hive metastore has restrictions
        'delta',  # Delta Lake format
        'iceberg',  # Apache Iceberg format
        'hudi',  # Apache Hudi format
    }

    def _sanitize_column_names(self, spark_df, adapter_type: str):
        """Sanitize column names based on adapter requirements from MDM.

        DVT v0.59.0a48: MDM-driven column name sanitization.
        Reads syntax_registry from MDM to determine if adapter requires strict identifiers.

        For strict adapters (determined by MDM syntax_registry):
        1. Strips leading/trailing whitespace
        2. Replaces spaces with underscores
        3. Removes problematic characters: ,;{}()\n\t=
        4. Ensures SQL-safe identifiers

        For permissive adapters:
        Preserves column names exactly as they appear in source.

        Args:
            spark_df: Spark DataFrame to sanitize
            adapter_type: The adapter type (e.g., 'postgres', 'databricks')

        Returns:
            Spark DataFrame with sanitized column names (if needed)
        """
        # Check MDM for adapter's strict identifier requirement
        requires_strict = self._check_strict_identifiers_from_mdm(adapter_type)

        if requires_strict:
            # Full sanitization for strict adapters
            new_columns = []
            for col in spark_df.columns:
                new_col = col.strip()
                new_col = new_col.replace(" ", "_")
                new_col = re.sub(r'[,;{}()\n\t=]', '', new_col)
                new_col = re.sub(r'[^\w]', '_', new_col)
                new_col = re.sub(r'_+', '_', new_col)
                new_col = new_col.strip('_')
                if not new_col:
                    new_col = f"col_{spark_df.columns.index(col)}"
                new_columns.append(new_col)
            return spark_df.toDF(*new_columns)
        else:
            # Permissive adapters: preserve column names exactly as-is
            return spark_df

    def _check_strict_identifiers_from_mdm(self, adapter_type: str) -> bool:
        """Check MDM syntax_registry for adapter's strict identifier requirement.

        DVT v0.59.0a48: Uses MDM to determine sanitization behavior.
        Adapters with backtick quoting (`) typically require strict identifiers
        because they're often data lake formats (Databricks, Spark, BigQuery).

        Args:
            adapter_type: The adapter type (e.g., 'postgres', 'databricks')

        Returns:
            True if adapter requires strict column name sanitization
        """
        from pathlib import Path

        # MDM locations
        registry_paths = [
            Path(__file__).parent.parent / "include" / "data" / "adapters_registry.duckdb",
            Path.home() / ".dvt" / ".data" / "mdm.duckdb",
        ]

        for path in registry_paths:
            if path.exists():
                try:
                    conn = duckdb.connect(str(path), read_only=True)
                    # Check if requires_strict_identifiers column exists
                    cols = conn.execute(
                        "SELECT column_name FROM information_schema.columns "
                        "WHERE table_name='syntax_registry' AND column_name='requires_strict_identifiers'"
                    ).fetchall()

                    if cols:
                        # Use explicit column if it exists
                        result = conn.execute(
                            "SELECT requires_strict_identifiers FROM syntax_registry "
                            f"WHERE adapter_name = '{adapter_type}'"
                        ).fetchone()
                        conn.close()
                        if result:
                            return bool(result[0])
                    else:
                        # Fallback: infer from quoting style
                        # Backtick (`) adapters typically require strict identifiers
                        result = conn.execute(
                            "SELECT quote_start FROM syntax_registry "
                            f"WHERE adapter_name = '{adapter_type}'"
                        ).fetchone()
                        conn.close()
                        if result and result[0] == '`':
                            return True
                    break
                except Exception:
                    pass

        # Default fallback for known strict adapters (if MDM lookup fails)
        return adapter_type in self.STRICT_COLUMN_NAME_ADAPTERS

    def execute(self, model, manifest):
        """Execute seed loading with Spark and pattern transformations."""
        start_time = time.time()

        try:
            # Get seed file path
            seed_path = self._get_seed_path()

            # Read CSV with Spark
            spark = self._get_spark_session()
            spark_df = spark.read \
                .option("header", "true") \
                .option("inferSchema", "false") \
                .csv(str(seed_path))

            # Get adapter for writing (needed for adapter-specific sanitization)
            adapter = get_adapter(self.config)
            adapter_type = adapter.type()

            # Sanitize column names based on adapter requirements
            # Strict adapters (Databricks) need full sanitization
            # Permissive adapters (PostgreSQL) only need whitespace trimming
            spark_df = self._sanitize_column_names(spark_df, adapter_type)

            # Detect and apply transformations
            transformations = self._detect_transformations(spark_df)
            spark_df = self._apply_transformations(spark_df, transformations)

            # Write to target using unified JDBC
            row_count = self._write_to_target(spark_df, adapter)

            execution_time = time.time() - start_time

            # Build result
            return RunResult(
                status=RunStatus.Success,
                timing=[],
                thread_id="",
                execution_time=execution_time,
                adapter_response={},
                message=f"INSERT {row_count}",
                failures=None,
                node=model,
            )

        except Exception as e:
            execution_time = time.time() - start_time
            return RunResult(
                status=RunStatus.Error,
                timing=[],
                thread_id="",
                execution_time=execution_time,
                adapter_response={},
                message=str(e),
                failures=1,
                node=model,
            )

    def compile(self, manifest):
        return self.node

    def print_result_line(self, result):
        group = group_lookup.get(self.node.unique_id)
        level = EventLevel.ERROR if result.status == NodeStatus.Error else EventLevel.INFO
        fire_event(
            LogSeedResult(
                status=result.status,
                result_message=result.message,
                index=self.node_index,
                total=self.num_nodes,
                execution_time=result.execution_time,
                schema=self.node.schema,
                relation=self.node.alias,
                node_info=self.node.node_info,
                group=group,
            ),
            level=level,
        )


class DVTSeedTask(RunTask):
    """DVT Seed Task - Enhanced seed loading with Spark and multi-bar Rich UI.

    DVT v0.59.0a37: Header displays BEFORE execution via before_run() hook.
    - File-based databases: NATIVE (DuckDB, SQLite)
    - Network databases: SPARK-JDBC (Postgres, Snowflake, etc.)
    """

    # File-based databases use native connections (not JDBC)
    FILE_BASED_ADAPTERS = {'duckdb', 'sqlite'}

    def __init__(self, args, config, manifest):
        super().__init__(args, config, manifest)
        self._display: Optional[DVTMultiBarDisplay] = None
        self._adapter_type = None
        self._use_rich_output = HAS_RICH and not getattr(args, 'QUIET', False)
        self._spark_logger = None
        self._header_displayed = False

    def raise_on_first_error(self) -> bool:
        return False

    def get_node_selector(self):
        if self.manifest is None or self.graph is None:
            raise DbtInternalError("manifest and graph must be set to perform node selection")
        return ResourceTypeSelector(
            graph=self.graph,
            manifest=self.manifest,
            previous_state=self.previous_state,
            resource_types=[NodeType.Seed],
        )

    def get_runner_type(self, _):
        return DVTSeedRunner

    def _get_execution_path(self) -> str:
        """Determine execution path based on adapter type."""
        if self._adapter_type is None:
            try:
                if self.config is None:
                    raise ValueError("config is None")
                credentials = self.config.credentials
                if credentials is None:
                    raise ValueError("credentials is None")
                self._adapter_type = getattr(credentials, 'type', None)
                if not self._adapter_type:
                    adapter = get_adapter(self.config)
                    self._adapter_type = adapter.type()
            except Exception:
                self._adapter_type = 'unknown'

        if self._adapter_type in self.FILE_BASED_ADAPTERS:
            return "NATIVE"
        return "SPARK-JDBC"

    def _get_target_info(self) -> str:
        """Get the current target name for display."""
        cli_target = getattr(self.config.args, 'TARGET', None)
        return cli_target or self.config.target_name or "default"

    def _get_compute_info(self) -> str:
        """Get the current compute engine for display."""
        exec_path = self._get_execution_path()
        if exec_path == "NATIVE":
            return "native"
        cli_compute = getattr(self.config.args, 'TARGET_COMPUTE', None)
        return cli_compute or "spark-local"

    def _start_spark_logger(self) -> None:
        """Start Spark output logging to target directory.

        Note: suppress_console=False so dbt's event output flows normally.
        Spark output is tee'd to the log file for later reference.
        """
        import os
        try:
            from dbt.compute.spark_logger import get_spark_logger
            target_dir = os.path.join(os.getcwd(), "target")
            compute_name = self._get_compute_info().replace("-", "_")
            self._spark_logger = get_spark_logger(target_dir, compute_name)
            # Don't suppress console - let dbt events flow normally
            self._spark_logger.start_session(suppress_console=False)
        except Exception:
            self._spark_logger = None

    def _stop_spark_logger(self) -> None:
        """Stop Spark output logging."""
        if self._spark_logger:
            try:
                self._spark_logger.end_session()
            except Exception:
                pass
            self._spark_logger = None

    def before_run(self, adapter, selected_uids):
        """
        Called BEFORE model execution starts - this is where we show the header.

        DVT v0.59.0a38: Fixed header timing to display BEFORE execution.
        The before_run() hook is called after 'Concurrency: X threads' message
        but before any models start executing.
        """
        result = super().before_run(adapter, selected_uids)

        # Show header BEFORE execution (only once)
        if self._use_rich_output and not self._header_displayed:
            try:
                exec_path = self._get_execution_path()
                self._display = DVTMultiBarDisplay(
                    title="DVT Seed",
                    operation="seed",
                    target=self._get_target_info(),
                    compute=self._get_compute_info(),
                )
                self._display.start_display()
                self._header_displayed = True

                # Start Spark logging AFTER header is shown
                if exec_path != "NATIVE":
                    self._start_spark_logger()
            except Exception:
                self._display = None

        return result

    def run(self):
        """Override run to add Rich UI summary AFTER execution.

        DVT v0.59.0a38: Header is now displayed in before_run() hook.
        This method only handles summary display after execution completes.
        """
        # Run the parent implementation
        # Header is shown in before_run(), Spark logger started there too
        results = super().run()

        # Stop Spark logging FIRST so we can print to console
        self._stop_spark_logger()

        # Show summary AFTER execution
        exec_path = self._get_execution_path()
        if results and self._display:
            try:
                # Update stats from results
                for result in results:
                    if result.node:
                        duration_ms = (result.execution_time or 0) * 1000

                        if result.status == RunStatus.Success:
                            status = "pass"
                            error_msg = None
                        elif result.status == RunStatus.Error:
                            status = "error"
                            error_msg = result.message
                        else:
                            status = "skip"
                            error_msg = None

                        self._display.update_model_complete(
                            unique_id=result.node.unique_id,
                            status=status,
                            duration_ms=duration_ms,
                            execution_path=exec_path,
                            error_message=error_msg,
                        )

                self._display.stop_display()
                self._display.print_summary()

            except Exception:
                pass

        return results

    def task_end_messages(self, results) -> None:
        # Rich UI handles the summary, so we skip the default messages
        if self._display:
            return
        super().task_end_messages(results)
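For readers skimming the diff, the matching rule in dvt_seed.py (a column is transformed only when at least 80% of its sampled non-empty values match a registered regex) can be exercised in isolation. The standalone sketch below mirrors the logic of ValueTransformationRegistry.match_pattern against a single hypothetical value_transformations row; the date pattern and transform expression here are invented for illustration and are not read from the packaged adapters_registry.duckdb.

import re
from typing import List, Optional, Tuple

# Hypothetical rows in the shape returned by ValueTransformationRegistry.get_patterns():
# (pattern, target_type, transform_expr, priority). The US-date row below is an
# illustrative assumption; the packaged MDM registry ships its own rows.
PATTERNS: List[Tuple[str, str, str, int]] = [
    (r"^\d{2}/\d{2}/\d{4}$", "date", "to_date({col}, 'MM/dd/yyyy')", 100),
]


def match_pattern(sample_values: List[str]) -> Optional[Tuple[str, str]]:
    """Return (target_type, transform_expr) when >= 80% of sampled values match a pattern."""
    valid = [v for v in sample_values if v is not None and str(v).strip()]
    if not valid:
        return None
    for pattern, target_type, transform_expr, _priority in PATTERNS:
        regex = re.compile(pattern, re.IGNORECASE)
        hits = sum(1 for v in valid if regex.match(str(v).strip()))
        if hits / len(valid) >= 0.8:
            return (target_type, transform_expr)
    return None


if __name__ == "__main__":
    # 4 of the 5 sampled values (80%) look like MM/DD/YYYY dates, so the column qualifies.
    sample = ["01/31/2024", "02/29/2024", "03/01/2024", "n/a", "04/15/2024"]
    print(match_pattern(sample))  # -> ('date', "to_date({col}, 'MM/dd/yyyy')")

In the packaged task, the returned transform expression then has its {col} placeholder replaced with the backtick-quoted column name and is applied through F.expr() in _apply_transformations before the seed is written to the target.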