dvt-core 0.52.2 (cp310-cp310-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2039 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +804 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +624 -0
- dbt/compute/federated_executor.py +837 -0
- dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-310-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
- dbt/compute/smart_selector.py +311 -0
- dbt/compute/strategies/__init__.py +54 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +364 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-310-darwin.so +0 -0
- dbt/config/compute.py +547 -0
- dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +346 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +247 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-310-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +454 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/generate.py +660 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +29 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +553 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/printer.py +175 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1306 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +759 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +268 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.52.2.dist-info/METADATA +286 -0
- dvt_core-0.52.2.dist-info/RECORD +275 -0
- dvt_core-0.52.2.dist-info/WHEEL +5 -0
- dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
- dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/federated_executor.py
@@ -0,0 +1,837 @@
+"""
+Federated Query Executor
+
+Orchestrates multi-source query execution using Spark compute engine.
+This is the core component that enables DVT's data virtualization capabilities.
+
+v0.3.0: Unified Spark architecture - all federation uses Spark JDBC.
+
+Execution flow:
+1. Identify all source tables/models from compiled SQL
+2. Load sources into Spark via JDBC (parallel reads)
+3. Execute model SQL in Spark
+4. Return results as PyArrow Table
+5. Materialize to target via JDBC or adapter
+
+Key principle: Adapters for I/O only, Spark for all compute.
+"""
+
+import re
+import sys
+import time
+from typing import Any, Dict, List, Optional, Set, Tuple
+from dataclasses import dataclass
+
+from dbt.adapters.base import BaseAdapter
+from dbt.compute.engines.spark_engine import SparkEngine, _clean_spark_error
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+from dbt_common.exceptions import DbtRuntimeError
+
+
+def _log(msg: str) -> None:
+    """
+    Log a message that appears immediately in console output.
+    DVT v0.4.7: Suppressed for clean output (logs go to spark_run_history).
+    """
+    # Suppressed for clean output - all debug info goes to spark_run_history file
+    pass
+
+
+def _get_dependent_views_pg(cursor, schema: str, table: str) -> List[Dict[str, str]]:
+    """
+    Query PostgreSQL for views that depend on a table.
+    DVT v0.5.5: Used to save views before DROP CASCADE, then restore after.
+
+    Returns list of dicts with: schema, name, definition
+    """
+    try:
+        # Query views that depend on this table using pg_depend
+        sql = """
+            SELECT DISTINCT
+                n.nspname as view_schema,
+                c.relname as view_name,
+                pg_get_viewdef(c.oid, true) as view_definition
+            FROM pg_depend d
+            JOIN pg_rewrite r ON r.oid = d.objid
+            JOIN pg_class c ON c.oid = r.ev_class
+            JOIN pg_namespace n ON n.oid = c.relnamespace
+            JOIN pg_class t ON t.oid = d.refobjid
+            JOIN pg_namespace tn ON tn.oid = t.relnamespace
+            WHERE t.relname = %s
+              AND tn.nspname = %s
+              AND c.relkind = 'v'
+              AND d.classid = 'pg_rewrite'::regclass
+              AND d.deptype = 'n'
+        """
+        cursor.execute(sql, (table, schema))
+        rows = cursor.fetchall()
+        return [
+            {'schema': row[0], 'name': row[1], 'definition': row[2]}
+            for row in rows
+        ]
+    except Exception:
+        # If query fails (different DB, permissions), return empty
+        return []
+
+
+def _recreate_views_pg(cursor, views: List[Dict[str, str]]) -> None:
+    """
+    Recreate views from their saved definitions.
+    DVT v0.5.5: Restores views after DROP CASCADE.
+    """
+    for view in views:
+        try:
+            create_sql = f'CREATE OR REPLACE VIEW "{view["schema"]}"."{view["name"]}" AS {view["definition"]}'
+            _log(f"[DVT] Recreating view: {view['schema']}.{view['name']}")
+            cursor.execute(create_sql)
+        except Exception as e:
+            _log(f"[DVT] Warning: Could not recreate view {view['name']}: {e}")
+
+
+@dataclass
+class SourceTableMetadata:
+    """Metadata about a source table needed for federated execution."""
+
+    source_id: str  # Unique ID from manifest
+    connection_name: str  # Which connection to read from
+    database: str  # Database name
+    schema: str  # Schema name
+    identifier: str  # Table name
+    qualified_name: str  # Fully qualified name for SQL
+
+
+@dataclass
+class FederatedExecutionResult:
+    """Result of federated query execution."""
+
+    spark_dataframe: Any  # Spark DataFrame with query results
+    source_tables: List[SourceTableMetadata]  # Sources used
+    compute_engine: str  # Engine used (spark)
+    execution_time_ms: float  # Execution time in milliseconds
+    rows_read: int  # Total rows read from sources
+    rows_returned: int  # Rows in result (may be None if not counted)
+    engine: Any  # SparkEngine instance (for session lifecycle management)
+
+
+class FederatedExecutor:
+    """
+    Orchestrates federated query execution across multiple data sources.
+
+    This executor:
+    1. Extracts data from multiple sources via adapters
+    2. Loads data into a compute engine
+    3. Executes the query
+    4. Returns results as Spark DataFrame
+    """
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        adapters: Dict[str, BaseAdapter],
+        default_compute_engine: str = "spark-local",
+    ):
+        """
+        Initialize federated executor.
+
+        v0.3.0: All federation uses Spark (local or cluster).
+
+        :param manifest: The dbt manifest with all nodes and sources
+        :param adapters: Dict of connection_name → adapter instances
+        :param default_compute_engine: Default compute engine ("spark-local" or "spark-cluster")
+        """
+        self.manifest = manifest
+        self.adapters = adapters
+        self.default_compute_engine = default_compute_engine
+
+    def execute(
+        self,
+        node: ManifestNode,
+        analysis_result: QueryAnalysisResult,
+        compute_engine_override: Optional[str] = None,
+        spark_config: Optional[Dict[str, str]] = None,
+        target_adapter_type: Optional[str] = None,
+        coerce_view_to_table: bool = False,
+    ) -> FederatedExecutionResult:
+        """
+        Execute a node using federated query processing.
+
+        :param node: The compiled node to execute
+        :param analysis_result: Query analysis result
+        :param compute_engine_override: Override compute engine choice
+        :param spark_config: Spark configuration (if using Spark)
+        :param target_adapter_type: Target adapter type for JDBC materialization
+        :param coerce_view_to_table: DVT v0.51.6 - If True, treat view as table (Rule 3.C.3)
+        :returns: FederatedExecutionResult with query results
+        :raises DbtRuntimeError: If execution fails
+        """
+        import time
+
+        _log(f"[DVT] Starting federated execution for node: {node.unique_id}")
+        start_time = time.time()
+
+        # Determine compute engine
+        compute_engine = (
+            compute_engine_override
+            or analysis_result.user_override
+            or self.default_compute_engine
+        )
+        _log(f"[DVT] Compute engine selected: {compute_engine}")
+
+        # DVT v0.5.0: Restrict Spark compute to table and incremental materializations only
+        # DVT v0.51.6: Allow view if coerce_view_to_table is True (Rule 3.C.3)
+        if hasattr(node, 'config') and hasattr(node.config, 'materialized'):
+            materialized = node.config.materialized
+
+            # DVT v0.51.6: Views are coerced to tables in cross-target scenarios
+            effective_materialized = 'table' if (materialized == 'view' and coerce_view_to_table) else materialized
+
+            # Only allow table and incremental
+            if effective_materialized not in ('table', 'incremental'):
+                raise DbtRuntimeError(
+                    f"Spark compute engine only supports 'table' and 'incremental' materializations. "
+                    f"Node '{node.unique_id}' uses '{materialized}'. "
+                    f"Please change the materialization to 'table' or 'incremental', or use adapter-native execution."
+                )
+
+            # For incremental, validate strategy is 'append' (only supported strategy)
+            if materialized == 'incremental':
+                incremental_strategy = getattr(node.config, 'incremental_strategy', 'append')
+                if incremental_strategy != 'append':
+                    raise DbtRuntimeError(
+                        f"Spark compute engine only supports 'append' incremental strategy. "
+                        f"Node '{node.unique_id}' uses '{incremental_strategy}'. "
+                        f"Supported strategies: append. "
+                        f"For merge/delete+insert/insert_overwrite, use adapter-native execution."
+                    )
+
+            if coerce_view_to_table and materialized == 'view':
+                _log(f"[DVT] Materialization: view → table (coerced for cross-target)")
+            else:
+                _log(f"[DVT] Materialization validated: {materialized}")
+
+        # Extract source table metadata
+        source_tables = self._extract_source_tables(analysis_result)
+        _log(f"[DVT] Found {len(source_tables)} source table(s)")
+
+        # v0.5.99: Look up named clusters from registry
+        from dbt.config.compute import ComputeRegistry
+        from dbt.compute.jdbc_utils import set_docker_mode
+        registry = ComputeRegistry()
+        cluster_config = None
+
+        # Check if it's a registered named cluster
+        if compute_engine not in ("spark-local", "spark", "spark-cluster"):
+            cluster = registry.get(compute_engine)
+            if cluster:
+                cluster_config = cluster.config
+                _log(f"[DVT] Found registered cluster '{compute_engine}' with platform: {cluster.detect_platform().value}")
+
+                # DVT v0.51.8: Enable Docker mode for standalone clusters with localhost master
+                # This rewrites localhost -> host.docker.internal in JDBC URLs
+                master = cluster_config.get("master", "")
+                if master.startswith("spark://") and ("localhost" in master or "127.0.0.1" in master):
+                    set_docker_mode(True)
+                    _log("[DVT] Docker mode enabled for JDBC URLs")
+                else:
+                    set_docker_mode(False)
+            else:
+                # Not in registry - check if it starts with "spark" for backwards compat
+                if not compute_engine.startswith("spark"):
+                    raise DbtRuntimeError(
+                        f"Invalid compute engine '{compute_engine}'. "
+                        f"Not found in compute registry. "
+                        f"Available: {[c.name for c in registry.list()]}"
+                    )
+        else:
+            set_docker_mode(False)
+
+        # Create Spark engine (local or cluster based on config)
+        _log(f"[DVT] Creating Spark engine (mode: {compute_engine})")
+        if compute_engine == "spark-local" or compute_engine == "spark":
+            engine = SparkEngine(mode="embedded", spark_config=spark_config or {})
+        elif compute_engine == "spark-cluster" or compute_engine.startswith("spark:"):
+            # External cluster
+            engine = SparkEngine(mode="external", spark_config=spark_config or {})
+        elif cluster_config:
+            # Named cluster from registry - pass full config
+            engine = SparkEngine(mode="external", spark_config=cluster_config)
+        else:
+            # Fallback
+            engine = SparkEngine(mode="external", spark_config=spark_config or {})
+
+        _log("[DVT] Spark engine created, initializing Spark session...")
+        try:
+            # v0.5.99: Collect adapter types from sources + target for JDBC driver provisioning
+            all_adapter_types = set()
+            for source_table in source_tables:
+                adapter = self.adapters.get(source_table.connection_name)
+                if adapter:
+                    all_adapter_types.add(adapter.type())
+            # Include target adapter type for materialization
+            if target_adapter_type:
+                all_adapter_types.add(target_adapter_type)
+            _log(f"[DVT] Adapter types (sources + target): {all_adapter_types}")
+
+            # Initialize Spark session with all adapter types (for JDBC drivers)
+            engine.connect(adapter_types=all_adapter_types)
+            _log("[DVT] Spark session initialized successfully")
+
+            # Get compiled SQL first (needed for optimization checks)
+            compiled_sql = (
+                node.compiled_code
+                if hasattr(node, "compiled_code")
+                else node.raw_code
+            )
+
+            # Step 1: Load source data into Spark via JDBC (v0.3.0: Spark-only)
+            total_rows_read = self._load_sources_spark_jdbc(
+                engine, source_tables, analysis_result, compiled_sql
+            )
+
+            # Step 2: Rewrite SQL to use table aliases
+            rewritten_sql = self._rewrite_sql_for_compute(
+                compiled_sql, source_tables
+            )
+
+            # Step 3: Execute query in Spark
+            result_df = engine.spark.sql(rewritten_sql)
+
+            # Calculate execution time
+            execution_time_ms = (time.time() - start_time) * 1000
+
+            # Return Spark DataFrame AND engine (caller must close engine after materialization)
+            return FederatedExecutionResult(
+                spark_dataframe=result_df,
+                source_tables=source_tables,
+                compute_engine=compute_engine,
+                execution_time_ms=execution_time_ms,
+                rows_read=total_rows_read,
+                rows_returned=None,  # Will be counted during JDBC write
+                engine=engine,  # Return engine for lifecycle management
+            )
+
+        except Exception as e:
+            # Clean up engine on error
+            try:
+                engine.close()
+            except:
+                pass
+            # DVT v0.5.2: Clean error message (no Java stack trace)
+            clean_error = _clean_spark_error(e)
+            # DVT v0.5.99: Include original exception for debugging if cleaned message is too short
+            if len(clean_error) < 20:
+                clean_error = f"{clean_error} (original: {str(e)[:200]})"
+            raise DbtRuntimeError(
+                f"Federated execution failed for node {node.unique_id}: {clean_error}"
+            )
+
+    def _extract_source_tables(
+        self, analysis_result: QueryAnalysisResult
+    ) -> List[SourceTableMetadata]:
+        """
+        Extract metadata for all source tables referenced in the query.
+
+        :param analysis_result: Query analysis result
+        :returns: List of SourceTableMetadata
+        """
+        source_tables = []
+
+        for source_id in analysis_result.source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                raise DbtRuntimeError(
+                    f"Source {source_id} not found in manifest. "
+                    f"Available sources: {list(self.manifest.sources.keys())[:3]}"
+                )
+
+            # Get connection name from source definition
+            connection_name = getattr(source, "connection", None)
+
+            if not connection_name:
+                raise DbtRuntimeError(
+                    f"Source {source_id} does not have a connection specified. "
+                    "DVT requires all sources to specify a connection in the source YAML:\n"
+                    "  - name: my_source\n"
+                    "    connection: my_connection"
+                )
+
+            # Build qualified name for SQL
+            qualified_name = f"{source.database}.{source.schema}.{source.identifier}"
+
+            metadata = SourceTableMetadata(
+                source_id=source_id,
+                connection_name=connection_name,
+                database=source.database,
+                schema=source.schema,
+                identifier=source.identifier,
+                qualified_name=qualified_name,
+            )
+
+            source_tables.append(metadata)
+
+        return source_tables
+
+    # NOTE: _load_sources_via_adapters method removed in v0.3.0
+    # All data loading now uses Spark JDBC via _load_sources_spark_jdbc
+
+    def _load_sources_spark_jdbc(
+        self,
+        engine: SparkEngine,
+        source_tables: List[SourceTableMetadata],
+        analysis_result: QueryAnalysisResult,
+        compiled_sql: str,
+    ) -> int:
+        """
+        Load all source tables into Spark via JDBC connectors (Phase 1: v0.2.0).
+
+        This bypasses the DVT node's memory by reading data directly from source
+        databases into Spark workers (distributed memory). Data flow:
+        Source DB → Spark Workers → Target DB (no DVT node bottleneck)
+
+        This method:
+        1. Gets adapter credentials for each source
+        2. Converts credentials to JDBC config
+        3. Auto-detects partition column for parallel reads
+        4. Reads data via Spark JDBC with partitioning
+        5. Registers as temp view in Spark
+
+        :param engine: Spark engine instance
+        :param source_tables: List of source table metadata
+        :param analysis_result: Query analysis result
+        :returns: Total number of rows loaded (estimated, as Spark is lazy)
+        :raises DbtRuntimeError: If JDBC not supported or read fails
+        """
+        from dbt.compute.jdbc_utils import build_jdbc_config
+        from dbt.compute.filter_pushdown import optimize_jdbc_table_read
+
+        total_rows = 0
+
+        for source_meta in source_tables:
+            # Get adapter for this source's connection
+            adapter = self.adapters.get(source_meta.connection_name)
+            if not adapter:
+                raise DbtRuntimeError(
+                    f"No adapter found for connection '{source_meta.connection_name}'"
+                )
+
+            # Check if JDBC is supported for this adapter type
+            if not engine.supports_jdbc(adapter.type()):
+                raise DbtRuntimeError(
+                    f"JDBC not supported for adapter type '{adapter.type()}'. "
+                    f"Falling back to adapter-based loading not yet implemented. "
+                    f"Please use DuckDB engine for this source type."
+                )
+
+            # Log connection attempt
+            _log(f"[DVT] Connecting to {adapter.type()} source: {source_meta.qualified_name} (connection: {source_meta.connection_name})")
+            connection_start = time.time()
+
+            # Get adapter credentials
+            credentials = adapter.config.credentials
+
+            # Build JDBC configuration
+            try:
+                jdbc_url, jdbc_properties = build_jdbc_config(credentials)
+            except Exception as e:
+                _log(f"[DVT] ERROR: Failed to build JDBC config for '{source_meta.qualified_name}': {str(e)}")
+                raise DbtRuntimeError(
+                    f"Failed to build JDBC config for source '{source_meta.qualified_name}': {str(e)}"
+                ) from e
+
+            # Prepare JDBC read parameters with filter pushdown optimization
+            # Instead of reading full table, push down filters (LIMIT, WHERE) to source DB
+            jdbc_table = optimize_jdbc_table_read(
+                source_table=source_meta,
+                compiled_sql=compiled_sql,
+                source_tables=source_tables,
+                adapter_type=adapter.type()
+            )
+            table_alias = self._get_table_alias(source_meta)
+            numPartitions = 16  # Default parallelism
+
+            # Automatic partition detection DISABLED
+            # Reasons:
+            # 1. Slow metadata queries (30-60s on cold Snowflake warehouses)
+            # 2. Unnecessary overhead for small datasets
+            # 3. Filter pushdown now handles optimization automatically
+            partition_column = None
+            lower_bound = None
+            upper_bound = None
+
+            # Read via Spark JDBC and register as temp view
+            _log(f"[DVT] Reading from JDBC: {jdbc_table}")
+            try:
+                engine.register_jdbc_table(
+                    url=jdbc_url,
+                    table=jdbc_table,
+                    properties=jdbc_properties,
+                    table_alias=table_alias,
+                    numPartitions=numPartitions,
+                    partitionColumn=partition_column,
+                    lowerBound=lower_bound,
+                    upperBound=upper_bound,
+                )
+                connection_time = time.time() - connection_start
+                _log(f"[DVT] ✓ Connected to {source_meta.qualified_name} in {connection_time:.1f}s")
+                if connection_time > 30:
+                    _log(f"[DVT] WARNING: Connection took {connection_time:.1f}s (warehouse may have been suspended)")
+            except Exception as e:
+                connection_time = time.time() - connection_start
+                # DVT v0.5.2: Clean error message (no Java stack trace)
+                clean_error = _clean_spark_error(e)
+                _log(f"[DVT] ERROR: Failed to load '{source_meta.qualified_name}' after {connection_time:.1f}s: {clean_error}")
+                raise DbtRuntimeError(
+                    f"Failed to load source '{source_meta.qualified_name}' via JDBC: {clean_error}"
+                )
+
+            # Note: Can't easily count rows without triggering Spark action
+            # For now, return 0 (rows_read will be inaccurate for JDBC path)
+            # TODO: Consider running COUNT(*) query if row count is needed
+            total_rows += 0
+
+        return total_rows
+
+    def _get_table_alias(self, source_meta: SourceTableMetadata) -> str:
+        """
+        Generate a safe table alias for the compute engine.
+
+        Compute engines may not support dots or special characters in table names,
+        so we create a normalized alias.
+
+        :param source_meta: Source table metadata
+        :returns: Safe table alias
+        """
+        # Extract source name and table name from source_id
+        # source_id format: source.{project}.{source_name}.{table_name}
+        parts = source_meta.source_id.split(".")
+        if len(parts) >= 4:
+            source_name = parts[2]
+            table_name = parts[3]
+            return f"{source_name}_{table_name}"
+        else:
+            # Fallback: use identifier
+            return source_meta.identifier
+
+    def _rewrite_sql_for_compute(
+        self, sql: str, source_tables: List[SourceTableMetadata]
+    ) -> str:
+        """
+        Rewrite SQL to replace fully-qualified source table names with compute engine aliases.
+
+        Source tables are loaded into the compute engine with simple aliases (e.g., 'Exim_cbs_f_country'),
+        but the compiled SQL contains fully-qualified names (e.g., '"EXIM_EDWH_DEV"."ods"."cbs_f_country"').
+        This method replaces the qualified names with the aliases and removes source-specific clauses
+        like SAMPLE that have been pushed down to the source.
+
+        :param sql: Compiled SQL with fully-qualified table names
+        :param source_tables: List of source table metadata
+        :returns: Rewritten SQL with aliases and source-specific clauses removed
+        """
+        import re
+
+        rewritten_sql = sql
+
+        for source_meta in source_tables:
+            # Get the alias used in the compute engine
+            alias = self._get_table_alias(source_meta)
+
+            # Replace the fully-qualified table name with the alias
+            # Format: "database"."schema"."table" or database.schema.table
+            qualified_name = source_meta.qualified_name
+            parts = qualified_name.split(".")
+
+            # DVT v0.51.7: Use case-insensitive regex replacement for all variants
+            # because Snowflake returns uppercase but Spark/Databricks lowercases
+
+            # 1. Unquoted: EXIM_EDWH_DEV.ods.cbs_f_country (any case)
+            unquoted_pattern = re.compile(
+                r'\b' + r'\.'.join(re.escape(p) for p in parts) + r'\b',
+                re.IGNORECASE
+            )
+            rewritten_sql = unquoted_pattern.sub(alias, rewritten_sql)
+
+            # 2. Double-quoted (PostgreSQL style): "EXIM_EDWH_DEV"."ods"."cbs_f_country" (any case)
+            quoted_pattern = re.compile(
+                r'"' + r'"\."\s*'.join(re.escape(p) for p in parts) + r'"',
+                re.IGNORECASE
+            )
+            rewritten_sql = quoted_pattern.sub(alias, rewritten_sql)
+
+            # 3. Single string quoted: "EXIM_EDWH_DEV.ods.cbs_f_country" (any case)
+            single_quoted_pattern = re.compile(
+                r'"' + r'\.'.join(re.escape(p) for p in parts) + r'"',
+                re.IGNORECASE
+            )
+            rewritten_sql = single_quoted_pattern.sub(alias, rewritten_sql)
+
+            # 4. Backtick-quoted (Spark/Databricks style): `EXIM_EDWH_DEV`.`ods`.`cbs_f_country` (any case)
+            backtick_pattern = re.compile(
+                r'`' + r'`\.`\s*'.join(re.escape(p) for p in parts) + r'`',
+                re.IGNORECASE
+            )
+            rewritten_sql = backtick_pattern.sub(alias, rewritten_sql)
+
+        # DVT v0.4.5: Remove Snowflake-specific SAMPLE clauses
+        # These have been pushed down to the source via JDBC subqueries
+        # Spark SQL doesn't support SAMPLE syntax, so remove it from the query
+        # Pattern matches: SAMPLE (N), SAMPLE (N ROWS), SAMPLE SYSTEM|BERNOULLI|BLOCK (P)
+        # with optional REPEATABLE(seed) or SEED(seed)
+        rewritten_sql = re.sub(
+            r'\s*(?:TABLE)?SAMPLE\s+(?:SYSTEM|BERNOULLI|BLOCK)\s*\(\s*\d+(?:\.\d+)?\s*\)'
+            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
+            '',
+            rewritten_sql,
+            flags=re.IGNORECASE
+        )
+        rewritten_sql = re.sub(
+            r'\s*(?:TABLE)?SAMPLE\s*\(\s*\d+(?:\s+ROWS)?\s*\)'
+            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
+            '',
+            rewritten_sql,
+            flags=re.IGNORECASE
+        )
+
+        return rewritten_sql
+
+    def materialize_result(
+        self,
+        result: FederatedExecutionResult,
+        target_adapter: BaseAdapter,
+        target_table: str,
+        mode: str = "create",
+        use_jdbc: bool = True,
+        spark_result_df: Optional[Any] = None,
+    ) -> Any:
+        """
+        Materialize federated query results to target database.
+
+        v0.3.0: Uses Spark JDBC for all materialization (default).
+
+        :param result: Federated execution result
+        :param target_adapter: Adapter to use for getting target credentials
+        :param target_table: Target table name (qualified)
+        :param mode: Write mode ('create', 'append', 'replace')
+        :param use_jdbc: If True, use JDBC write path (default in v0.3.0)
+        :param spark_result_df: Spark DataFrame with results (required for JDBC path)
+        :returns: AdapterResponse from write operation
+        """
+        if use_jdbc and spark_result_df is not None:
+            # Use JDBC write path (default in v0.3.0)
+            return self._materialize_spark_jdbc(
+                result_df=spark_result_df,
+                target_adapter=target_adapter,
+                target_table=target_table,
+                mode=mode,
+            )
+        else:
+            # Fallback: use target adapter directly (for adapters without JDBC support)
+            raise DbtRuntimeError(
+                "Non-JDBC materialization path removed in v0.3.0. "
+                "All materialization requires Spark JDBC. "
+                "Ensure spark_result_df is provided."
+            )
+
+    def _materialize_spark_jdbc(
+        self,
+        result_df: Any,  # Spark DataFrame
+        target_adapter: BaseAdapter,
+        target_table: str,
+        mode: str = "create",
+    ) -> Any:
+        """
+        Materialize Spark query results to target database via JDBC (Phase 1: v0.2.0).
+
+        This bypasses the DVT node's memory by writing data directly from Spark
+        workers to the target database.
+
+        :param result_df: Spark DataFrame with query results
+        :param target_adapter: Adapter to use for getting target credentials
+        :param target_table: Target table name (qualified)
+        :param mode: Write mode ('create', 'append', 'replace')
+        :returns: AdapterResponse
+        :raises DbtRuntimeError: If JDBC write fails
+        """
+        from dbt.compute.jdbc_utils import build_jdbc_config
+        from dbt.adapters.contracts.connection import AdapterResponse
+
+        # Get target credentials
+        target_credentials = target_adapter.config.credentials
+
+        # Build JDBC configuration for target
+        try:
+            jdbc_url, jdbc_properties = build_jdbc_config(target_credentials)
+        except Exception as e:
+            raise DbtRuntimeError(
+                f"Failed to build JDBC config for target '{target_table}': {str(e)}"
+            ) from e
+
+        # Map DVT mode to Spark JDBC mode
+        spark_mode_mapping = {
+            "create": "overwrite",  # Create/recreate table (dbt behavior)
+            "append": "append",  # Add to existing table
+            "replace": "overwrite",  # Drop and recreate
+        }
+        spark_mode = spark_mode_mapping.get(mode, "overwrite")
+
+        _log(f"[DVT] Writing to target via Spark JDBC: {target_table} (mode={spark_mode})")
+
+        # Get Spark session from DataFrame
+        spark = result_df.sparkSession
+
+        # Log DataFrame schema for debugging
+        _log(f"[DVT] DataFrame schema:")
+        for field in result_df.schema.fields:
+            _log(f"  - {field.name}: {field.dataType}")
+
+        # Log row count
+        row_count = result_df.count()
+        _log(f"[DVT] DataFrame has {row_count} rows")
+
+        # Sanitize URL for logging (hide password)
+        safe_url = jdbc_url.split("?")[0] if "?" in jdbc_url else jdbc_url
+        _log(f"[DVT] JDBC URL: {safe_url}")
+        _log(f"[DVT] JDBC table: {target_table}")
+
+        # Write via JDBC
+        saved_views: List[Dict[str, str]] = []
+        target_adapter_type = target_adapter.type()
+        is_postgres = target_adapter_type in ("postgres", "postgresql")
+
+        try:
+            # DVT v0.5.5: Save dependent views before DROP CASCADE, restore after
+            # Spark's JDBC overwrite mode doesn't use CASCADE, causing failures
+            # when dependent objects (views, etc.) exist
+            # DVT v0.51.6: Only applies to PostgreSQL (other DBs handle this differently)
+            if spark_mode == "overwrite" and is_postgres:
+                try:
+                    with target_adapter.connection_named("__dvt_drop__"):
+                        conn = target_adapter.connections.get_thread_connection()
+                        cursor = conn.handle.cursor()
+
+                        # Parse schema.table from target_table
+                        parts = target_table.replace('"', '').split('.')
+                        if len(parts) >= 2:
+                            tbl_schema = parts[-2]
+                            tbl_name = parts[-1]
+                        else:
+                            tbl_schema = 'public'
+                            tbl_name = parts[-1]
+
+                        # DVT v0.5.5: Save dependent views before dropping
+                        saved_views = _get_dependent_views_pg(cursor, tbl_schema, tbl_name)
+                        if saved_views:
+                            _log(f"[DVT] Saving {len(saved_views)} dependent view(s) before DROP")
+
+                        # Use CASCADE to drop dependent objects
+                        drop_sql = f"DROP TABLE IF EXISTS {target_table} CASCADE"
+                        _log(f"[DVT] Pre-drop with CASCADE: {drop_sql}")
+                        cursor.execute(drop_sql)
+                        conn.handle.commit()
+                        cursor.close()
+                except Exception as drop_err:
+                    _log(f"[DVT] Pre-drop warning (continuing): {drop_err}")
+
+            result_df.write.format("jdbc").options(
+                url=jdbc_url, dbtable=target_table, batchsize="10000", **jdbc_properties
+            ).mode(spark_mode).save()
+
+            # DVT v0.5.5: Restore dependent views after successful write (PostgreSQL only)
+            if saved_views and is_postgres:
+                try:
+                    with target_adapter.connection_named("__dvt_restore__"):
+                        conn = target_adapter.connections.get_thread_connection()
+                        cursor = conn.handle.cursor()
+                        _recreate_views_pg(cursor, saved_views)
+                        conn.handle.commit()
+                        cursor.close()
+                        _log(f"[DVT] Restored {len(saved_views)} dependent view(s)")
+                except Exception as restore_err:
+                    _log(f"[DVT] Warning: Could not restore views: {restore_err}")
+
+            # Return mock AdapterResponse
+            # Note: Can't easily get rows_affected from Spark JDBC write
+            return AdapterResponse(
+                _message=f"SUCCESS - Table {target_table} materialized via JDBC",
+                rows_affected=row_count,
+            )
+
+        except Exception as e:
+            # DVT v0.5.2: Clean error message (no Java stack trace)
+            clean_error = _clean_spark_error(e)
+            raise DbtRuntimeError(
+                f"Failed to materialize results to '{target_table}': {clean_error}"
+            )
+
+    def explain_execution(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Generate an execution plan explanation for a federated query.
+
+        Useful for debugging and optimization.
+
+        :param node: The node to explain
+        :param analysis_result: Query analysis result
+        :returns: Human-readable execution plan
+        """
+        source_tables = self._extract_source_tables(analysis_result)
+
+        plan_parts = [
+            "=== DVT Federated Execution Plan ===",
+            f"Node: {node.unique_id}",
+            f"Compute Engine: {self.default_compute_engine}",
+            "",
+            "Data Sources:",
+        ]
+
+        for i, source_meta in enumerate(source_tables, 1):
+            plan_parts.append(
+                f"  {i}. {source_meta.qualified_name} "
+                f"(connection: {source_meta.connection_name})"
+            )
+
+        plan_parts.extend(
+            [
+                "",
+                "Execution Steps (v0.3.0 - Spark-Unified):",
+                "  1. Extract data from each source via Spark JDBC (parallel reads)",
+                f"  2. Load {len(source_tables)} table(s) into Spark ({self.default_compute_engine})",
+                "  3. Execute query in Spark",
+                "  4. Materialize to target via Spark JDBC",
+                "",
+                f"Strategy: {analysis_result.strategy.upper()}",
+                f"Reason: {analysis_result.reason}",
+            ]
+        )
+
+        return "\n".join(plan_parts)
+
+
+class SourceRewriter:
+    """
+    Rewrites SQL queries to use compute engine table aliases.
+
+    When sources are loaded into compute engines, they may be registered with
+    different names (aliases). This class rewrites the SQL to use those aliases.
+    """
+
+    @staticmethod
+    def rewrite_sources(sql: str, source_mapping: Dict[str, str]) -> str:
+        """
+        Rewrite SQL to use compute engine table aliases.
+
+        :param sql: Original SQL with qualified source names
+        :param source_mapping: Dict of qualified_name → alias
+        :returns: Rewritten SQL
+        """
+        rewritten = sql
+
+        # Replace each qualified name with its alias
+        for qualified_name, alias in source_mapping.items():
+            # Match qualified name (database.schema.table)
+            pattern = re.compile(rf"\b{re.escape(qualified_name)}\b", re.IGNORECASE)
+            rewritten = pattern.sub(alias, rewritten)

+        return rewritten
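
The diff above defines the whole federation surface: FederatedExecutor.execute() loads sources into Spark over JDBC and returns a FederatedExecutionResult, and materialize_result() writes the resulting DataFrame to the target over JDBC. The sketch below is illustrative only and is not part of the package; it shows how a caller might wire these pieces together, based solely on the signatures in this diff. The wrapper function and its arguments (manifest, adapters, node, analysis result, target adapter, target table) are assumed to be supplied by the surrounding dbt/DVT runtime.

    # Hypothetical driver-side sketch -- not shipped in dvt-core.
    from typing import Dict

    from dbt.adapters.base import BaseAdapter
    from dbt.compute.federated_executor import FederatedExecutor
    from dbt.contracts.graph.manifest import Manifest
    from dbt.contracts.graph.nodes import ManifestNode
    from dbt.query_analyzer import QueryAnalysisResult


    def run_node_federated(
        manifest: Manifest,
        adapters: Dict[str, BaseAdapter],
        node: ManifestNode,
        analysis: QueryAnalysisResult,
        target_adapter: BaseAdapter,
        target_table: str,
    ) -> None:
        executor = FederatedExecutor(manifest, adapters, default_compute_engine="spark-local")
        # Pass the target adapter type so the Spark session provisions its JDBC driver too.
        result = executor.execute(
            node=node,
            analysis_result=analysis,
            target_adapter_type=target_adapter.type(),
        )
        try:
            # Write the Spark DataFrame straight to the target via JDBC.
            executor.materialize_result(
                result,
                target_adapter=target_adapter,
                target_table=target_table,
                mode="create",
                spark_result_df=result.spark_dataframe,
            )
        finally:
            # execute() hands the SparkEngine back in the result; the caller
            # closes it only after materialization has finished.
            result.engine.close()

As the docstrings above note, execute() deliberately returns the engine in the result so the Spark session stays alive through the JDBC write and is closed by the caller afterwards.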