dvt-core 0.58.6 cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1080 @@
+"""
+Federated Query Executor
+
+Orchestrates multi-source query execution using Spark compute engine.
+This is the core component that enables DVT's data virtualization capabilities.
+
+v0.3.0: Unified Spark architecture - all federation uses Spark JDBC.
+v0.58.5: Fixed segfaults by disabling multiprocessing resource tracker.
+
+Execution flow:
+1. Identify all source tables/models from compiled SQL
+2. Load sources into Spark via JDBC (parallel reads)
+3. Execute model SQL in Spark
+4. Return results as PyArrow Table
+5. Materialize to target via JDBC or adapter
+
+Key principle: Adapters for I/O only, Spark for all compute.
+"""
+
+# Standard imports
+import os
+import re
+import sys
+import time
+from typing import Any, Dict, List, Optional, Set, Tuple
+from dataclasses import dataclass
+
+from pathlib import Path
+
+from datetime import datetime
+
+from dbt.adapters.base import BaseAdapter
+from dbt.compute.engines.spark_engine import SparkEngine, _clean_spark_error
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+from dbt_common.exceptions import DbtRuntimeError
+
+
+def _log(msg: str) -> None:
+    """
+    Log a message that appears immediately in console output.
+    DVT v0.4.7: Suppressed for clean output (logs go to spark_run_history).
+    """
+    # Suppressed for clean output - all debug info goes to spark_run_history file
+    pass
+
+
+def _get_dependent_views_pg(cursor, schema: str, table: str) -> List[Dict[str, str]]:
+    """
+    Query PostgreSQL for views that depend on a table.
+    DVT v0.5.5: Used to save views before DROP CASCADE, then restore after.
+
+    Returns list of dicts with: schema, name, definition
+    """
+    try:
+        # Query views that depend on this table using pg_depend
+        sql = """
+            SELECT DISTINCT
+                n.nspname as view_schema,
+                c.relname as view_name,
+                pg_get_viewdef(c.oid, true) as view_definition
+            FROM pg_depend d
+            JOIN pg_rewrite r ON r.oid = d.objid
+            JOIN pg_class c ON c.oid = r.ev_class
+            JOIN pg_namespace n ON n.oid = c.relnamespace
+            JOIN pg_class t ON t.oid = d.refobjid
+            JOIN pg_namespace tn ON tn.oid = t.relnamespace
+            WHERE t.relname = %s
+              AND tn.nspname = %s
+              AND c.relkind = 'v'
+              AND d.classid = 'pg_rewrite'::regclass
+              AND d.deptype = 'n'
+        """
+        cursor.execute(sql, (table, schema))
+        rows = cursor.fetchall()
+        return [
+            {'schema': row[0], 'name': row[1], 'definition': row[2]}
+            for row in rows
+        ]
+    except Exception:
+        # If query fails (different DB, permissions), return empty
+        return []
+
+
+def _recreate_views_pg(cursor, views: List[Dict[str, str]]) -> None:
+    """
+    Recreate views from their saved definitions.
+    DVT v0.5.5: Restores views after DROP CASCADE.
+    """
+    for view in views:
+        try:
+            create_sql = f'CREATE OR REPLACE VIEW "{view["schema"]}"."{view["name"]}" AS {view["definition"]}'
+            _log(f"[DVT] Recreating view: {view['schema']}.{view['name']}")
+            cursor.execute(create_sql)
+        except Exception as e:
+            _log(f"[DVT] Warning: Could not recreate view {view['name']}: {e}")
+
+
+@dataclass
+class SourceTableMetadata:
+    """Metadata about a source table needed for federated execution."""
+
+    source_id: str  # Unique ID from manifest
+    connection_name: str  # Which connection to read from
+    database: str  # Database name
+    schema: str  # Schema name
+    identifier: str  # Table name
+    qualified_name: str  # Fully qualified name for SQL
+
+
+@dataclass
+class FederatedExecutionResult:
+    """Result of federated query execution."""
+
+    spark_dataframe: Any  # Spark DataFrame with query results
+    source_tables: List[SourceTableMetadata]  # Sources used
+    compute_engine: str  # Engine used (spark)
+    execution_time_ms: float  # Execution time in milliseconds
+    rows_read: int  # Total rows read from sources
+    rows_returned: int  # Rows in result (may be None if not counted)
+    engine: Any  # SparkEngine instance (for session lifecycle management)
+
+
+class FederatedExecutor:
+    """
+    Orchestrates federated query execution across multiple data sources.
+
+    This executor:
+    1. Extracts data from multiple sources via adapters
+    2. Loads data into a compute engine
+    3. Executes the query
+    4. Returns results as Spark DataFrame
+    """
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        adapters: Dict[str, BaseAdapter],
+        default_compute_engine: str = "spark-local",
+        project_root: Optional[Path] = None,
+    ):
+        """
+        Initialize federated executor.
+
+        v0.3.0: All federation uses Spark (local or cluster).
+        v0.54.0: Added metadata store integration for type mapping.
+
+        :param manifest: The dbt manifest with all nodes and sources
+        :param adapters: Dict of connection_name → adapter instances
+        :param default_compute_engine: Default compute engine ("spark-local" or "spark-cluster")
+        :param project_root: Project root directory (for metadata store access)
+        """
+        self.manifest = manifest
+        self.adapters = adapters
+        self.default_compute_engine = default_compute_engine
+        self.project_root = project_root or Path(".")
+        self._metadata_store = None
+
+    @property
+    def metadata_store(self):
+        """
+        Lazy-load the project metadata store.
+
+        v0.54.0: Returns None if store doesn't exist (graceful degradation).
+        """
+        if self._metadata_store is None:
+            try:
+                from dbt.compute.metadata import ProjectMetadataStore
+                store_path = self.project_root / ".dvt" / "metadata.duckdb"
+                if store_path.exists():
+                    self._metadata_store = ProjectMetadataStore(self.project_root)
+                    _log("[DVT] Metadata store loaded from .dvt/metadata.duckdb")
+            except ImportError:
+                _log("[DVT] DuckDB not available - metadata store disabled")
+            except Exception as e:
+                _log(f"[DVT] Could not load metadata store: {e}")
+        return self._metadata_store
+
+    def get_source_column_metadata(
+        self,
+        source_name: str,
+        table_name: str
+    ) -> Optional[List[Dict[str, Any]]]:
+        """
+        Look up column metadata for a source table from the metadata store.
+
+        v0.54.0: Returns cached metadata if available, None otherwise.
+
+        :param source_name: Name of the source
+        :param table_name: Name of the table
+        :returns: List of column metadata dicts, or None if not cached
+        """
+        if self.metadata_store is None:
+            return None
+
+        try:
+            metadata = self.metadata_store.get_table_metadata(source_name, table_name)
+            if metadata:
+                return [
+                    {
+                        "column_name": col.column_name,
+                        "adapter_type": col.adapter_type,
+                        "spark_type": col.spark_type,
+                        "is_nullable": col.is_nullable,
+                        "ordinal_position": col.ordinal_position,
+                    }
+                    for col in metadata.columns
+                ]
+        except Exception as e:
+            _log(f"[DVT] Warning: Could not fetch metadata for {source_name}.{table_name}: {e}")
+
+        return None
+
+    def get_spark_schema_for_source(
+        self,
+        source_name: str,
+        table_name: str
+    ) -> Optional[str]:
+        """
+        Generate Spark schema DDL for a source table from cached metadata.
+
+        v0.54.0: Returns schema string for explicit type enforcement.
+
+        :param source_name: Name of the source
+        :param table_name: Name of the table
+        :returns: Spark schema DDL string, or None if not cached
+        """
+        columns = self.get_source_column_metadata(source_name, table_name)
+        if not columns:
+            return None
+
+        # Build Spark schema DDL
+        # Format: "col1 StringType, col2 IntegerType, ..."
+        schema_parts = []
+        for col in sorted(columns, key=lambda c: c["ordinal_position"]):
+            spark_type = col["spark_type"]
+            nullable = "" if col["is_nullable"] else " NOT NULL"
+            schema_parts.append(f"`{col['column_name']}` {spark_type}{nullable}")
+
+        return ", ".join(schema_parts)
+
+    def capture_source_metadata(
+        self,
+        engine: SparkEngine,
+        source_name: str,
+        table_name: str,
+        adapter_name: str,
+        connection_name: str,
+        schema_name: str,
+        table_alias: str
+    ) -> None:
+        """
+        Capture metadata from a loaded source table and save to metadata store.
+
+        v0.54.0: Metadata propagation during federated execution.
+
+        :param engine: SparkEngine instance with loaded table
+        :param source_name: Name of the source
+        :param table_name: Name of the table
+        :param adapter_name: Type of adapter (postgres, snowflake, etc.)
+        :param connection_name: Connection profile name
+        :param schema_name: Database schema name
+        :param table_alias: Alias used in Spark for the temp view
+        """
+        if self.metadata_store is None:
+            return
+
+        try:
+            # Import here to avoid circular imports
+            from dbt.compute.metadata.store import TableMetadata, ColumnMetadata
+            from dbt.compute.metadata.registry import TypeRegistry
+
+            # Get schema from Spark temp view
+            spark_schema = engine.get_schema(table_alias)
+            if not spark_schema:
+                _log(f"[DVT] Could not get schema for {table_alias}")
+                return
+
+            # Build column metadata from Spark schema
+            columns = []
+            for idx, field in enumerate(spark_schema):
+                # field is a StructField with name, dataType, nullable
+                spark_type_str = str(field.dataType)
+
+                # Try to map Spark type back to adapter type
+                # Look up in type registry (reverse mapping)
+                adapter_type = self._spark_to_adapter_type(
+                    adapter_name, spark_type_str
+                )
+
+                columns.append(ColumnMetadata(
+                    column_name=field.name,
+                    adapter_type=adapter_type,
+                    spark_type=spark_type_str,
+                    is_nullable=field.nullable,
+                    is_primary_key=False,  # Can't infer from JDBC
+                    ordinal_position=idx + 1,
+                ))
+
+            if columns:
+                # Create and save table metadata
+                metadata = TableMetadata(
+                    source_name=source_name,
+                    table_name=table_name,
+                    adapter_name=adapter_name,
+                    connection_name=connection_name,
+                    schema_name=schema_name,
+                    row_count=None,  # Don't query count to avoid performance hit
+                    columns=columns,
+                    last_refreshed=datetime.now(),
+                )
+
+                with self.metadata_store as store:
+                    store.save_table_metadata(metadata)
+                _log(f"[DVT] Captured metadata for {source_name}.{table_name}: {len(columns)} columns")
+
+        except Exception as e:
+            # Don't fail execution if metadata capture fails
+            _log(f"[DVT] Warning: Could not capture metadata for {source_name}.{table_name}: {e}")
+
+    def _spark_to_adapter_type(
+        self,
+        adapter_name: str,
+        spark_type: str
+    ) -> str:
+        """
+        Map Spark type back to approximate adapter type.
+
+        This is a best-effort reverse mapping - exact original type
+        may not be recoverable due to type normalization during JDBC read.
+
+        :param adapter_name: Target adapter name
+        :param spark_type: Spark type string (e.g., "StringType()")
+        :returns: Approximate adapter type string
+        """
+        from dbt.compute.metadata.registry import TypeRegistry
+
+        # Normalize spark type (remove parentheses, etc.)
+        spark_type_normalized = spark_type.replace("()", "").replace("Type", "").upper()
+
+        # Common mappings (reverse of type_registry)
+        spark_to_common = {
+            "STRING": "VARCHAR",
+            "INTEGER": "INTEGER",
+            "INT": "INTEGER",
+            "LONG": "BIGINT",
+            "BIGINT": "BIGINT",
+            "SHORT": "SMALLINT",
+            "DOUBLE": "DOUBLE PRECISION",
+            "FLOAT": "REAL",
+            "DECIMAL": "DECIMAL",
+            "BOOLEAN": "BOOLEAN",
+            "DATE": "DATE",
+            "TIMESTAMP": "TIMESTAMP",
+            "BINARY": "BYTEA",
+            "ARRAY": "ARRAY",
+            "MAP": "JSON",
+            "STRUCT": "JSON",
+        }
+
+        # Return common SQL type
+        return spark_to_common.get(spark_type_normalized, spark_type)
+
+    def execute(
+        self,
+        node: ManifestNode,
+        analysis_result: QueryAnalysisResult,
+        compute_engine_override: Optional[str] = None,
+        spark_config: Optional[Dict[str, str]] = None,
+        target_adapter_type: Optional[str] = None,
+        coerce_view_to_table: bool = False,
+    ) -> FederatedExecutionResult:
+        """
+        Execute a node using federated query processing.
+
+        :param node: The compiled node to execute
+        :param analysis_result: Query analysis result
+        :param compute_engine_override: Override compute engine choice
+        :param spark_config: Spark configuration (if using Spark)
+        :param target_adapter_type: Target adapter type for JDBC materialization
+        :param coerce_view_to_table: DVT v0.51.6 - If True, treat view as table (Rule 3.C.3)
+        :returns: FederatedExecutionResult with query results
+        :raises DbtRuntimeError: If execution fails
+        """
+        import time
+
+        _log(f"[DVT] Starting federated execution for node: {node.unique_id}")
+        start_time = time.time()
+
+        # Determine compute engine
+        compute_engine = (
+            compute_engine_override
+            or analysis_result.user_override
+            or self.default_compute_engine
+        )
+        _log(f"[DVT] Compute engine selected: {compute_engine}")
+
+        # DVT v0.5.0: Restrict Spark compute to table and incremental materializations only
+        # DVT v0.51.6: Allow view if coerce_view_to_table is True (Rule 3.C.3)
+        if hasattr(node, 'config') and hasattr(node.config, 'materialized'):
+            materialized = node.config.materialized
+
+            # DVT v0.51.6: Views are coerced to tables in cross-target scenarios
+            effective_materialized = 'table' if (materialized == 'view' and coerce_view_to_table) else materialized
+
+            # Only allow table and incremental
+            if effective_materialized not in ('table', 'incremental'):
+                raise DbtRuntimeError(
+                    f"Spark compute engine only supports 'table' and 'incremental' materializations. "
+                    f"Node '{node.unique_id}' uses '{materialized}'. "
+                    f"Please change the materialization to 'table' or 'incremental', or use adapter-native execution."
+                )
+
+            # For incremental, validate strategy is 'append' (only supported strategy)
+            if materialized == 'incremental':
+                incremental_strategy = getattr(node.config, 'incremental_strategy', 'append')
+                if incremental_strategy != 'append':
+                    raise DbtRuntimeError(
+                        f"Spark compute engine only supports 'append' incremental strategy. "
+                        f"Node '{node.unique_id}' uses '{incremental_strategy}'. "
+                        f"Supported strategies: append. "
+                        f"For merge/delete+insert/insert_overwrite, use adapter-native execution."
+                    )
+
+            if coerce_view_to_table and materialized == 'view':
+                _log(f"[DVT] Materialization: view → table (coerced for cross-target)")
+            else:
+                _log(f"[DVT] Materialization validated: {materialized}")
+
+        # Extract source table metadata
+        source_tables = self._extract_source_tables(analysis_result)
+        _log(f"[DVT] Found {len(source_tables)} source table(s)")
+
+        # v0.5.99: Look up named clusters from registry
+        from dbt.config.compute import ComputeRegistry
+        from dbt.compute.jdbc_utils import set_docker_mode
+        registry = ComputeRegistry()
+        cluster_config = None
+
+        # Check if it's a registered named cluster
+        if compute_engine not in ("spark-local", "spark", "spark-cluster"):
+            cluster = registry.get(compute_engine)
+            if cluster:
+                cluster_config = cluster.config
+                _log(f"[DVT] Found registered cluster '{compute_engine}' with platform: {cluster.detect_platform().value}")
+
+                # DVT v0.51.8: Enable Docker mode for standalone clusters with localhost master
+                # This rewrites localhost -> host.docker.internal in JDBC URLs
+                master = cluster_config.get("master", "")
+                if master.startswith("spark://") and ("localhost" in master or "127.0.0.1" in master):
+                    set_docker_mode(True)
+                    _log("[DVT] Docker mode enabled for JDBC URLs")
+                else:
+                    set_docker_mode(False)
+            else:
+                # Not in registry - check if it starts with "spark" for backwards compat
+                if not compute_engine.startswith("spark"):
+                    raise DbtRuntimeError(
+                        f"Invalid compute engine '{compute_engine}'. "
+                        f"Not found in compute registry. "
+                        f"Available: {[c.name for c in registry.list()]}"
+                    )
+        else:
+            set_docker_mode(False)
+
+        # Create Spark engine (local or cluster based on config)
+        _log(f"[DVT] Creating Spark engine (mode: {compute_engine})")
+        if compute_engine == "spark-local" or compute_engine == "spark":
+            engine = SparkEngine(mode="embedded", spark_config=spark_config or {})
+        elif compute_engine == "spark-cluster" or compute_engine.startswith("spark:"):
+            # External cluster
+            engine = SparkEngine(mode="external", spark_config=spark_config or {})
+        elif cluster_config:
+            # Named cluster from registry - pass full config
+            engine = SparkEngine(mode="external", spark_config=cluster_config)
+        else:
+            # Fallback
+            engine = SparkEngine(mode="external", spark_config=spark_config or {})
+
+        _log("[DVT] Spark engine created, initializing Spark session...")
+        try:
+            # v0.5.99: Collect adapter types from sources + target for JDBC driver provisioning
+            all_adapter_types = set()
+            for source_table in source_tables:
+                adapter = self.adapters.get(source_table.connection_name)
+                if adapter:
+                    all_adapter_types.add(adapter.type())
+            # Include target adapter type for materialization
+            if target_adapter_type:
+                all_adapter_types.add(target_adapter_type)
+            _log(f"[DVT] Adapter types (sources + target): {all_adapter_types}")
+
+            # Initialize Spark session with all adapter types (for JDBC drivers)
+            engine.connect(adapter_types=all_adapter_types)
+            _log("[DVT] Spark session initialized successfully")
+
+            # Get compiled SQL first (needed for optimization checks)
+            compiled_sql = (
+                node.compiled_code
+                if hasattr(node, "compiled_code")
+                else node.raw_code
+            )
+
+            # Step 1: Load source data into Spark via JDBC (v0.3.0: Spark-only)
+            total_rows_read = self._load_sources_spark_jdbc(
+                engine, source_tables, analysis_result, compiled_sql
+            )
+
+            # Step 2: Rewrite SQL to use table aliases
+            rewritten_sql = self._rewrite_sql_for_compute(
+                compiled_sql, source_tables
+            )
+
+            # Step 3: Execute query in Spark
+            result_df = engine.spark.sql(rewritten_sql)
+
+            # Calculate execution time
+            execution_time_ms = (time.time() - start_time) * 1000
+
+            # Return Spark DataFrame AND engine (caller must close engine after materialization)
+            return FederatedExecutionResult(
+                spark_dataframe=result_df,
+                source_tables=source_tables,
+                compute_engine=compute_engine,
+                execution_time_ms=execution_time_ms,
+                rows_read=total_rows_read,
+                rows_returned=None,  # Will be counted during JDBC write
+                engine=engine,  # Return engine for lifecycle management
+            )
+
+        except Exception as e:
+            # Clean up engine on error
+            try:
+                engine.close()
+            except:
+                pass
+            # DVT v0.5.2: Clean error message (no Java stack trace)
+            clean_error = _clean_spark_error(e)
+            # DVT v0.5.99: Include original exception for debugging if cleaned message is too short
+            if len(clean_error) < 20:
+                clean_error = f"{clean_error} (original: {str(e)[:200]})"
+            raise DbtRuntimeError(
+                f"Federated execution failed for node {node.unique_id}: {clean_error}"
+            )
+
+    def _extract_source_tables(
+        self, analysis_result: QueryAnalysisResult
+    ) -> List[SourceTableMetadata]:
+        """
+        Extract metadata for all source tables referenced in the query.
+
+        :param analysis_result: Query analysis result
+        :returns: List of SourceTableMetadata
+        """
+        source_tables = []
+
+        for source_id in analysis_result.source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                raise DbtRuntimeError(
+                    f"Source {source_id} not found in manifest. "
+                    f"Available sources: {list(self.manifest.sources.keys())[:3]}"
+                )
+
+            # Get connection name from source definition
+            connection_name = getattr(source, "connection", None)
+
+            if not connection_name:
+                raise DbtRuntimeError(
+                    f"Source {source_id} does not have a connection specified. "
+                    "DVT requires all sources to specify a connection in the source YAML:\n"
+                    " - name: my_source\n"
+                    "   connection: my_connection"
+                )
+
+            # Build qualified name for SQL
+            qualified_name = f"{source.database}.{source.schema}.{source.identifier}"
+
+            metadata = SourceTableMetadata(
+                source_id=source_id,
+                connection_name=connection_name,
+                database=source.database,
+                schema=source.schema,
+                identifier=source.identifier,
+                qualified_name=qualified_name,
+            )
+
+            source_tables.append(metadata)
+
+        return source_tables
+
+    # NOTE: _load_sources_via_adapters method removed in v0.3.0
+    # All data loading now uses Spark JDBC via _load_sources_spark_jdbc
+
+    def _load_sources_spark_jdbc(
+        self,
+        engine: SparkEngine,
+        source_tables: List[SourceTableMetadata],
+        analysis_result: QueryAnalysisResult,
+        compiled_sql: str,
+    ) -> int:
+        """
+        Load all source tables into Spark via JDBC connectors (Phase 1: v0.2.0).
+
+        This bypasses the DVT node's memory by reading data directly from source
+        databases into Spark workers (distributed memory). Data flow:
+        Source DB → Spark Workers → Target DB (no DVT node bottleneck)
+
+        This method:
+        1. Gets adapter credentials for each source
+        2. Converts credentials to JDBC config
+        3. Auto-detects partition column for parallel reads
+        4. Reads data via Spark JDBC with partitioning
+        5. Registers as temp view in Spark
+
+        :param engine: Spark engine instance
+        :param source_tables: List of source table metadata
+        :param analysis_result: Query analysis result
+        :returns: Total number of rows loaded (estimated, as Spark is lazy)
+        :raises DbtRuntimeError: If JDBC not supported or read fails
+        """
+        from dbt.compute.jdbc_utils import build_jdbc_config
+        from dbt.compute.filter_pushdown import optimize_jdbc_table_read
+
+        total_rows = 0
+
+        for source_meta in source_tables:
+            # Get adapter for this source's connection
+            adapter = self.adapters.get(source_meta.connection_name)
+            if not adapter:
+                raise DbtRuntimeError(
+                    f"No adapter found for connection '{source_meta.connection_name}'"
+                )
+
+            # Check if JDBC is supported for this adapter type
+            if not engine.supports_jdbc(adapter.type()):
+                raise DbtRuntimeError(
+                    f"JDBC not supported for adapter type '{adapter.type()}'. "
+                    f"Falling back to adapter-based loading not yet implemented. "
+                    f"Please use DuckDB engine for this source type."
+                )
+
+            # Log connection attempt
+            _log(f"[DVT] Connecting to {adapter.type()} source: {source_meta.qualified_name} (connection: {source_meta.connection_name})")
+            connection_start = time.time()
+
+            # Get adapter credentials
+            credentials = adapter.config.credentials
+
+            # Build JDBC configuration
+            try:
+                jdbc_url, jdbc_properties = build_jdbc_config(credentials)
+            except Exception as e:
+                _log(f"[DVT] ERROR: Failed to build JDBC config for '{source_meta.qualified_name}': {str(e)}")
+                raise DbtRuntimeError(
+                    f"Failed to build JDBC config for source '{source_meta.qualified_name}': {str(e)}"
+                ) from e
+
+            # Prepare JDBC read parameters with filter pushdown optimization
+            # Instead of reading full table, push down filters (LIMIT, WHERE) to source DB
+            jdbc_table = optimize_jdbc_table_read(
+                source_table=source_meta,
+                compiled_sql=compiled_sql,
+                source_tables=source_tables,
+                adapter_type=adapter.type()
+            )
+            table_alias = self._get_table_alias(source_meta)
+            numPartitions = 16  # Default parallelism
+
+            # Automatic partition detection DISABLED
+            # Reasons:
+            # 1. Slow metadata queries (30-60s on cold Snowflake warehouses)
+            # 2. Unnecessary overhead for small datasets
+            # 3. Filter pushdown now handles optimization automatically
+            partition_column = None
+            lower_bound = None
+            upper_bound = None
+
+            # v0.54.0: Look up cached metadata for type mapping
+            # Extract source_name and table_name from source_id
+            source_parts = source_meta.source_id.split(".")
+            if len(source_parts) >= 4:
+                source_name = source_parts[2]
+                table_name = source_parts[3]
+                cached_metadata = self.get_source_column_metadata(source_name, table_name)
+                if cached_metadata:
+                    _log(f"[DVT] Using cached metadata for {source_name}.{table_name} ({len(cached_metadata)} columns)")
+                else:
+                    _log(f"[DVT] No cached metadata for {source_name}.{table_name} - using JDBC type inference")
+            else:
+                cached_metadata = None
+
+            # Read via Spark JDBC and register as temp view
+            _log(f"[DVT] Reading from JDBC: {jdbc_table}")
+            try:
+                engine.register_jdbc_table(
+                    url=jdbc_url,
+                    table=jdbc_table,
+                    properties=jdbc_properties,
+                    table_alias=table_alias,
+                    numPartitions=numPartitions,
+                    partitionColumn=partition_column,
+                    lowerBound=lower_bound,
+                    upperBound=upper_bound,
+                )
+                connection_time = time.time() - connection_start
+                _log(f"[DVT] ✓ Connected to {source_meta.qualified_name} in {connection_time:.1f}s")
+                if connection_time > 30:
+                    _log(f"[DVT] WARNING: Connection took {connection_time:.1f}s (warehouse may have been suspended)")
+
+                # v0.54.0: Capture metadata if not already cached
+                if not cached_metadata and len(source_parts) >= 4:
+                    self.capture_source_metadata(
+                        engine=engine,
+                        source_name=source_name,
+                        table_name=table_name,
+                        adapter_name=adapter.type(),
+                        connection_name=source_meta.connection_name,
+                        schema_name=source_meta.schema,
+                        table_alias=table_alias
+                    )
+            except Exception as e:
+                connection_time = time.time() - connection_start
+                # DVT v0.5.2: Clean error message (no Java stack trace)
+                clean_error = _clean_spark_error(e)
+                _log(f"[DVT] ERROR: Failed to load '{source_meta.qualified_name}' after {connection_time:.1f}s: {clean_error}")
+                raise DbtRuntimeError(
+                    f"Failed to load source '{source_meta.qualified_name}' via JDBC: {clean_error}"
+                )
+
+            # Note: Can't easily count rows without triggering Spark action
+            # For now, return 0 (rows_read will be inaccurate for JDBC path)
+            # TODO: Consider running COUNT(*) query if row count is needed
+            total_rows += 0
+
+        return total_rows
+
+    def _get_table_alias(self, source_meta: SourceTableMetadata) -> str:
+        """
+        Generate a safe table alias for the compute engine.
+
+        Compute engines may not support dots or special characters in table names,
+        so we create a normalized alias.
+
+        :param source_meta: Source table metadata
+        :returns: Safe table alias
+        """
+        # Extract source name and table name from source_id
+        # source_id format: source.{project}.{source_name}.{table_name}
+        parts = source_meta.source_id.split(".")
+        if len(parts) >= 4:
+            source_name = parts[2]
+            table_name = parts[3]
+            return f"{source_name}_{table_name}"
+        else:
+            # Fallback: use identifier
+            return source_meta.identifier
+
+    def _rewrite_sql_for_compute(
+        self, sql: str, source_tables: List[SourceTableMetadata]
+    ) -> str:
+        """
+        Rewrite SQL to replace fully-qualified source table names with compute engine aliases.
+
+        Source tables are loaded into the compute engine with simple aliases (e.g., 'Exim_cbs_f_country'),
+        but the compiled SQL contains fully-qualified names (e.g., '"EXIM_EDWH_DEV"."ods"."cbs_f_country"').
+        This method replaces the qualified names with the aliases and removes source-specific clauses
+        like SAMPLE that have been pushed down to the source.
+
+        :param sql: Compiled SQL with fully-qualified table names
+        :param source_tables: List of source table metadata
+        :returns: Rewritten SQL with aliases and source-specific clauses removed
+        """
+        import re
+
+        rewritten_sql = sql
+
+        for source_meta in source_tables:
+            # Get the alias used in the compute engine
+            alias = self._get_table_alias(source_meta)
+
+            # Replace the fully-qualified table name with the alias
+            # Format: "database"."schema"."table" or database.schema.table
+            qualified_name = source_meta.qualified_name
+            parts = qualified_name.split(".")
+
+            # DVT v0.51.7: Use case-insensitive regex replacement for all variants
+            # because Snowflake returns uppercase but Spark/Databricks lowercases
+
+            # 1. Unquoted: EXIM_EDWH_DEV.ods.cbs_f_country (any case)
+            unquoted_pattern = re.compile(
+                r'\b' + r'\.'.join(re.escape(p) for p in parts) + r'\b',
+                re.IGNORECASE
+            )
+            rewritten_sql = unquoted_pattern.sub(alias, rewritten_sql)
+
+            # 2. Double-quoted (PostgreSQL style): "EXIM_EDWH_DEV"."ods"."cbs_f_country" (any case)
+            quoted_pattern = re.compile(
+                r'"' + r'"\."\s*'.join(re.escape(p) for p in parts) + r'"',
+                re.IGNORECASE
+            )
+            rewritten_sql = quoted_pattern.sub(alias, rewritten_sql)
+
+            # 3. Single string quoted: "EXIM_EDWH_DEV.ods.cbs_f_country" (any case)
+            single_quoted_pattern = re.compile(
+                r'"' + r'\.'.join(re.escape(p) for p in parts) + r'"',
+                re.IGNORECASE
+            )
+            rewritten_sql = single_quoted_pattern.sub(alias, rewritten_sql)
+
+            # 4. Backtick-quoted (Spark/Databricks style): `EXIM_EDWH_DEV`.`ods`.`cbs_f_country` (any case)
+            backtick_pattern = re.compile(
+                r'`' + r'`\.`\s*'.join(re.escape(p) for p in parts) + r'`',
+                re.IGNORECASE
+            )
+            rewritten_sql = backtick_pattern.sub(alias, rewritten_sql)
+
+        # DVT v0.4.5: Remove Snowflake-specific SAMPLE clauses
+        # These have been pushed down to the source via JDBC subqueries
+        # Spark SQL doesn't support SAMPLE syntax, so remove it from the query
+        # Pattern matches: SAMPLE (N), SAMPLE (N ROWS), SAMPLE SYSTEM|BERNOULLI|BLOCK (P)
+        # with optional REPEATABLE(seed) or SEED(seed)
+        rewritten_sql = re.sub(
+            r'\s*(?:TABLE)?SAMPLE\s+(?:SYSTEM|BERNOULLI|BLOCK)\s*\(\s*\d+(?:\.\d+)?\s*\)'
+            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
+            '',
+            rewritten_sql,
+            flags=re.IGNORECASE
+        )
+        rewritten_sql = re.sub(
+            r'\s*(?:TABLE)?SAMPLE\s*\(\s*\d+(?:\s+ROWS)?\s*\)'
+            r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
+            '',
+            rewritten_sql,
+            flags=re.IGNORECASE
+        )
+
+        return rewritten_sql
+
+    def materialize_result(
+        self,
+        result: FederatedExecutionResult,
+        target_adapter: BaseAdapter,
+        target_table: str,
+        mode: str = "create",
+        use_jdbc: bool = True,
+        spark_result_df: Optional[Any] = None,
+    ) -> Any:
+        """
+        Materialize federated query results to target database.
+
+        v0.3.0: Uses Spark JDBC for all materialization (default).
+
+        :param result: Federated execution result
+        :param target_adapter: Adapter to use for getting target credentials
+        :param target_table: Target table name (qualified)
+        :param mode: Write mode ('create', 'append', 'replace')
+        :param use_jdbc: If True, use JDBC write path (default in v0.3.0)
+        :param spark_result_df: Spark DataFrame with results (required for JDBC path)
+        :returns: AdapterResponse from write operation
+        """
+        if use_jdbc and spark_result_df is not None:
+            # Use JDBC write path (default in v0.3.0)
+            return self._materialize_spark_jdbc(
+                result_df=spark_result_df,
+                target_adapter=target_adapter,
+                target_table=target_table,
+                mode=mode,
+            )
+        else:
+            # Fallback: use target adapter directly (for adapters without JDBC support)
+            raise DbtRuntimeError(
+                "Non-JDBC materialization path removed in v0.3.0. "
+                "All materialization requires Spark JDBC. "
+                "Ensure spark_result_df is provided."
+            )
+
+    def _materialize_spark_jdbc(
+        self,
+        result_df: Any,  # Spark DataFrame
+        target_adapter: BaseAdapter,
+        target_table: str,
+        mode: str = "create",
+    ) -> Any:
+        """
+        Materialize Spark query results to target database via JDBC (Phase 1: v0.2.0).
+
+        This bypasses the DVT node's memory by writing data directly from Spark
+        workers to the target database.
+
+        :param result_df: Spark DataFrame with query results
+        :param target_adapter: Adapter to use for getting target credentials
+        :param target_table: Target table name (qualified)
+        :param mode: Write mode ('create', 'append', 'replace')
+        :returns: AdapterResponse
+        :raises DbtRuntimeError: If JDBC write fails
+        """
+        from dbt.compute.jdbc_utils import build_jdbc_config
+        from dbt.adapters.contracts.connection import AdapterResponse
+
+        # Get target credentials
+        target_credentials = target_adapter.config.credentials
+
+        # Build JDBC configuration for target
+        try:
+            jdbc_url, jdbc_properties = build_jdbc_config(target_credentials)
+        except Exception as e:
+            raise DbtRuntimeError(
+                f"Failed to build JDBC config for target '{target_table}': {str(e)}"
+            ) from e
+
+        # Map DVT mode to Spark JDBC mode
+        spark_mode_mapping = {
+            "create": "overwrite",  # Create/recreate table (dbt behavior)
+            "append": "append",  # Add to existing table
+            "replace": "overwrite",  # Drop and recreate
+        }
+        spark_mode = spark_mode_mapping.get(mode, "overwrite")
+
+        _log(f"[DVT] Writing to target via Spark JDBC: {target_table} (mode={spark_mode})")
+
+        # Get Spark session from DataFrame
+        spark = result_df.sparkSession
+
+        # Log DataFrame schema for debugging
+        _log(f"[DVT] DataFrame schema:")
+        for field in result_df.schema.fields:
+            _log(f" - {field.name}: {field.dataType}")
+
+        # Log row count
+        row_count = result_df.count()
+        _log(f"[DVT] DataFrame has {row_count} rows")
+
+        # Sanitize URL for logging (hide password)
+        safe_url = jdbc_url.split("?")[0] if "?" in jdbc_url else jdbc_url
+        _log(f"[DVT] JDBC URL: {safe_url}")
+        _log(f"[DVT] JDBC table: {target_table}")
+
+        # Write via JDBC
+        saved_views: List[Dict[str, str]] = []
+        target_adapter_type = target_adapter.type()
+        is_postgres = target_adapter_type in ("postgres", "postgresql")
+
+        try:
+            # DVT v0.5.5: Save dependent views before DROP CASCADE, restore after
+            # Spark's JDBC overwrite mode doesn't use CASCADE, causing failures
+            # when dependent objects (views, etc.) exist
+            # DVT v0.51.6: Only applies to PostgreSQL (other DBs handle this differently)
+            if spark_mode == "overwrite" and is_postgres:
+                try:
+                    with target_adapter.connection_named("__dvt_drop__"):
+                        conn = target_adapter.connections.get_thread_connection()
+                        cursor = conn.handle.cursor()
+
+                        # Parse schema.table from target_table
+                        parts = target_table.replace('"', '').split('.')
+                        if len(parts) >= 2:
+                            tbl_schema = parts[-2]
+                            tbl_name = parts[-1]
+                        else:
+                            tbl_schema = 'public'
+                            tbl_name = parts[-1]
+
+                        # DVT v0.5.5: Save dependent views before dropping
+                        saved_views = _get_dependent_views_pg(cursor, tbl_schema, tbl_name)
+                        if saved_views:
+                            _log(f"[DVT] Saving {len(saved_views)} dependent view(s) before DROP")
+
+                        # Use CASCADE to drop dependent objects
+                        drop_sql = f"DROP TABLE IF EXISTS {target_table} CASCADE"
+                        _log(f"[DVT] Pre-drop with CASCADE: {drop_sql}")
+                        cursor.execute(drop_sql)
+                        conn.handle.commit()
+                        cursor.close()
+                except Exception as drop_err:
+                    _log(f"[DVT] Pre-drop warning (continuing): {drop_err}")
+
+            result_df.write.format("jdbc").options(
+                url=jdbc_url, dbtable=target_table, batchsize="10000", **jdbc_properties
+            ).mode(spark_mode).save()
+
+            # DVT v0.5.5: Restore dependent views after successful write (PostgreSQL only)
+            if saved_views and is_postgres:
+                try:
+                    with target_adapter.connection_named("__dvt_restore__"):
+                        conn = target_adapter.connections.get_thread_connection()
+                        cursor = conn.handle.cursor()
+                        _recreate_views_pg(cursor, saved_views)
+                        conn.handle.commit()
+                        cursor.close()
+                        _log(f"[DVT] Restored {len(saved_views)} dependent view(s)")
+                except Exception as restore_err:
+                    _log(f"[DVT] Warning: Could not restore views: {restore_err}")
+
+            # Return mock AdapterResponse
+            # Note: Can't easily get rows_affected from Spark JDBC write
+            return AdapterResponse(
+                _message=f"SUCCESS - Table {target_table} materialized via JDBC",
+                rows_affected=row_count,
+            )
+
+        except Exception as e:
+            # DVT v0.5.2: Clean error message (no Java stack trace)
+            clean_error = _clean_spark_error(e)
+            raise DbtRuntimeError(
+                f"Failed to materialize results to '{target_table}': {clean_error}"
+            )
+
+    def explain_execution(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Generate an execution plan explanation for a federated query.
+
+        Useful for debugging and optimization.
+
+        :param node: The node to explain
+        :param analysis_result: Query analysis result
+        :returns: Human-readable execution plan
+        """
+        source_tables = self._extract_source_tables(analysis_result)
+
+        plan_parts = [
+            "=== DVT Federated Execution Plan ===",
+            f"Node: {node.unique_id}",
+            f"Compute Engine: {self.default_compute_engine}",
+            "",
+            "Data Sources:",
+        ]
+
+        for i, source_meta in enumerate(source_tables, 1):
+            plan_parts.append(
+                f" {i}. {source_meta.qualified_name} "
+                f"(connection: {source_meta.connection_name})"
+            )
+
+        plan_parts.extend(
+            [
+                "",
+                "Execution Steps (v0.3.0 - Spark-Unified):",
+                " 1. Extract data from each source via Spark JDBC (parallel reads)",
+                f" 2. Load {len(source_tables)} table(s) into Spark ({self.default_compute_engine})",
+                " 3. Execute query in Spark",
+                " 4. Materialize to target via Spark JDBC",
+                "",
+                f"Strategy: {analysis_result.strategy.upper()}",
+                f"Reason: {analysis_result.reason}",
+            ]
+        )
+
+        return "\n".join(plan_parts)
+
+
+class SourceRewriter:
+    """
+    Rewrites SQL queries to use compute engine table aliases.
+
+    When sources are loaded into compute engines, they may be registered with
+    different names (aliases). This class rewrites the SQL to use those aliases.
+    """
+
+    @staticmethod
+    def rewrite_sources(sql: str, source_mapping: Dict[str, str]) -> str:
+        """
+        Rewrite SQL to use compute engine table aliases.
+
+        :param sql: Original SQL with qualified source names
+        :param source_mapping: Dict of qualified_name → alias
+        :returns: Rewritten SQL
+        """
+        rewritten = sql
+
+        # Replace each qualified name with its alias
+        for qualified_name, alias in source_mapping.items():
+            # Match qualified name (database.schema.table)
+            pattern = re.compile(rf"\b{re.escape(qualified_name)}\b", re.IGNORECASE)
+            rewritten = pattern.sub(alias, rewritten)
+
+        return rewritten