dvt-core 0.59.0a51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/compute/federated_executor.py
@@ -0,0 +1,1080 @@
1
+ """
2
+ Federated Query Executor
3
+
4
+ Orchestrates multi-source query execution using Spark compute engine.
5
+ This is the core component that enables DVT's data virtualization capabilities.
6
+
7
+ v0.3.0: Unified Spark architecture - all federation uses Spark JDBC.
8
+ v0.58.5: Fixed segfaults by disabling multiprocessing resource tracker.
9
+
10
+ Execution flow:
11
+ 1. Identify all source tables/models from compiled SQL
12
+ 2. Load sources into Spark via JDBC (parallel reads)
13
+ 3. Execute model SQL in Spark
14
+ 4. Return results as PyArrow Table
15
+ 5. Materialize to target via JDBC or adapter
16
+
17
+ Key principle: Adapters for I/O only, Spark for all compute.
18
+ """
19
+
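The flow described in the docstring above maps onto three ordinary PySpark steps: a JDBC read per source, one SQL transform, one JDBC write. A minimal sketch under assumed connection details (URLs, credentials, and table names are placeholders, not part of the package):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dvt-federation-sketch").getOrCreate()

# 1-2. Load a source table into Spark via JDBC and register it as a temp view
orders = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://source-db:5432/analytics")   # placeholder
    .option("dbtable", "public.orders")
    .option("user", "dvt")
    .option("password", "***")
    .load()
)
orders.createOrReplaceTempView("shop_orders")

# 3-4. Execute the model SQL in Spark
result = spark.sql("SELECT status, count(*) AS n FROM shop_orders GROUP BY status")

# 5. Materialize the result to the target via JDBC
(
    result.write.format("jdbc")
    .option("url", "jdbc:postgresql://target-db:5432/warehouse")   # placeholder
    .option("dbtable", "analytics.order_counts")
    .option("user", "dvt")
    .option("password", "***")
    .mode("overwrite")
    .save()
)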
20
+ # Standard imports
21
+ import os
22
+ import re
23
+ import sys
24
+ import time
25
+ from typing import Any, Dict, List, Optional, Set, Tuple
26
+ from dataclasses import dataclass
27
+
28
+ from pathlib import Path
29
+
30
+ from datetime import datetime
31
+
32
+ from dbt.adapters.base import BaseAdapter
33
+ from dbt.compute.engines.spark_engine import SparkEngine, _clean_spark_error
34
+ from dbt.contracts.graph.manifest import Manifest
35
+ from dbt.contracts.graph.nodes import ManifestNode
36
+ from dbt.query_analyzer import QueryAnalysisResult
37
+ from dbt_common.exceptions import DbtRuntimeError
38
+
39
+
40
+ def _log(msg: str) -> None:
41
+ """
42
+ Log a debug message (no-op since DVT v0.4.7).
43
+ Output is suppressed for clean console output; debug details go to the spark_run_history file.
44
+ """
45
+ # Suppressed for clean output - all debug info goes to spark_run_history file
46
+ pass
47
+
48
+
49
+ def _get_dependent_views_pg(cursor, schema: str, table: str) -> List[Dict[str, str]]:
50
+ """
51
+ Query PostgreSQL for views that depend on a table.
52
+ DVT v0.5.5: Used to save views before DROP CASCADE, then restore after.
53
+
54
+ Returns list of dicts with: schema, name, definition
55
+ """
56
+ try:
57
+ # Query views that depend on this table using pg_depend
58
+ sql = """
59
+ SELECT DISTINCT
60
+ n.nspname as view_schema,
61
+ c.relname as view_name,
62
+ pg_get_viewdef(c.oid, true) as view_definition
63
+ FROM pg_depend d
64
+ JOIN pg_rewrite r ON r.oid = d.objid
65
+ JOIN pg_class c ON c.oid = r.ev_class
66
+ JOIN pg_namespace n ON n.oid = c.relnamespace
67
+ JOIN pg_class t ON t.oid = d.refobjid
68
+ JOIN pg_namespace tn ON tn.oid = t.relnamespace
69
+ WHERE t.relname = %s
70
+ AND tn.nspname = %s
71
+ AND c.relkind = 'v'
72
+ AND d.classid = 'pg_rewrite'::regclass
73
+ AND d.deptype = 'n'
74
+ """
75
+ cursor.execute(sql, (table, schema))
76
+ rows = cursor.fetchall()
77
+ return [
78
+ {'schema': row[0], 'name': row[1], 'definition': row[2]}
79
+ for row in rows
80
+ ]
81
+ except Exception:
82
+ # If query fails (different DB, permissions), return empty
83
+ return []
84
+
85
+
86
+ def _recreate_views_pg(cursor, views: List[Dict[str, str]]) -> None:
87
+ """
88
+ Recreate views from their saved definitions.
89
+ DVT v0.5.5: Restores views after DROP CASCADE.
90
+ """
91
+ for view in views:
92
+ try:
93
+ create_sql = f'CREATE OR REPLACE VIEW "{view["schema"]}"."{view["name"]}" AS {view["definition"]}'
94
+ _log(f"[DVT] Recreating view: {view['schema']}.{view['name']}")
95
+ cursor.execute(create_sql)
96
+ except Exception as e:
97
+ _log(f"[DVT] Warning: Could not recreate view {view['name']}: {e}")
98
+
99
+
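A short sketch of how the two helpers above are meant to be used around a DROP ... CASCADE, with a psycopg2 connection and table names that are assumptions for illustration:

import psycopg2  # any DB-API driver with a standard cursor works the same way

conn = psycopg2.connect("dbname=warehouse user=dvt")  # placeholder DSN
cur = conn.cursor()

# 1. Remember views that reference the table about to be dropped
views = _get_dependent_views_pg(cur, "analytics", "order_counts")

# 2. Drop the table together with its dependents
cur.execute("DROP TABLE IF EXISTS analytics.order_counts CASCADE")

# ... the table is rebuilt here (e.g. by the Spark JDBC overwrite) ...

# 3. Restore the saved views from their stored definitions
_recreate_views_pg(cur, views)
conn.commit()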
100
+ @dataclass
101
+ class SourceTableMetadata:
102
+ """Metadata about a source table needed for federated execution."""
103
+
104
+ source_id: str # Unique ID from manifest
105
+ connection_name: str # Which connection to read from
106
+ database: str # Database name
107
+ schema: str # Schema name
108
+ identifier: str # Table name
109
+ qualified_name: str # Fully qualified name for SQL
110
+
111
+
112
+ @dataclass
113
+ class FederatedExecutionResult:
114
+ """Result of federated query execution."""
115
+
116
+ spark_dataframe: Any # Spark DataFrame with query results
117
+ source_tables: List[SourceTableMetadata] # Sources used
118
+ compute_engine: str # Engine used (spark)
119
+ execution_time_ms: float # Execution time in milliseconds
120
+ rows_read: int # Total rows read from sources
121
+ rows_returned: Optional[int] # Rows in result (None if not counted)
122
+ engine: Any # SparkEngine instance (for session lifecycle management)
123
+
124
+
125
+ class FederatedExecutor:
126
+ """
127
+ Orchestrates federated query execution across multiple data sources.
128
+
129
+ This executor:
130
+ 1. Extracts data from multiple sources via adapters
131
+ 2. Loads data into a compute engine
132
+ 3. Executes the query
133
+ 4. Returns results as Spark DataFrame
134
+ """
135
+
136
+ def __init__(
137
+ self,
138
+ manifest: Manifest,
139
+ adapters: Dict[str, BaseAdapter],
140
+ default_compute_engine: str = "spark-local",
141
+ project_root: Optional[Path] = None,
142
+ ):
143
+ """
144
+ Initialize federated executor.
145
+
146
+ v0.3.0: All federation uses Spark (local or cluster).
147
+ v0.54.0: Added metadata store integration for type mapping.
148
+
149
+ :param manifest: The dbt manifest with all nodes and sources
150
+ :param adapters: Dict of connection_name → adapter instances
151
+ :param default_compute_engine: Default compute engine ("spark-local" or "spark-cluster")
152
+ :param project_root: Project root directory (for metadata store access)
153
+ """
154
+ self.manifest = manifest
155
+ self.adapters = adapters
156
+ self.default_compute_engine = default_compute_engine
157
+ self.project_root = project_root or Path(".")
158
+ self._metadata_store = None
159
+
160
+ @property
161
+ def metadata_store(self):
162
+ """
163
+ Lazy-load the project metadata store.
164
+
165
+ v0.54.0: Returns None if store doesn't exist (graceful degradation).
166
+ """
167
+ if self._metadata_store is None:
168
+ try:
169
+ from dbt.compute.metadata import ProjectMetadataStore
170
+ store_path = self.project_root / ".dvt" / "metadata.duckdb"
171
+ if store_path.exists():
172
+ self._metadata_store = ProjectMetadataStore(self.project_root)
173
+ _log("[DVT] Metadata store loaded from .dvt/metadata.duckdb")
174
+ except ImportError:
175
+ _log("[DVT] DuckDB not available - metadata store disabled")
176
+ except Exception as e:
177
+ _log(f"[DVT] Could not load metadata store: {e}")
178
+ return self._metadata_store
179
+
180
+ def get_source_column_metadata(
181
+ self,
182
+ source_name: str,
183
+ table_name: str
184
+ ) -> Optional[List[Dict[str, Any]]]:
185
+ """
186
+ Look up column metadata for a source table from the metadata store.
187
+
188
+ v0.54.0: Returns cached metadata if available, None otherwise.
189
+
190
+ :param source_name: Name of the source
191
+ :param table_name: Name of the table
192
+ :returns: List of column metadata dicts, or None if not cached
193
+ """
194
+ if self.metadata_store is None:
195
+ return None
196
+
197
+ try:
198
+ metadata = self.metadata_store.get_table_metadata(source_name, table_name)
199
+ if metadata:
200
+ return [
201
+ {
202
+ "column_name": col.column_name,
203
+ "adapter_type": col.adapter_type,
204
+ "spark_type": col.spark_type,
205
+ "is_nullable": col.is_nullable,
206
+ "ordinal_position": col.ordinal_position,
207
+ }
208
+ for col in metadata.columns
209
+ ]
210
+ except Exception as e:
211
+ _log(f"[DVT] Warning: Could not fetch metadata for {source_name}.{table_name}: {e}")
212
+
213
+ return None
214
+
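For reference, the list returned above mirrors the keys built in the comprehension; a hypothetical two-column table would come back roughly as:

cached = [
    {"column_name": "id", "adapter_type": "INTEGER", "spark_type": "IntegerType()",
     "is_nullable": False, "ordinal_position": 1},
    {"column_name": "status", "adapter_type": "VARCHAR", "spark_type": "StringType()",
     "is_nullable": True, "ordinal_position": 2},
]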
215
+ def get_spark_schema_for_source(
216
+ self,
217
+ source_name: str,
218
+ table_name: str
219
+ ) -> Optional[str]:
220
+ """
221
+ Generate Spark schema DDL for a source table from cached metadata.
222
+
223
+ v0.54.0: Returns schema string for explicit type enforcement.
224
+
225
+ :param source_name: Name of the source
226
+ :param table_name: Name of the table
227
+ :returns: Spark schema DDL string, or None if not cached
228
+ """
229
+ columns = self.get_source_column_metadata(source_name, table_name)
230
+ if not columns:
231
+ return None
232
+
233
+ # Build Spark schema DDL
234
+ # Format: "`col1` <spark_type>[ NOT NULL], `col2` <spark_type>, ..."
235
+ schema_parts = []
236
+ for col in sorted(columns, key=lambda c: c["ordinal_position"]):
237
+ spark_type = col["spark_type"]
238
+ nullable = "" if col["is_nullable"] else " NOT NULL"
239
+ schema_parts.append(f"`{col['column_name']}` {spark_type}{nullable}")
240
+
241
+ return ", ".join(schema_parts)
242
+
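With metadata like the hypothetical example above cached, the generated DDL-style string looks as follows (assuming a FederatedExecutor instance named executor; column names are illustrative and the type strings are whatever capture_source_metadata stored):

ddl = executor.get_spark_schema_for_source("shop", "orders")
print(ddl)
# `id` IntegerType() NOT NULL, `status` StringType()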
243
+ def capture_source_metadata(
244
+ self,
245
+ engine: SparkEngine,
246
+ source_name: str,
247
+ table_name: str,
248
+ adapter_name: str,
249
+ connection_name: str,
250
+ schema_name: str,
251
+ table_alias: str
252
+ ) -> None:
253
+ """
254
+ Capture metadata from a loaded source table and save to metadata store.
255
+
256
+ v0.54.0: Metadata propagation during federated execution.
257
+
258
+ :param engine: SparkEngine instance with loaded table
259
+ :param source_name: Name of the source
260
+ :param table_name: Name of the table
261
+ :param adapter_name: Type of adapter (postgres, snowflake, etc.)
262
+ :param connection_name: Connection profile name
263
+ :param schema_name: Database schema name
264
+ :param table_alias: Alias used in Spark for the temp view
265
+ """
266
+ if self.metadata_store is None:
267
+ return
268
+
269
+ try:
270
+ # Import here to avoid circular imports
271
+ from dbt.compute.metadata.store import TableMetadata, ColumnMetadata
272
+ from dbt.compute.metadata.registry import TypeRegistry
273
+
274
+ # Get schema from Spark temp view
275
+ spark_schema = engine.get_schema(table_alias)
276
+ if not spark_schema:
277
+ _log(f"[DVT] Could not get schema for {table_alias}")
278
+ return
279
+
280
+ # Build column metadata from Spark schema
281
+ columns = []
282
+ for idx, field in enumerate(spark_schema):
283
+ # field is a StructField with name, dataType, nullable
284
+ spark_type_str = str(field.dataType)
285
+
286
+ # Try to map Spark type back to adapter type
287
+ # Look up in type registry (reverse mapping)
288
+ adapter_type = self._spark_to_adapter_type(
289
+ adapter_name, spark_type_str
290
+ )
291
+
292
+ columns.append(ColumnMetadata(
293
+ column_name=field.name,
294
+ adapter_type=adapter_type,
295
+ spark_type=spark_type_str,
296
+ is_nullable=field.nullable,
297
+ is_primary_key=False, # Can't infer from JDBC
298
+ ordinal_position=idx + 1,
299
+ ))
300
+
301
+ if columns:
302
+ # Create and save table metadata
303
+ metadata = TableMetadata(
304
+ source_name=source_name,
305
+ table_name=table_name,
306
+ adapter_name=adapter_name,
307
+ connection_name=connection_name,
308
+ schema_name=schema_name,
309
+ row_count=None, # Don't query count to avoid performance hit
310
+ columns=columns,
311
+ last_refreshed=datetime.now(),
312
+ )
313
+
314
+ with self.metadata_store as store:
315
+ store.save_table_metadata(metadata)
316
+ _log(f"[DVT] Captured metadata for {source_name}.{table_name}: {len(columns)} columns")
317
+
318
+ except Exception as e:
319
+ # Don't fail execution if metadata capture fails
320
+ _log(f"[DVT] Warning: Could not capture metadata for {source_name}.{table_name}: {e}")
321
+
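The capture step above relies only on standard StructField attributes; a standalone sketch of that traversal, using a throwaway DataFrame instead of a JDBC-backed temp view:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "open")], ["id", "status"]).createOrReplaceTempView("shop_orders")

for idx, field in enumerate(spark.table("shop_orders").schema):
    # Same attributes capture_source_metadata reads: name, dataType, nullable
    print(idx + 1, field.name, str(field.dataType), field.nullable)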
322
+ def _spark_to_adapter_type(
323
+ self,
324
+ adapter_name: str,
325
+ spark_type: str
326
+ ) -> str:
327
+ """
328
+ Map Spark type back to approximate adapter type.
329
+
330
+ This is a best-effort reverse mapping - exact original type
331
+ may not be recoverable due to type normalization during JDBC read.
332
+
333
+ :param adapter_name: Target adapter name
334
+ :param spark_type: Spark type string (e.g., "StringType()")
335
+ :returns: Approximate adapter type string
336
+ """
337
+ from dbt.compute.metadata.registry import TypeRegistry
338
+
339
+ # Normalize spark type (remove parentheses, etc.)
340
+ spark_type_normalized = spark_type.replace("()", "").replace("Type", "").upper()
341
+
342
+ # Common mappings (reverse of type_registry)
343
+ spark_to_common = {
344
+ "STRING": "VARCHAR",
345
+ "INTEGER": "INTEGER",
346
+ "INT": "INTEGER",
347
+ "LONG": "BIGINT",
348
+ "BIGINT": "BIGINT",
349
+ "SHORT": "SMALLINT",
350
+ "DOUBLE": "DOUBLE PRECISION",
351
+ "FLOAT": "REAL",
352
+ "DECIMAL": "DECIMAL",
353
+ "BOOLEAN": "BOOLEAN",
354
+ "DATE": "DATE",
355
+ "TIMESTAMP": "TIMESTAMP",
356
+ "BINARY": "BYTEA",
357
+ "ARRAY": "ARRAY",
358
+ "MAP": "JSON",
359
+ "STRUCT": "JSON",
360
+ }
361
+
362
+ # Return common SQL type
363
+ return spark_to_common.get(spark_type_normalized, spark_type)
364
+
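To make the normalization concrete: "StringType()" is stripped of "()" and "Type", upper-cased to "STRING", then looked up in the table above; unmapped types fall through unchanged. Assuming an executor instance:

assert executor._spark_to_adapter_type("postgres", "StringType()") == "VARCHAR"
assert executor._spark_to_adapter_type("postgres", "LongType()") == "BIGINT"
assert executor._spark_to_adapter_type("postgres", "DecimalType(18,2)") == "DecimalType(18,2)"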
365
+ def execute(
366
+ self,
367
+ node: ManifestNode,
368
+ analysis_result: QueryAnalysisResult,
369
+ compute_engine_override: Optional[str] = None,
370
+ spark_config: Optional[Dict[str, str]] = None,
371
+ target_adapter_type: Optional[str] = None,
372
+ coerce_view_to_table: bool = False,
373
+ ) -> FederatedExecutionResult:
374
+ """
375
+ Execute a node using federated query processing.
376
+
377
+ :param node: The compiled node to execute
378
+ :param analysis_result: Query analysis result
379
+ :param compute_engine_override: Override compute engine choice
380
+ :param spark_config: Spark configuration (if using Spark)
381
+ :param target_adapter_type: Target adapter type for JDBC materialization
382
+ :param coerce_view_to_table: DVT v0.51.6 - If True, treat view as table (Rule 3.C.3)
383
+ :returns: FederatedExecutionResult with query results
384
+ :raises DbtRuntimeError: If execution fails
385
+ """
386
+ import time
387
+
388
+ _log(f"[DVT] Starting federated execution for node: {node.unique_id}")
389
+ start_time = time.time()
390
+
391
+ # Determine compute engine
392
+ compute_engine = (
393
+ compute_engine_override
394
+ or analysis_result.user_override
395
+ or self.default_compute_engine
396
+ )
397
+ _log(f"[DVT] Compute engine selected: {compute_engine}")
398
+
399
+ # DVT v0.5.0: Restrict Spark compute to table and incremental materializations only
400
+ # DVT v0.51.6: Allow view if coerce_view_to_table is True (Rule 3.C.3)
401
+ if hasattr(node, 'config') and hasattr(node.config, 'materialized'):
402
+ materialized = node.config.materialized
403
+
404
+ # DVT v0.51.6: Views are coerced to tables in cross-target scenarios
405
+ effective_materialized = 'table' if (materialized == 'view' and coerce_view_to_table) else materialized
406
+
407
+ # Only allow table and incremental
408
+ if effective_materialized not in ('table', 'incremental'):
409
+ raise DbtRuntimeError(
410
+ f"Spark compute engine only supports 'table' and 'incremental' materializations. "
411
+ f"Node '{node.unique_id}' uses '{materialized}'. "
412
+ f"Please change the materialization to 'table' or 'incremental', or use adapter-native execution."
413
+ )
414
+
415
+ # For incremental, validate strategy is 'append' (only supported strategy)
416
+ if materialized == 'incremental':
417
+ incremental_strategy = getattr(node.config, 'incremental_strategy', 'append')
418
+ if incremental_strategy != 'append':
419
+ raise DbtRuntimeError(
420
+ f"Spark compute engine only supports 'append' incremental strategy. "
421
+ f"Node '{node.unique_id}' uses '{incremental_strategy}'. "
422
+ f"Supported strategies: append. "
423
+ f"For merge/delete+insert/insert_overwrite, use adapter-native execution."
424
+ )
425
+
426
+ if coerce_view_to_table and materialized == 'view':
427
+ _log(f"[DVT] Materialization: view → table (coerced for cross-target)")
428
+ else:
429
+ _log(f"[DVT] Materialization validated: {materialized}")
430
+
431
+ # Extract source table metadata
432
+ source_tables = self._extract_source_tables(analysis_result)
433
+ _log(f"[DVT] Found {len(source_tables)} source table(s)")
434
+
435
+ # v0.5.99: Look up named clusters from registry
436
+ from dbt.config.compute import ComputeRegistry
437
+ from dbt.compute.jdbc_utils import set_docker_mode
438
+ registry = ComputeRegistry()
439
+ cluster_config = None
440
+
441
+ # Check if it's a registered named cluster
442
+ if compute_engine not in ("spark-local", "spark", "spark-cluster"):
443
+ cluster = registry.get(compute_engine)
444
+ if cluster:
445
+ cluster_config = cluster.config
446
+ _log(f"[DVT] Found registered cluster '{compute_engine}' with platform: {cluster.detect_platform().value}")
447
+
448
+ # DVT v0.51.8: Enable Docker mode for standalone clusters with localhost master
449
+ # This rewrites localhost -> host.docker.internal in JDBC URLs
450
+ master = cluster_config.get("master", "")
451
+ if master.startswith("spark://") and ("localhost" in master or "127.0.0.1" in master):
452
+ set_docker_mode(True)
453
+ _log("[DVT] Docker mode enabled for JDBC URLs")
454
+ else:
455
+ set_docker_mode(False)
456
+ else:
457
+ # Not in registry - check if it starts with "spark" for backwards compat
458
+ if not compute_engine.startswith("spark"):
459
+ raise DbtRuntimeError(
460
+ f"Invalid compute engine '{compute_engine}'. "
461
+ f"Not found in compute registry. "
462
+ f"Available: {[c.name for c in registry.list()]}"
463
+ )
464
+ else:
465
+ set_docker_mode(False)
466
+
467
+ # Create Spark engine (local or cluster based on config)
468
+ _log(f"[DVT] Creating Spark engine (mode: {compute_engine})")
469
+ if compute_engine == "spark-local" or compute_engine == "spark":
470
+ engine = SparkEngine(mode="embedded", spark_config=spark_config or {})
471
+ elif compute_engine == "spark-cluster" or compute_engine.startswith("spark:"):
472
+ # External cluster
473
+ engine = SparkEngine(mode="external", spark_config=spark_config or {})
474
+ elif cluster_config:
475
+ # Named cluster from registry - pass full config
476
+ engine = SparkEngine(mode="external", spark_config=cluster_config)
477
+ else:
478
+ # Fallback
479
+ engine = SparkEngine(mode="external", spark_config=spark_config or {})
480
+
481
+ _log("[DVT] Spark engine created, initializing Spark session...")
482
+ try:
483
+ # v0.5.99: Collect adapter types from sources + target for JDBC driver provisioning
484
+ all_adapter_types = set()
485
+ for source_table in source_tables:
486
+ adapter = self.adapters.get(source_table.connection_name)
487
+ if adapter:
488
+ all_adapter_types.add(adapter.type())
489
+ # Include target adapter type for materialization
490
+ if target_adapter_type:
491
+ all_adapter_types.add(target_adapter_type)
492
+ _log(f"[DVT] Adapter types (sources + target): {all_adapter_types}")
493
+
494
+ # Initialize Spark session with all adapter types (for JDBC drivers)
495
+ engine.connect(adapter_types=all_adapter_types)
496
+ _log("[DVT] Spark session initialized successfully")
497
+
498
+ # Get compiled SQL first (needed for optimization checks)
499
+ compiled_sql = (
500
+ node.compiled_code
501
+ if hasattr(node, "compiled_code")
502
+ else node.raw_code
503
+ )
504
+
505
+ # Step 1: Load source data into Spark via JDBC (v0.3.0: Spark-only)
506
+ total_rows_read = self._load_sources_spark_jdbc(
507
+ engine, source_tables, analysis_result, compiled_sql
508
+ )
509
+
510
+ # Step 2: Rewrite SQL to use table aliases
511
+ rewritten_sql = self._rewrite_sql_for_compute(
512
+ compiled_sql, source_tables
513
+ )
514
+
515
+ # Step 3: Execute query in Spark
516
+ result_df = engine.spark.sql(rewritten_sql)
517
+
518
+ # Calculate execution time
519
+ execution_time_ms = (time.time() - start_time) * 1000
520
+
521
+ # Return Spark DataFrame AND engine (caller must close engine after materialization)
522
+ return FederatedExecutionResult(
523
+ spark_dataframe=result_df,
524
+ source_tables=source_tables,
525
+ compute_engine=compute_engine,
526
+ execution_time_ms=execution_time_ms,
527
+ rows_read=total_rows_read,
528
+ rows_returned=None, # Will be counted during JDBC write
529
+ engine=engine, # Return engine for lifecycle management
530
+ )
531
+
532
+ except Exception as e:
533
+ # Clean up engine on error
534
+ try:
535
+ engine.close()
536
+ except:
537
+ pass
538
+ # DVT v0.5.2: Clean error message (no Java stack trace)
539
+ clean_error = _clean_spark_error(e)
540
+ # DVT v0.5.99: Include original exception for debugging if cleaned message is too short
541
+ if len(clean_error) < 20:
542
+ clean_error = f"{clean_error} (original: {str(e)[:200]})"
543
+ raise DbtRuntimeError(
544
+ f"Federated execution failed for node {node.unique_id}: {clean_error}"
545
+ )
546
+
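Taken together with materialize_result below, a caller drives the two halves and is responsible for closing the returned engine. A hedged sketch; the node, analysis result, and adapters come from the surrounding dbt run, and the target table name is a placeholder:

result = executor.execute(
    node=node,
    analysis_result=analysis,
    target_adapter_type=target_adapter.type(),
)
try:
    executor.materialize_result(
        result,
        target_adapter=target_adapter,
        target_table='"warehouse"."analytics"."order_counts"',  # placeholder
        mode="create",
        spark_result_df=result.spark_dataframe,
    )
finally:
    result.engine.close()  # engine lifecycle is the caller's responsibility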
547
+ def _extract_source_tables(
548
+ self, analysis_result: QueryAnalysisResult
549
+ ) -> List[SourceTableMetadata]:
550
+ """
551
+ Extract metadata for all source tables referenced in the query.
552
+
553
+ :param analysis_result: Query analysis result
554
+ :returns: List of SourceTableMetadata
555
+ """
556
+ source_tables = []
557
+
558
+ for source_id in analysis_result.source_refs:
559
+ source = self.manifest.sources.get(source_id)
560
+ if not source:
561
+ raise DbtRuntimeError(
562
+ f"Source {source_id} not found in manifest. "
563
+ f"Available sources: {list(self.manifest.sources.keys())[:3]}"
564
+ )
565
+
566
+ # Get connection name from source definition
567
+ connection_name = getattr(source, "connection", None)
568
+
569
+ if not connection_name:
570
+ raise DbtRuntimeError(
571
+ f"Source {source_id} does not have a connection specified. "
572
+ "DVT requires all sources to specify a connection in the source YAML:\n"
573
+ " - name: my_source\n"
574
+ " connection: my_connection"
575
+ )
576
+
577
+ # Build qualified name for SQL
578
+ qualified_name = f"{source.database}.{source.schema}.{source.identifier}"
579
+
580
+ metadata = SourceTableMetadata(
581
+ source_id=source_id,
582
+ connection_name=connection_name,
583
+ database=source.database,
584
+ schema=source.schema,
585
+ identifier=source.identifier,
586
+ qualified_name=qualified_name,
587
+ )
588
+
589
+ source_tables.append(metadata)
590
+
591
+ return source_tables
592
+
593
+ # NOTE: _load_sources_via_adapters method removed in v0.3.0
594
+ # All data loading now uses Spark JDBC via _load_sources_spark_jdbc
595
+
596
+ def _load_sources_spark_jdbc(
597
+ self,
598
+ engine: SparkEngine,
599
+ source_tables: List[SourceTableMetadata],
600
+ analysis_result: QueryAnalysisResult,
601
+ compiled_sql: str,
602
+ ) -> int:
603
+ """
604
+ Load all source tables into Spark via JDBC connectors (Phase 1: v0.2.0).
605
+
606
+ This bypasses the DVT node's memory by reading data directly from source
607
+ databases into Spark workers (distributed memory). Data flow:
608
+ Source DB → Spark Workers → Target DB (no DVT node bottleneck)
609
+
610
+ This method:
611
+ 1. Gets adapter credentials for each source
612
+ 2. Converts credentials to JDBC config
613
+ 3. Auto-detects partition column for parallel reads
614
+ 4. Reads data via Spark JDBC with partitioning
615
+ 5. Registers as temp view in Spark
616
+
617
+ :param engine: Spark engine instance
618
+ :param source_tables: List of source table metadata
619
+ :param analysis_result: Query analysis result
620
+ :returns: Total number of rows loaded (estimated, as Spark is lazy)
621
+ :raises DbtRuntimeError: If JDBC not supported or read fails
622
+ """
623
+ from dbt.compute.jdbc_utils import build_jdbc_config
624
+ from dbt.compute.filter_pushdown import optimize_jdbc_table_read
625
+
626
+ total_rows = 0
627
+
628
+ for source_meta in source_tables:
629
+ # Get adapter for this source's connection
630
+ adapter = self.adapters.get(source_meta.connection_name)
631
+ if not adapter:
632
+ raise DbtRuntimeError(
633
+ f"No adapter found for connection '{source_meta.connection_name}'"
634
+ )
635
+
636
+ # Check if JDBC is supported for this adapter type
637
+ if not engine.supports_jdbc(adapter.type()):
638
+ raise DbtRuntimeError(
639
+ f"JDBC not supported for adapter type '{adapter.type()}'. "
640
+ f"This source type requires a JDBC driver. "
641
+ f"Please ensure the appropriate JDBC driver is available."
642
+ )
643
+
644
+ # Log connection attempt
645
+ _log(f"[DVT] Connecting to {adapter.type()} source: {source_meta.qualified_name} (connection: {source_meta.connection_name})")
646
+ connection_start = time.time()
647
+
648
+ # Get adapter credentials
649
+ credentials = adapter.config.credentials
650
+
651
+ # Build JDBC configuration
652
+ try:
653
+ jdbc_url, jdbc_properties = build_jdbc_config(credentials)
654
+ except Exception as e:
655
+ _log(f"[DVT] ERROR: Failed to build JDBC config for '{source_meta.qualified_name}': {str(e)}")
656
+ raise DbtRuntimeError(
657
+ f"Failed to build JDBC config for source '{source_meta.qualified_name}': {str(e)}"
658
+ ) from e
659
+
660
+ # Prepare JDBC read parameters with filter pushdown optimization
661
+ # Instead of reading full table, push down filters (LIMIT, WHERE) to source DB
662
+ jdbc_table = optimize_jdbc_table_read(
663
+ source_table=source_meta,
664
+ compiled_sql=compiled_sql,
665
+ source_tables=source_tables,
666
+ adapter_type=adapter.type()
667
+ )
668
+ table_alias = self._get_table_alias(source_meta)
669
+ numPartitions = 16 # Requested parallelism (only effective when a partition column is provided)
670
+
671
+ # Automatic partition detection DISABLED
672
+ # Reasons:
673
+ # 1. Slow metadata queries (30-60s on cold Snowflake warehouses)
674
+ # 2. Unnecessary overhead for small datasets
675
+ # 3. Filter pushdown now handles optimization automatically
676
+ partition_column = None
677
+ lower_bound = None
678
+ upper_bound = None
679
+
680
+ # v0.54.0: Look up cached metadata for type mapping
681
+ # Extract source_name and table_name from source_id
682
+ source_parts = source_meta.source_id.split(".")
683
+ if len(source_parts) >= 4:
684
+ source_name = source_parts[2]
685
+ table_name = source_parts[3]
686
+ cached_metadata = self.get_source_column_metadata(source_name, table_name)
687
+ if cached_metadata:
688
+ _log(f"[DVT] Using cached metadata for {source_name}.{table_name} ({len(cached_metadata)} columns)")
689
+ else:
690
+ _log(f"[DVT] No cached metadata for {source_name}.{table_name} - using JDBC type inference")
691
+ else:
692
+ cached_metadata = None
693
+
694
+ # Read via Spark JDBC and register as temp view
695
+ _log(f"[DVT] Reading from JDBC: {jdbc_table}")
696
+ try:
697
+ engine.register_jdbc_table(
698
+ url=jdbc_url,
699
+ table=jdbc_table,
700
+ properties=jdbc_properties,
701
+ table_alias=table_alias,
702
+ numPartitions=numPartitions,
703
+ partitionColumn=partition_column,
704
+ lowerBound=lower_bound,
705
+ upperBound=upper_bound,
706
+ )
707
+ connection_time = time.time() - connection_start
708
+ _log(f"[DVT] ✓ Connected to {source_meta.qualified_name} in {connection_time:.1f}s")
709
+ if connection_time > 30:
710
+ _log(f"[DVT] WARNING: Connection took {connection_time:.1f}s (warehouse may have been suspended)")
711
+
712
+ # v0.54.0: Capture metadata if not already cached
713
+ if not cached_metadata and len(source_parts) >= 4:
714
+ self.capture_source_metadata(
715
+ engine=engine,
716
+ source_name=source_name,
717
+ table_name=table_name,
718
+ adapter_name=adapter.type(),
719
+ connection_name=source_meta.connection_name,
720
+ schema_name=source_meta.schema,
721
+ table_alias=table_alias
722
+ )
723
+ except Exception as e:
724
+ connection_time = time.time() - connection_start
725
+ # DVT v0.5.2: Clean error message (no Java stack trace)
726
+ clean_error = _clean_spark_error(e)
727
+ _log(f"[DVT] ERROR: Failed to load '{source_meta.qualified_name}' after {connection_time:.1f}s: {clean_error}")
728
+ raise DbtRuntimeError(
729
+ f"Failed to load source '{source_meta.qualified_name}' via JDBC: {clean_error}"
730
+ )
731
+
732
+ # Note: Can't easily count rows without triggering Spark action
733
+ # For now, return 0 (rows_read will be inaccurate for JDBC path)
734
+ # TODO: Consider running COUNT(*) query if row count is needed
735
+ total_rows += 0
736
+
737
+ return total_rows
738
+
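register_jdbc_table is defined in SparkEngine rather than here; in plain PySpark, the per-source load it wraps presumably reduces to a JDBC read plus a temp-view registration, roughly as follows (an assumption about its internals, with placeholder variables):

df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", jdbc_table)       # full table name or a pushed-down subquery
    .option("numPartitions", 16)
    .options(**jdbc_properties)          # user, password, driver, ...
    .load()
)
df.createOrReplaceTempView(table_alias)  # e.g. "shop_orders"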
739
+ def _get_table_alias(self, source_meta: SourceTableMetadata) -> str:
740
+ """
741
+ Generate a safe table alias for the compute engine.
742
+
743
+ Compute engines may not support dots or special characters in table names,
744
+ so we create a normalized alias.
745
+
746
+ :param source_meta: Source table metadata
747
+ :returns: Safe table alias
748
+ """
749
+ # Extract source name and table name from source_id
750
+ # source_id format: source.{project}.{source_name}.{table_name}
751
+ parts = source_meta.source_id.split(".")
752
+ if len(parts) >= 4:
753
+ source_name = parts[2]
754
+ table_name = parts[3]
755
+ return f"{source_name}_{table_name}"
756
+ else:
757
+ # Fallback: use identifier
758
+ return source_meta.identifier
759
+
760
+ def _rewrite_sql_for_compute(
761
+ self, sql: str, source_tables: List[SourceTableMetadata]
762
+ ) -> str:
763
+ """
764
+ Rewrite SQL to replace fully-qualified source table names with compute engine aliases.
765
+
766
+ Source tables are loaded into the compute engine with simple aliases (e.g., 'Exim_cbs_f_country'),
767
+ but the compiled SQL contains fully-qualified names (e.g., '"EXIM_EDWH_DEV"."ods"."cbs_f_country"').
768
+ This method replaces the qualified names with the aliases and removes source-specific clauses
769
+ like SAMPLE that have been pushed down to the source.
770
+
771
+ :param sql: Compiled SQL with fully-qualified table names
772
+ :param source_tables: List of source table metadata
773
+ :returns: Rewritten SQL with aliases and source-specific clauses removed
774
+ """
775
+ import re
776
+
777
+ rewritten_sql = sql
778
+
779
+ for source_meta in source_tables:
780
+ # Get the alias used in the compute engine
781
+ alias = self._get_table_alias(source_meta)
782
+
783
+ # Replace the fully-qualified table name with the alias
784
+ # Format: "database"."schema"."table" or database.schema.table
785
+ qualified_name = source_meta.qualified_name
786
+ parts = qualified_name.split(".")
787
+
788
+ # DVT v0.51.7: Use case-insensitive regex replacement for all variants
789
+ # because Snowflake returns uppercase but Spark/Databricks lowercases
790
+
791
+ # 1. Unquoted: EXIM_EDWH_DEV.ods.cbs_f_country (any case)
792
+ unquoted_pattern = re.compile(
793
+ r'\b' + r'\.'.join(re.escape(p) for p in parts) + r'\b',
794
+ re.IGNORECASE
795
+ )
796
+ rewritten_sql = unquoted_pattern.sub(alias, rewritten_sql)
797
+
798
+ # 2. Double-quoted (PostgreSQL style): "EXIM_EDWH_DEV"."ods"."cbs_f_country" (any case)
799
+ quoted_pattern = re.compile(
800
+ r'"' + r'"\."\s*'.join(re.escape(p) for p in parts) + r'"',
801
+ re.IGNORECASE
802
+ )
803
+ rewritten_sql = quoted_pattern.sub(alias, rewritten_sql)
804
+
805
+ # 3. Single string quoted: "EXIM_EDWH_DEV.ods.cbs_f_country" (any case)
806
+ single_quoted_pattern = re.compile(
807
+ r'"' + r'\.'.join(re.escape(p) for p in parts) + r'"',
808
+ re.IGNORECASE
809
+ )
810
+ rewritten_sql = single_quoted_pattern.sub(alias, rewritten_sql)
811
+
812
+ # 4. Backtick-quoted (Spark/Databricks style): `EXIM_EDWH_DEV`.`ods`.`cbs_f_country` (any case)
813
+ backtick_pattern = re.compile(
814
+ r'`' + r'`\.`\s*'.join(re.escape(p) for p in parts) + r'`',
815
+ re.IGNORECASE
816
+ )
817
+ rewritten_sql = backtick_pattern.sub(alias, rewritten_sql)
818
+
819
+ # DVT v0.4.5: Remove Snowflake-specific SAMPLE clauses
820
+ # These have been pushed down to the source via JDBC subqueries
821
+ # Spark SQL doesn't support SAMPLE syntax, so remove it from the query
822
+ # Pattern matches: SAMPLE (N), SAMPLE (N ROWS), SAMPLE SYSTEM|BERNOULLI|BLOCK (P)
823
+ # with optional REPEATABLE(seed) or SEED(seed)
824
+ rewritten_sql = re.sub(
825
+ r'\s*(?:TABLE)?SAMPLE\s+(?:SYSTEM|BERNOULLI|BLOCK)\s*\(\s*\d+(?:\.\d+)?\s*\)'
826
+ r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
827
+ '',
828
+ rewritten_sql,
829
+ flags=re.IGNORECASE
830
+ )
831
+ rewritten_sql = re.sub(
832
+ r'\s*(?:TABLE)?SAMPLE\s*\(\s*\d+(?:\s+ROWS)?\s*\)'
833
+ r'(?:\s+(?:REPEATABLE|SEED)\s*\(\s*\d+\s*\))?',
834
+ '',
835
+ rewritten_sql,
836
+ flags=re.IGNORECASE
837
+ )
838
+
839
+ return rewritten_sql
840
+
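A concrete before/after for the rewrite, using a made-up source and an executor instance (names are illustrative only):

meta = SourceTableMetadata(
    source_id="source.my_project.shop.orders",
    connection_name="shop_pg",
    database="analytics",
    schema="public",
    identifier="orders",
    qualified_name="analytics.public.orders",
)
sql_in = 'SELECT * FROM "ANALYTICS"."public"."orders" SAMPLE (10 ROWS)'
print(executor._rewrite_sql_for_compute(sql_in, [meta]))
# SELECT * FROM shop_orders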
841
+ def materialize_result(
842
+ self,
843
+ result: FederatedExecutionResult,
844
+ target_adapter: BaseAdapter,
845
+ target_table: str,
846
+ mode: str = "create",
847
+ use_jdbc: bool = True,
848
+ spark_result_df: Optional[Any] = None,
849
+ ) -> Any:
850
+ """
851
+ Materialize federated query results to target database.
852
+
853
+ v0.3.0: Uses Spark JDBC for all materialization (default).
854
+
855
+ :param result: Federated execution result
856
+ :param target_adapter: Adapter to use for getting target credentials
857
+ :param target_table: Target table name (qualified)
858
+ :param mode: Write mode ('create', 'append', 'replace')
859
+ :param use_jdbc: If True, use JDBC write path (default in v0.3.0)
860
+ :param spark_result_df: Spark DataFrame with results (required for JDBC path)
861
+ :returns: AdapterResponse from write operation
862
+ """
863
+ if use_jdbc and spark_result_df is not None:
864
+ # Use JDBC write path (default in v0.3.0)
865
+ return self._materialize_spark_jdbc(
866
+ result_df=spark_result_df,
867
+ target_adapter=target_adapter,
868
+ target_table=target_table,
869
+ mode=mode,
870
+ )
871
+ else:
872
+ # Fallback: use target adapter directly (for adapters without JDBC support)
873
+ raise DbtRuntimeError(
874
+ "Non-JDBC materialization path removed in v0.3.0. "
875
+ "All materialization requires Spark JDBC. "
876
+ "Ensure spark_result_df is provided."
877
+ )
878
+
879
+ def _materialize_spark_jdbc(
880
+ self,
881
+ result_df: Any, # Spark DataFrame
882
+ target_adapter: BaseAdapter,
883
+ target_table: str,
884
+ mode: str = "create",
885
+ ) -> Any:
886
+ """
887
+ Materialize Spark query results to target database via JDBC (Phase 1: v0.2.0).
888
+
889
+ This bypasses the DVT node's memory by writing data directly from Spark
890
+ workers to the target database.
891
+
892
+ :param result_df: Spark DataFrame with query results
893
+ :param target_adapter: Adapter to use for getting target credentials
894
+ :param target_table: Target table name (qualified)
895
+ :param mode: Write mode ('create', 'append', 'replace')
896
+ :returns: AdapterResponse
897
+ :raises DbtRuntimeError: If JDBC write fails
898
+ """
899
+ from dbt.compute.jdbc_utils import build_jdbc_config
900
+ from dbt.adapters.contracts.connection import AdapterResponse
901
+
902
+ # Get target credentials
903
+ target_credentials = target_adapter.config.credentials
904
+
905
+ # Build JDBC configuration for target
906
+ try:
907
+ jdbc_url, jdbc_properties = build_jdbc_config(target_credentials)
908
+ except Exception as e:
909
+ raise DbtRuntimeError(
910
+ f"Failed to build JDBC config for target '{target_table}': {str(e)}"
911
+ ) from e
912
+
913
+ # Map DVT mode to Spark JDBC mode
914
+ spark_mode_mapping = {
915
+ "create": "overwrite", # Create/recreate table (dbt behavior)
916
+ "append": "append", # Add to existing table
917
+ "replace": "overwrite", # Drop and recreate
918
+ }
919
+ spark_mode = spark_mode_mapping.get(mode, "overwrite")
920
+
921
+ _log(f"[DVT] Writing to target via Spark JDBC: {target_table} (mode={spark_mode})")
922
+
923
+ # Get Spark session from DataFrame
924
+ spark = result_df.sparkSession
925
+
926
+ # Log DataFrame schema for debugging
927
+ _log(f"[DVT] DataFrame schema:")
928
+ for field in result_df.schema.fields:
929
+ _log(f" - {field.name}: {field.dataType}")
930
+
931
+ # Log row count
932
+ row_count = result_df.count()
933
+ _log(f"[DVT] DataFrame has {row_count} rows")
934
+
935
+ # Sanitize URL for logging (hide password)
936
+ safe_url = jdbc_url.split("?")[0] if "?" in jdbc_url else jdbc_url
937
+ _log(f"[DVT] JDBC URL: {safe_url}")
938
+ _log(f"[DVT] JDBC table: {target_table}")
939
+
940
+ # Write via JDBC
941
+ saved_views: List[Dict[str, str]] = []
942
+ target_adapter_type = target_adapter.type()
943
+ is_postgres = target_adapter_type in ("postgres", "postgresql")
944
+
945
+ try:
946
+ # DVT v0.5.5: Save dependent views before DROP CASCADE, restore after
947
+ # Spark's JDBC overwrite mode doesn't use CASCADE, causing failures
948
+ # when dependent objects (views, etc.) exist
949
+ # DVT v0.51.6: Only applies to PostgreSQL (other DBs handle this differently)
950
+ if spark_mode == "overwrite" and is_postgres:
951
+ try:
952
+ with target_adapter.connection_named("__dvt_drop__"):
953
+ conn = target_adapter.connections.get_thread_connection()
954
+ cursor = conn.handle.cursor()
955
+
956
+ # Parse schema.table from target_table
957
+ parts = target_table.replace('"', '').split('.')
958
+ if len(parts) >= 2:
959
+ tbl_schema = parts[-2]
960
+ tbl_name = parts[-1]
961
+ else:
962
+ tbl_schema = 'public'
963
+ tbl_name = parts[-1]
964
+
965
+ # DVT v0.5.5: Save dependent views before dropping
966
+ saved_views = _get_dependent_views_pg(cursor, tbl_schema, tbl_name)
967
+ if saved_views:
968
+ _log(f"[DVT] Saving {len(saved_views)} dependent view(s) before DROP")
969
+
970
+ # Use CASCADE to drop dependent objects
971
+ drop_sql = f"DROP TABLE IF EXISTS {target_table} CASCADE"
972
+ _log(f"[DVT] Pre-drop with CASCADE: {drop_sql}")
973
+ cursor.execute(drop_sql)
974
+ conn.handle.commit()
975
+ cursor.close()
976
+ except Exception as drop_err:
977
+ _log(f"[DVT] Pre-drop warning (continuing): {drop_err}")
978
+
979
+ result_df.write.format("jdbc").options(
980
+ url=jdbc_url, dbtable=target_table, batchsize="10000", **jdbc_properties
981
+ ).mode(spark_mode).save()
982
+
983
+ # DVT v0.5.5: Restore dependent views after successful write (PostgreSQL only)
984
+ if saved_views and is_postgres:
985
+ try:
986
+ with target_adapter.connection_named("__dvt_restore__"):
987
+ conn = target_adapter.connections.get_thread_connection()
988
+ cursor = conn.handle.cursor()
989
+ _recreate_views_pg(cursor, saved_views)
990
+ conn.handle.commit()
991
+ cursor.close()
992
+ _log(f"[DVT] Restored {len(saved_views)} dependent view(s)")
993
+ except Exception as restore_err:
994
+ _log(f"[DVT] Warning: Could not restore views: {restore_err}")
995
+
996
+ # Return a synthetic AdapterResponse
997
+ # Note: Spark's JDBC writer doesn't report rows affected, so reuse the DataFrame count taken above
998
+ return AdapterResponse(
999
+ _message=f"SUCCESS - Table {target_table} materialized via JDBC",
1000
+ rows_affected=row_count,
1001
+ )
1002
+
1003
+ except Exception as e:
1004
+ # DVT v0.5.2: Clean error message (no Java stack trace)
1005
+ clean_error = _clean_spark_error(e)
1006
+ raise DbtRuntimeError(
1007
+ f"Failed to materialize results to '{target_table}': {clean_error}"
1008
+ )
1009
+
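Stripped of the PostgreSQL view bookkeeping, the write in the body above is the standard Spark JDBC batch write; a minimal sketch with placeholder target details:

(
    result_df.write.format("jdbc")
    .options(
        url="jdbc:postgresql://target-db:5432/warehouse",   # placeholder
        dbtable='"analytics"."order_counts"',
        user="dvt",
        password="***",
        batchsize="10000",
    )
    .mode("overwrite")
    .save()
)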
1010
+ def explain_execution(
1011
+ self, node: ManifestNode, analysis_result: QueryAnalysisResult
1012
+ ) -> str:
1013
+ """
1014
+ Generate an execution plan explanation for a federated query.
1015
+
1016
+ Useful for debugging and optimization.
1017
+
1018
+ :param node: The node to explain
1019
+ :param analysis_result: Query analysis result
1020
+ :returns: Human-readable execution plan
1021
+ """
1022
+ source_tables = self._extract_source_tables(analysis_result)
1023
+
1024
+ plan_parts = [
1025
+ "=== DVT Federated Execution Plan ===",
1026
+ f"Node: {node.unique_id}",
1027
+ f"Compute Engine: {self.default_compute_engine}",
1028
+ "",
1029
+ "Data Sources:",
1030
+ ]
1031
+
1032
+ for i, source_meta in enumerate(source_tables, 1):
1033
+ plan_parts.append(
1034
+ f" {i}. {source_meta.qualified_name} "
1035
+ f"(connection: {source_meta.connection_name})"
1036
+ )
1037
+
1038
+ plan_parts.extend(
1039
+ [
1040
+ "",
1041
+ "Execution Steps (v0.3.0 - Spark-Unified):",
1042
+ " 1. Extract data from each source via Spark JDBC (parallel reads)",
1043
+ f" 2. Load {len(source_tables)} table(s) into Spark ({self.default_compute_engine})",
1044
+ " 3. Execute query in Spark",
1045
+ " 4. Materialize to target via Spark JDBC",
1046
+ "",
1047
+ f"Strategy: {analysis_result.strategy.upper()}",
1048
+ f"Reason: {analysis_result.reason}",
1049
+ ]
1050
+ )
1051
+
1052
+ return "\n".join(plan_parts)
1053
+
1054
+
1055
+ class SourceRewriter:
1056
+ """
1057
+ Rewrites SQL queries to use compute engine table aliases.
1058
+
1059
+ When sources are loaded into compute engines, they may be registered with
1060
+ different names (aliases). This class rewrites the SQL to use those aliases.
1061
+ """
1062
+
1063
+ @staticmethod
1064
+ def rewrite_sources(sql: str, source_mapping: Dict[str, str]) -> str:
1065
+ """
1066
+ Rewrite SQL to use compute engine table aliases.
1067
+
1068
+ :param sql: Original SQL with qualified source names
1069
+ :param source_mapping: Dict of qualified_name → alias
1070
+ :returns: Rewritten SQL
1071
+ """
1072
+ rewritten = sql
1073
+
1074
+ # Replace each qualified name with its alias
1075
+ for qualified_name, alias in source_mapping.items():
1076
+ # Match qualified name (database.schema.table)
1077
+ pattern = re.compile(rf"\b{re.escape(qualified_name)}\b", re.IGNORECASE)
1078
+ rewritten = pattern.sub(alias, rewritten)
1079
+
1080
+ return rewritten
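A minimal usage sketch for SourceRewriter, with made-up names:

sql = "SELECT o.id FROM analytics.public.orders o JOIN analytics.public.customers c ON o.customer_id = c.id"
mapping = {
    "analytics.public.orders": "shop_orders",
    "analytics.public.customers": "shop_customers",
}
print(SourceRewriter.rewrite_sources(sql, mapping))
# SELECT o.id FROM shop_orders o JOIN shop_customers c ON o.customer_id = c.id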