dvt-core 0.52.2 (cp310-cp310-macosx_10_9_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dvt-core might be problematic.

Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/__init__.py
@@ -0,0 +1,14 @@
+ """
+ DVT Compute Layer
+
+ This module provides compute engine integration for federated query execution.
+
+ v0.3.0: Spark-unified architecture - arrow_bridge removed.
+ """
+
+ # Note: arrow_bridge, adapter_to_arrow, and arrow_to_adapter removed in v0.3.0
+ # All data loading now uses Spark JDBC
+
+ from typing import List
+
+ __all__: List[str] = []
dbt/compute/engines/__init__.py
@@ -0,0 +1,12 @@
+ """
+ DVT Compute Engines
+
+ This module provides ephemeral compute engines for federated query execution.
+ Compute engines are used ONLY for processing, never for materialization.
+
+ v0.3.0: Spark-unified architecture - DuckDBEngine removed.
+ """
+
+ from dbt.compute.engines.spark_engine import SparkEngine
+
+ __all__ = ["SparkEngine"]
dbt/compute/engines/spark_engine.py
@@ -0,0 +1,624 @@
+ """
+ Spark Compute Engine
+
+ Provides Spark integration for large-scale federated query execution.
+ Supports multiple connection strategies via strategy pattern:
+ - Local: Embedded PySpark (in-process)
+ - Standalone: Remote Spark clusters via spark:// URL
+ - EMR: AWS EMR clusters via YARN
+ - Dataproc: GCP Dataproc clusters
+
+ Key characteristics:
+ - Scalable to large datasets
+ - Distributed processing
+ - Can connect to external Spark clusters
+ - No materialization (ephemeral only)
+
+ v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+ """
+
+ from typing import Any, Dict, List, Optional
+ from dbt_common.exceptions import DbtRuntimeError
+
+ try:
+     from pyspark.sql import SparkSession, DataFrame
+     PYSPARK_AVAILABLE = True
+ except ImportError:
+     PYSPARK_AVAILABLE = False
+     SparkSession = None
+     DataFrame = None
+
+ from dbt.compute.strategies import (
+     BaseConnectionStrategy,
+     LocalStrategy,
+     get_emr_strategy,
+     get_dataproc_strategy,
+     get_standalone_strategy,
+ )
+
+
+ def _clean_spark_error(e: Exception) -> str:
+     """
+     Extract clean error message from Java/Spark exception.
+
+     DVT v0.5.2: Removes verbose Java stack traces and returns readable error message.
+
+     :param e: Exception from Spark/Java
+     :returns: Clean error message string
+     """
+     error_str = str(e)
+
+     # Check for common error patterns and extract meaningful message
+
+     # Pattern 1: ServiceConfigurationError (Scala version mismatch)
+     if "ServiceConfigurationError" in error_str:
+         if "Unable to get public no-arg constructor" in error_str:
+             # Extract the class name that failed
+             if "DataSourceRegister:" in error_str:
+                 class_name = error_str.split("DataSourceRegister:")[-1].split()[0]
+                 return f"Spark connector incompatible with current Scala version: {class_name}. Try using JDBC driver directly instead of Spark connector."
+         return "Spark service configuration error - possible Scala version mismatch"
+
+     # Pattern 2: NoClassDefFoundError
+     if "NoClassDefFoundError:" in error_str:
+         missing_class = error_str.split("NoClassDefFoundError:")[-1].split()[0].strip()
+         return f"Missing Java class: {missing_class}. This usually indicates a Scala version mismatch between Spark and the connector."
+
+     # Pattern 3: ClassNotFoundException
+     if "ClassNotFoundException:" in error_str:
+         missing_class = error_str.split("ClassNotFoundException:")[-1].split()[0].strip()
+         return f"Class not found: {missing_class}"
+
+     # Pattern 4: SQLException
+     if "SQLException:" in error_str:
+         sql_error = error_str.split("SQLException:")[-1].split('\n')[0].strip()
+         return f"SQL Error: {sql_error}"
+
+     # Pattern 5: Snowflake errors
+     if "net.snowflake" in error_str:
+         if "Authentication" in error_str or "auth" in error_str.lower():
+             return "Snowflake authentication failed. Check credentials in profile."
+         if "does not exist" in error_str:
+             return "Snowflake table/schema not found. Check the object path."
+
+     # Pattern 6: PostgreSQL errors
+     if "PSQLException:" in error_str:
+         lines = error_str.split('\n')
+         for line in lines:
+             if "PSQLException:" in line:
+                 return line.split("PSQLException:")[-1].strip()
+
+     # Default: Return first line only (remove stack trace)
+     first_line = error_str.split('\n')[0]
+     if len(first_line) > 200:
+         first_line = first_line[:200] + "..."
+     return first_line
+
+
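A quick sketch of how _clean_spark_error behaves on a typical JDBC failure (the exception text below is invented for illustration; only the function above is from the package):

    from dbt.compute.engines.spark_engine import _clean_spark_error

    # A driver-not-found error with a Java stack trace attached (hypothetical message).
    err = Exception(
        "java.lang.ClassNotFoundException: org.postgresql.Driver\n"
        "\tat java.net.URLClassLoader.findClass(URLClassLoader.java:387)"
    )
    # Pattern 3 (ClassNotFoundException) matches and the stack trace is dropped.
    print(_clean_spark_error(err))  # -> Class not found: org.postgresql.Driver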
+ class SparkEngine:
+     """
+     Ephemeral Spark compute engine for federated query execution.
+
+     Uses strategy pattern for flexible connection management:
+     1. Local: Embedded PySpark session (in-process)
+     2. Standalone: Remote Spark clusters via spark:// URL
+     3. EMR: AWS EMR clusters via YARN
+     4. Dataproc: GCP Dataproc clusters
+     """
+
+     def __init__(
+         self,
+         mode: str = "embedded",
+         spark_config: Optional[Dict[str, str]] = None,
+         app_name: str = "DVT-Compute",
+     ):
+         """
+         Initialize Spark engine.
+
+         :param mode: 'embedded'/'local' for in-process PySpark, 'standalone' or 'external'
+             for a remote Spark cluster, 'emr' for AWS EMR (YARN), 'dataproc' for GCP Dataproc
+         :param spark_config: Spark configuration dict (platform-specific)
+         :param app_name: Spark application name
+         :raises DbtRuntimeError: If PySpark not available or invalid config
+         """
+         if not PYSPARK_AVAILABLE:
+             raise DbtRuntimeError("PySpark is not available. Install it with: pip install pyspark")
+
+         self.mode = mode
+         self.spark_config = spark_config or {}
+         self.app_name = app_name
+         self.spark: Optional[SparkSession] = None
+         self.registered_tables: Dict[str, str] = {}
+
+         # Create connection strategy based on mode or config
+         self._connection_strategy = self._create_strategy()
+
+     def _create_strategy(self) -> BaseConnectionStrategy:
+         """
+         Create connection strategy based on mode or config.
+
+         v0.51.2: Removed Databricks (serverless cannot read external JDBC sources).
+         Platform detection order:
+         1. Dataproc: project + region + cluster
+         2. EMR: master=yarn (without Dataproc keys)
+         3. Standalone: master=spark://
+         4. Local: default (local[*] or no master)
+
+         :returns: Connection strategy instance
+         :raises DbtRuntimeError: If platform detection fails
+         """
+         config_keys = set(self.spark_config.keys())
+
+         # 1. Dataproc: has project, region, and cluster
+         if all(k in config_keys for k in ("project", "region", "cluster")):
+             DataprocStrategy = get_dataproc_strategy()
+             strategy = DataprocStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         # Check master config for EMR, Standalone, or Local
+         master = self.spark_config.get("master", "")
+
+         # 2. EMR: master=yarn (YARN resource manager)
+         if master.lower() == "yarn":
+             EMRStrategy = get_emr_strategy()
+             strategy = EMRStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         # 3. Standalone: master=spark://
+         if master.startswith("spark://"):
+             StandaloneStrategy = get_standalone_strategy()
+             strategy = StandaloneStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         # 4. Local: local[*], local[N], or no master (default)
+         if master.startswith("local") or not master or self.mode in ("embedded", "local"):
+             strategy = LocalStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         # Explicit mode overrides
+         if self.mode == "emr":
+             EMRStrategy = get_emr_strategy()
+             strategy = EMRStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         if self.mode == "dataproc":
+             DataprocStrategy = get_dataproc_strategy()
+             strategy = DataprocStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         if self.mode in ("standalone", "external"):
+             StandaloneStrategy = get_standalone_strategy()
+             strategy = StandaloneStrategy(config=self.spark_config, app_name=self.app_name)
+             strategy.validate_config()
+             return strategy
+
+         # Fallback to local
+         strategy = LocalStrategy(config=self.spark_config, app_name=self.app_name)
+         strategy.validate_config()
+         return strategy
+
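Following the detection order documented in _create_strategy above, a minimal sketch of spark_config shapes and the strategy each resolves to (all values are placeholders, not part of the package):

    dataproc_cfg = {"project": "my-gcp-project", "region": "us-central1", "cluster": "dvt-cluster"}  # -> Dataproc
    emr_cfg = {"master": "yarn"}                               # -> EMR (YARN)
    standalone_cfg = {"master": "spark://spark-master:7077"}   # -> Standalone
    local_cfg = {"master": "local[*]"}                         # -> Local (also the default for an empty config)

    # For example, SparkEngine(mode="embedded", spark_config=local_cfg) selects LocalStrategy.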
+     def __enter__(self):
+         """Context manager entry - initialize Spark session."""
+         self.connect()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit - stop Spark session."""
+         self.close()
+
+     def connect(self, adapter_types: Optional[set] = None) -> None:
+         """
+         Create Spark session using the configured strategy.
+
+         v0.5.99: Now accepts adapter_types for JDBC driver provisioning.
+
+         :param adapter_types: Set of source adapter types that need JDBC drivers
+         """
+         try:
+             self.spark = self._connection_strategy.get_spark_session(adapter_types=adapter_types)
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to initialize Spark engine: {str(e)}") from e
+
+     def close(self) -> None:
+         """Stop Spark session and release resources."""
+         if self.spark:
+             try:
+                 self._connection_strategy.close(self.spark)
+             except Exception:
+                 pass  # Best effort cleanup
+             finally:
+                 self.spark = None
+                 self.registered_tables.clear()
+
+     def get_table_info(self, table_name: str) -> Dict[str, Any]:
+         """
+         Get metadata about a registered table.
+
+         :param table_name: Name of the table
+         :returns: Dictionary with table metadata (columns, row_count, etc.)
+         :raises DbtRuntimeError: If table not found
+         """
+         if not self.spark:
+             raise DbtRuntimeError("Spark engine not connected")
+
+         if table_name not in self.registered_tables:
+             raise DbtRuntimeError(f"Table '{table_name}' not registered")
+
+         try:
+             # Get DataFrame for the table
+             df = self.spark.table(table_name)
+
+             # Get schema
+             columns = []
+             for field in df.schema.fields:
+                 columns.append(
+                     {"name": field.name, "type": str(field.dataType), "nullable": field.nullable}
+                 )
+
+             # Get row count
+             row_count = df.count()
+
+             return {"table_name": table_name, "columns": columns, "row_count": row_count}
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to get info for table '{table_name}': {str(e)}") from e
+
+     def list_tables(self) -> List[str]:
+         """
+         List all registered tables.
+
+         :returns: List of table names
+         """
+         return list(self.registered_tables.keys())
+
+     def explain_query(self, sql: str) -> str:
+         """
+         Get query execution plan.
+
+         Useful for debugging and optimization.
+
+         :param sql: SQL query to explain
+         :returns: Query execution plan as string
+         """
+         if not self.spark:
+             raise DbtRuntimeError("Spark engine not connected")
+
+         try:
+             df = self.spark.sql(sql)
+             # Get extended explain with cost model and optimizations
+             return df._jdf.queryExecution().toString()
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to explain query: {str(e)}\nSQL: {sql}") from e
+
+     def cache_table(self, table_name: str) -> None:
+         """
+         Cache a table in Spark memory for faster subsequent queries.
+
+         Useful for tables that are accessed multiple times.
+
+         :param table_name: Name of the table to cache
+         :raises DbtRuntimeError: If table not found or caching fails
+         """
+         if not self.spark:
+             raise DbtRuntimeError("Spark engine not connected")
+
+         if table_name not in self.registered_tables:
+             raise DbtRuntimeError(f"Table '{table_name}' not registered")
+
+         try:
+             self.spark.catalog.cacheTable(table_name)
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to cache table '{table_name}': {str(e)}") from e
+
+     def uncache_table(self, table_name: str) -> None:
+         """
+         Remove a table from Spark memory cache.
+
+         :param table_name: Name of the table to uncache
+         """
+         if self.spark and table_name in self.registered_tables:
+             try:
+                 self.spark.catalog.uncacheTable(table_name)
+             except Exception:
+                 pass  # Best effort
+
+     def get_platform_info(self) -> Dict[str, Any]:
+         """
+         Get information about the Spark platform and connection.
+
+         :returns: Dictionary with platform metadata
+         """
+         info = {
+             "platform": self._connection_strategy.get_platform_name(),
+             "mode": self.mode,
+         }
+
+         # Add strategy-specific info if available
+         if hasattr(self._connection_strategy, "get_cluster_info"):
+             info.update(self._connection_strategy.get_cluster_info())
+
+         return info
+
+     def estimate_cost(self, duration_minutes: float) -> float:
+         """
+         Estimate execution cost for the configured platform.
+
+         :param duration_minutes: Estimated query duration in minutes
+         :returns: Estimated cost in USD
+         """
+         return self._connection_strategy.estimate_cost(duration_minutes)
+
+     # JDBC Methods (Phase 1: v0.2.0)
+
+     def supports_jdbc(self, adapter_type: str) -> bool:
+         """
+         Check if the given adapter type is supported for JDBC connectivity.
+
+         :param adapter_type: Adapter type (e.g., 'postgres', 'mysql', 'snowflake')
+         :returns: True if JDBC is supported for this adapter type
+
+         Example:
+             >>> engine = SparkEngine()
+             >>> engine.supports_jdbc('postgres')
+             True
+             >>> engine.supports_jdbc('duckdb')
+             False
+         """
+         # Import here to avoid circular dependency
+         from dbt.compute.jdbc_utils import JDBC_DRIVER_MAPPING
+
+         return adapter_type.lower() in JDBC_DRIVER_MAPPING
+
+     def read_jdbc(
+         self,
+         url: str,
+         table: str,
+         properties: Dict[str, str],
+         numPartitions: int = 16,
+         partitionColumn: Optional[str] = None,
+         lowerBound: Optional[int] = None,
+         upperBound: Optional[int] = None,
+         predicates: Optional[List[str]] = None,
+     ) -> DataFrame:
+         """
+         Read data from a JDBC source into Spark DataFrame with parallel reads.
+
+         This method bypasses the DVT node's memory by reading data directly
+         from the source database into Spark workers (distributed memory).
+
+         :param url: JDBC connection URL (e.g., 'jdbc:postgresql://host:port/db')
+         :param table: Table name or SQL query (wrapped in parentheses)
+         :param properties: JDBC connection properties (user, password, driver)
+         :param numPartitions: Number of partitions for parallel reads (default: 16)
+         :param partitionColumn: Column to use for partitioning (must be numeric)
+         :param lowerBound: Lower bound for partition column
+         :param upperBound: Upper bound for partition column
+         :param predicates: List of WHERE clause predicates for filtering partitions
+         :returns: Spark DataFrame with loaded data
+         :raises DbtRuntimeError: If JDBC read fails
+
+         Example:
+             >>> url = "jdbc:postgresql://localhost:5432/warehouse"
+             >>> properties = {
+             ...     "user": "analytics",
+             ...     "password": "secret",
+             ...     "driver": "org.postgresql.Driver"
+             ... }
+             >>> df = engine.read_jdbc(
+             ...     url=url,
+             ...     table="public.orders",
+             ...     properties=properties,
+             ...     numPartitions=16,
+             ...     partitionColumn="order_id",
+             ...     lowerBound=1,
+             ...     upperBound=1000000
+             ... )
+             >>> print(f"Loaded {df.count()} rows")
+
+         Notes:
+             - For partitioned reads, all of (partitionColumn, lowerBound, upperBound)
+               must be provided
+             - Partitioning enables parallel reads across Spark workers
+             - Without partitioning, data is read in a single thread
+         """
+         if not self.spark:
+             raise DbtRuntimeError("Spark engine not connected")
+
+         try:
+             # Build JDBC read options
+             read_options = {
+                 "url": url,
+                 "dbtable": table,
+                 **properties,  # Merge user, password, driver
+             }
+
+             # Add partitioning options if provided
+             if partitionColumn and lowerBound is not None and upperBound is not None:
+                 read_options.update(
+                     {
+                         "partitionColumn": partitionColumn,
+                         "lowerBound": str(lowerBound),
+                         "upperBound": str(upperBound),
+                         "numPartitions": str(numPartitions),
+                     }
+                 )
+
+             # Add predicates if provided
+             if predicates:
+                 # Predicates are used for push-down filtering
+                 read_options["predicates"] = predicates
+
+             # Read via JDBC
+             df = self.spark.read.format("jdbc").options(**read_options).load()
+
+             return df
+
+         except Exception as e:
+             # DVT v0.5.2: Clean error message (no Java stack trace)
+             clean_error = _clean_spark_error(e)
+             raise DbtRuntimeError(f"Failed to read from JDBC source '{table}': {clean_error}")
+
+     def write_jdbc(
+         self,
+         df: DataFrame,
+         url: str,
+         table: str,
+         properties: Dict[str, str],
+         mode: str = "overwrite",
+         batchsize: int = 10000,
+         numPartitions: Optional[int] = None,
+     ) -> None:
+         """
+         Write Spark DataFrame to JDBC target with batch writes.
+
+         This method writes data directly from Spark workers to the target database,
+         bypassing the DVT node's memory.
+
+         :param df: Spark DataFrame to write
+         :param url: JDBC connection URL
+         :param table: Target table name (qualified: schema.table)
+         :param properties: JDBC connection properties (user, password, driver)
+         :param mode: Write mode - 'overwrite', 'append', 'error', 'ignore' (default: 'overwrite')
+         :param batchsize: Number of rows to insert per batch (default: 10000)
+         :param numPartitions: Repartition DataFrame before write for parallelism
+         :raises DbtRuntimeError: If JDBC write fails
+
+         Example:
+             >>> url = "jdbc:postgresql://localhost:5432/warehouse"
+             >>> properties = {
+             ...     "user": "analytics",
+             ...     "password": "secret",
+             ...     "driver": "org.postgresql.Driver"
+             ... }
+             >>> engine.write_jdbc(
+             ...     df=result_df,
+             ...     url=url,
+             ...     table="analytics.aggregated_metrics",
+             ...     properties=properties,
+             ...     mode="overwrite",
+             ...     batchsize=10000
+             ... )
+
+         Notes:
+             - 'overwrite' mode drops and recreates the table
+             - 'append' mode adds data to existing table
+             - Batch size affects memory usage and write performance
+             - Larger batch sizes are faster but use more memory
+         """
+         if not self.spark:
+             raise DbtRuntimeError("Spark engine not connected")
+
+         try:
+             # Repartition if requested for better write parallelism
+             write_df = df
+             if numPartitions:
+                 write_df = df.repartition(numPartitions)
+
+             # DVT v0.5.0: Handle DROP CASCADE for table materialization
+             if mode == "overwrite":
+                 # Drop existing table with CASCADE before writing
+                 # This is essential for declarative workflows (handles dependent views)
+                 try:
+                     import jaydebeapi
+                     conn = jaydebeapi.connect(
+                         properties.get("driver"),
+                         url,
+                         [properties.get("user"), properties.get("password")]
+                     )
+                     cursor = conn.cursor()
+                     cursor.execute(f"DROP TABLE IF EXISTS {table} CASCADE")
+                     conn.commit()
+                     cursor.close()
+                     conn.close()
+                 except Exception:
+                     # If DROP fails (table doesn't exist), continue
+                     pass
+
+             # Build JDBC write options
+             write_options = {
+                 "url": url,
+                 "dbtable": table,
+                 "batchsize": str(batchsize),
+                 **properties,  # Merge user, password, driver
+             }
+
+             # Write via JDBC (now with CASCADE handling)
+             write_df.write.format("jdbc").options(**write_options).mode("append" if mode == "overwrite" else mode).save()
+
+         except Exception as e:
+             # DVT v0.5.0: Extract only the actual error message (remove Java stack trace)
+             error_msg = str(e).split('\n')[0] if '\n' in str(e) else str(e)
+             # Look for PostgreSQL error detail
+             if "PSQLException:" in str(e):
+                 lines = str(e).split('\n')
+                 for i, line in enumerate(lines):
+                     if "PSQLException:" in line:
+                         error_msg = line.split("PSQLException:")[-1].strip()
+                         # Include Detail and Hint if present
+                         if i+1 < len(lines) and "Detail:" in lines[i+1]:
+                             error_msg += "\n " + lines[i+1].strip()
+                         if i+2 < len(lines) and "Hint:" in lines[i+2]:
+                             error_msg += "\n " + lines[i+2].strip()
+                         break
+             raise DbtRuntimeError(f"Failed to write to JDBC target '{table}': {error_msg}")
+
+     def register_jdbc_table(
+         self,
+         url: str,
+         table: str,
+         properties: Dict[str, str],
+         table_alias: str,
+         numPartitions: int = 16,
+         partitionColumn: Optional[str] = None,
+         lowerBound: Optional[int] = None,
+         upperBound: Optional[int] = None,
+     ) -> None:
+         """
+         Read from JDBC and register as a temporary view in Spark.
+
+         Convenience method that combines read_jdbc() and temp view registration.
+
+         :param url: JDBC connection URL
+         :param table: Source table name
+         :param properties: JDBC connection properties
+         :param table_alias: Name to register the table as in Spark
+         :param numPartitions: Number of partitions for parallel reads
+         :param partitionColumn: Column to use for partitioning
+         :param lowerBound: Lower bound for partition column
+         :param upperBound: Upper bound for partition column
+         :raises DbtRuntimeError: If read or registration fails
+
+         Example:
+             >>> engine.register_jdbc_table(
+             ...     url="jdbc:postgresql://localhost:5432/warehouse",
+             ...     table="public.customers",
+             ...     properties={"user": "...", "password": "...", "driver": "..."},
+             ...     table_alias="customers",
+             ...     numPartitions=8,
+             ...     partitionColumn="customer_id",
+             ...     lowerBound=1,
+             ...     upperBound=500000
+             ... )
+             >>> # Now can query with: engine.execute_query("SELECT * FROM customers")
+         """
+         # Read from JDBC
+         df = self.read_jdbc(
+             url=url,
+             table=table,
+             properties=properties,
+             numPartitions=numPartitions,
+             partitionColumn=partitionColumn,
+             lowerBound=lowerBound,
+             upperBound=upperBound,
+         )
+
+         # Register as temp view
+         df.createOrReplaceTempView(table_alias)
+
+         # Track registration
+         self.registered_tables[table_alias] = table_alias
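Taken together, a minimal end-to-end sketch of the SparkEngine API shown in this file (connection details, credentials, and table names are placeholders):

    from dbt.compute.engines import SparkEngine

    properties = {
        "user": "analytics",
        "password": "secret",
        "driver": "org.postgresql.Driver",
    }

    # __enter__/__exit__ wrap connect() and close(); "embedded" runs PySpark in-process.
    with SparkEngine(mode="embedded") as engine:
        # Load a source table into Spark workers and expose it as a temp view.
        engine.register_jdbc_table(
            url="jdbc:postgresql://localhost:5432/warehouse",
            table="public.orders",
            properties=properties,
            table_alias="orders",
        )
        print(engine.list_tables())             # ['orders']
        print(engine.get_table_info("orders"))  # columns, row_count
        print(engine.get_platform_info())       # {'platform': ..., 'mode': 'embedded'}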