dvt_core-0.52.2-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2039 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +804 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +624 -0
  74. dbt/compute/federated_executor.py +837 -0
  75. dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
  76. dbt/compute/filter_pushdown.py +273 -0
  77. dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
  78. dbt/compute/jar_provisioning.py +255 -0
  79. dbt/compute/java_compat.cpython-310-darwin.so +0 -0
  80. dbt/compute/java_compat.py +689 -0
  81. dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
  82. dbt/compute/jdbc_utils.py +678 -0
  83. dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
  84. dbt/compute/smart_selector.py +311 -0
  85. dbt/compute/strategies/__init__.py +54 -0
  86. dbt/compute/strategies/base.py +165 -0
  87. dbt/compute/strategies/dataproc.py +207 -0
  88. dbt/compute/strategies/emr.py +203 -0
  89. dbt/compute/strategies/local.py +364 -0
  90. dbt/compute/strategies/standalone.py +262 -0
  91. dbt/config/__init__.py +4 -0
  92. dbt/config/catalogs.py +94 -0
  93. dbt/config/compute.cpython-310-darwin.so +0 -0
  94. dbt/config/compute.py +547 -0
  95. dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
  96. dbt/config/dvt_profile.py +342 -0
  97. dbt/config/profile.py +422 -0
  98. dbt/config/project.py +873 -0
  99. dbt/config/project_utils.py +28 -0
  100. dbt/config/renderer.py +231 -0
  101. dbt/config/runtime.py +553 -0
  102. dbt/config/selectors.py +208 -0
  103. dbt/config/utils.py +77 -0
  104. dbt/constants.py +28 -0
  105. dbt/context/__init__.py +0 -0
  106. dbt/context/base.py +745 -0
  107. dbt/context/configured.py +135 -0
  108. dbt/context/context_config.py +382 -0
  109. dbt/context/docs.py +82 -0
  110. dbt/context/exceptions_jinja.py +178 -0
  111. dbt/context/macro_resolver.py +195 -0
  112. dbt/context/macros.py +171 -0
  113. dbt/context/manifest.py +72 -0
  114. dbt/context/providers.py +2249 -0
  115. dbt/context/query_header.py +13 -0
  116. dbt/context/secret.py +58 -0
  117. dbt/context/target.py +74 -0
  118. dbt/contracts/__init__.py +0 -0
  119. dbt/contracts/files.py +413 -0
  120. dbt/contracts/graph/__init__.py +0 -0
  121. dbt/contracts/graph/manifest.py +1904 -0
  122. dbt/contracts/graph/metrics.py +97 -0
  123. dbt/contracts/graph/model_config.py +70 -0
  124. dbt/contracts/graph/node_args.py +42 -0
  125. dbt/contracts/graph/nodes.py +1806 -0
  126. dbt/contracts/graph/semantic_manifest.py +232 -0
  127. dbt/contracts/graph/unparsed.py +811 -0
  128. dbt/contracts/project.py +417 -0
  129. dbt/contracts/results.py +53 -0
  130. dbt/contracts/selection.py +23 -0
  131. dbt/contracts/sql.py +85 -0
  132. dbt/contracts/state.py +68 -0
  133. dbt/contracts/util.py +46 -0
  134. dbt/deprecations.py +346 -0
  135. dbt/deps/__init__.py +0 -0
  136. dbt/deps/base.py +152 -0
  137. dbt/deps/git.py +195 -0
  138. dbt/deps/local.py +79 -0
  139. dbt/deps/registry.py +130 -0
  140. dbt/deps/resolver.py +149 -0
  141. dbt/deps/tarball.py +120 -0
  142. dbt/docs/source/_ext/dbt_click.py +119 -0
  143. dbt/docs/source/conf.py +32 -0
  144. dbt/env_vars.py +64 -0
  145. dbt/event_time/event_time.py +40 -0
  146. dbt/event_time/sample_window.py +60 -0
  147. dbt/events/__init__.py +15 -0
  148. dbt/events/base_types.py +36 -0
  149. dbt/events/core_types_pb2.py +2 -0
  150. dbt/events/logging.py +108 -0
  151. dbt/events/types.py +2516 -0
  152. dbt/exceptions.py +1486 -0
  153. dbt/flags.py +89 -0
  154. dbt/graph/__init__.py +11 -0
  155. dbt/graph/cli.py +247 -0
  156. dbt/graph/graph.py +172 -0
  157. dbt/graph/queue.py +214 -0
  158. dbt/graph/selector.py +374 -0
  159. dbt/graph/selector_methods.py +975 -0
  160. dbt/graph/selector_spec.py +222 -0
  161. dbt/graph/thread_pool.py +18 -0
  162. dbt/hooks.py +21 -0
  163. dbt/include/README.md +49 -0
  164. dbt/include/__init__.py +3 -0
  165. dbt/include/starter_project/.gitignore +4 -0
  166. dbt/include/starter_project/README.md +15 -0
  167. dbt/include/starter_project/__init__.py +3 -0
  168. dbt/include/starter_project/analyses/.gitkeep +0 -0
  169. dbt/include/starter_project/dbt_project.yml +36 -0
  170. dbt/include/starter_project/macros/.gitkeep +0 -0
  171. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  172. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  173. dbt/include/starter_project/models/example/schema.yml +21 -0
  174. dbt/include/starter_project/seeds/.gitkeep +0 -0
  175. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  176. dbt/include/starter_project/tests/.gitkeep +0 -0
  177. dbt/internal_deprecations.py +26 -0
  178. dbt/jsonschemas/__init__.py +3 -0
  179. dbt/jsonschemas/jsonschemas.py +309 -0
  180. dbt/jsonschemas/project/0.0.110.json +4717 -0
  181. dbt/jsonschemas/project/0.0.85.json +2015 -0
  182. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  183. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  184. dbt/jsonschemas/resources/latest.json +6773 -0
  185. dbt/links.py +4 -0
  186. dbt/materializations/__init__.py +0 -0
  187. dbt/materializations/incremental/__init__.py +0 -0
  188. dbt/materializations/incremental/microbatch.py +236 -0
  189. dbt/mp_context.py +8 -0
  190. dbt/node_types.py +37 -0
  191. dbt/parser/__init__.py +23 -0
  192. dbt/parser/analysis.py +21 -0
  193. dbt/parser/base.py +548 -0
  194. dbt/parser/common.py +266 -0
  195. dbt/parser/docs.py +52 -0
  196. dbt/parser/fixtures.py +51 -0
  197. dbt/parser/functions.py +30 -0
  198. dbt/parser/generic_test.py +100 -0
  199. dbt/parser/generic_test_builders.py +333 -0
  200. dbt/parser/hooks.py +118 -0
  201. dbt/parser/macros.py +137 -0
  202. dbt/parser/manifest.py +2204 -0
  203. dbt/parser/models.py +573 -0
  204. dbt/parser/partial.py +1178 -0
  205. dbt/parser/read_files.py +445 -0
  206. dbt/parser/schema_generic_tests.py +422 -0
  207. dbt/parser/schema_renderer.py +111 -0
  208. dbt/parser/schema_yaml_readers.py +935 -0
  209. dbt/parser/schemas.py +1466 -0
  210. dbt/parser/search.py +149 -0
  211. dbt/parser/seeds.py +28 -0
  212. dbt/parser/singular_test.py +20 -0
  213. dbt/parser/snapshots.py +44 -0
  214. dbt/parser/sources.py +558 -0
  215. dbt/parser/sql.py +62 -0
  216. dbt/parser/unit_tests.py +621 -0
  217. dbt/plugins/__init__.py +20 -0
  218. dbt/plugins/contracts.py +9 -0
  219. dbt/plugins/exceptions.py +2 -0
  220. dbt/plugins/manager.py +163 -0
  221. dbt/plugins/manifest.py +21 -0
  222. dbt/profiler.py +20 -0
  223. dbt/py.typed +1 -0
  224. dbt/query_analyzer.cpython-310-darwin.so +0 -0
  225. dbt/query_analyzer.py +410 -0
  226. dbt/runners/__init__.py +2 -0
  227. dbt/runners/exposure_runner.py +7 -0
  228. dbt/runners/no_op_runner.py +45 -0
  229. dbt/runners/saved_query_runner.py +7 -0
  230. dbt/selected_resources.py +8 -0
  231. dbt/task/__init__.py +0 -0
  232. dbt/task/base.py +503 -0
  233. dbt/task/build.py +197 -0
  234. dbt/task/clean.py +56 -0
  235. dbt/task/clone.py +161 -0
  236. dbt/task/compile.py +150 -0
  237. dbt/task/compute.py +454 -0
  238. dbt/task/debug.py +505 -0
  239. dbt/task/deps.py +280 -0
  240. dbt/task/docs/__init__.py +3 -0
  241. dbt/task/docs/generate.py +660 -0
  242. dbt/task/docs/index.html +250 -0
  243. dbt/task/docs/serve.py +29 -0
  244. dbt/task/freshness.py +322 -0
  245. dbt/task/function.py +121 -0
  246. dbt/task/group_lookup.py +46 -0
  247. dbt/task/init.py +553 -0
  248. dbt/task/java.py +316 -0
  249. dbt/task/list.py +236 -0
  250. dbt/task/printer.py +175 -0
  251. dbt/task/retry.py +175 -0
  252. dbt/task/run.py +1306 -0
  253. dbt/task/run_operation.py +141 -0
  254. dbt/task/runnable.py +758 -0
  255. dbt/task/seed.py +103 -0
  256. dbt/task/show.py +149 -0
  257. dbt/task/snapshot.py +56 -0
  258. dbt/task/spark.py +414 -0
  259. dbt/task/sql.py +110 -0
  260. dbt/task/target_sync.py +759 -0
  261. dbt/task/test.py +464 -0
  262. dbt/tests/fixtures/__init__.py +1 -0
  263. dbt/tests/fixtures/project.py +620 -0
  264. dbt/tests/util.py +651 -0
  265. dbt/tracking.py +529 -0
  266. dbt/utils/__init__.py +3 -0
  267. dbt/utils/artifact_upload.py +151 -0
  268. dbt/utils/utils.py +408 -0
  269. dbt/version.py +268 -0
  270. dvt_cli/__init__.py +72 -0
  271. dvt_core-0.52.2.dist-info/METADATA +286 -0
  272. dvt_core-0.52.2.dist-info/RECORD +275 -0
  273. dvt_core-0.52.2.dist-info/WHEEL +5 -0
  274. dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
  275. dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/smart_selector.py
@@ -0,0 +1,311 @@
+ """
+ Smart Compute Engine Selector
+
+ Automatically selects the optimal compute engine (Spark Local vs Spark Cluster) based on
+ workload characteristics when the user doesn't specify a preference.
+
+ Selection criteria:
+ - Estimated data size
+ - Number of sources
+ - Query complexity
+ - Available resources
+ """
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional
+
+ from dbt.contracts.graph.manifest import Manifest
+ from dbt.contracts.graph.nodes import ManifestNode
+ from dbt.query_analyzer import QueryAnalysisResult
+
+
+ @dataclass
+ class WorkloadEstimate:
+     """Estimated workload characteristics for a query."""
+
+     estimated_rows: int  # Estimated total rows to process
+     source_count: int  # Number of source tables
+     connection_count: int  # Number of different connections
+     has_aggregations: bool  # Query contains GROUP BY or aggregations
+     has_joins: bool  # Query contains JOIN operations
+     complexity_score: float  # 0.0 to 1.0, higher = more complex
+
+     @property
+     def estimated_data_mb(self) -> float:
+         """Rough estimate of data size in MB (assuming ~100 bytes/row)."""
+         return (self.estimated_rows * 100) / (1024 * 1024)
+
+
+ class SmartComputeSelector:
+     """
+     Intelligently selects a compute engine based on workload characteristics.
+
+     v0.3.0: Unified Spark architecture - selects between spark-local and spark-cluster.
+
+     Default thresholds:
+     - Small/medium workload (<10GB): spark-local
+     - Large workload (>10GB): spark-cluster (if configured)
+     """
+
+     # Default thresholds (can be configured)
+     CLUSTER_THRESHOLD_MB = 10000  # 10GB - threshold for cluster recommendation
+     CLUSTER_THRESHOLD_GB = 10  # The same threshold in GB, for clarity
+
+     def __init__(
+         self,
+         manifest: Manifest,
+         cluster_threshold_mb: Optional[int] = None,
+         compute_registry: Optional[Any] = None,
+     ):
+         """
+         Initialize the smart selector.
+
+         :param manifest: The dbt manifest
+         :param cluster_threshold_mb: Data size threshold for cluster selection (default: 10GB)
+         :param compute_registry: ComputeRegistry instance for checking cluster availability
+         """
+         self.manifest = manifest
+         self.cluster_threshold_mb = cluster_threshold_mb or self.CLUSTER_THRESHOLD_MB
+         self.compute_registry = compute_registry
+
+     def select_engine(
+         self, node: ManifestNode, analysis_result: QueryAnalysisResult
+     ) -> str:
+         """
+         Select the optimal compute engine for a node.
+
+         v0.3.0: Returns "spark-local" or "spark-cluster".
+
+         :param node: The node to execute
+         :param analysis_result: Query analysis result
+         :returns: "spark-local" or "spark-cluster"
+         """
+         # Estimate the workload
+         estimate = self._estimate_workload(node, analysis_result)
+
+         # Apply the selection logic
+         return self._apply_selection_logic(estimate)
+
+     def _estimate_workload(
+         self, node: ManifestNode, analysis_result: QueryAnalysisResult
+     ) -> WorkloadEstimate:
+         """
+         Estimate workload characteristics for a node.
+
+         :param node: The node to analyze
+         :param analysis_result: Query analysis result
+         :returns: WorkloadEstimate
+         """
+         # Count sources and connections
+         source_count = len(analysis_result.source_refs)
+         connection_count = len(analysis_result.source_connections)
+
+         # Estimate row count from sources
+         estimated_rows = self._estimate_row_count(analysis_result.source_refs)
+
+         # Analyze the SQL for complexity
+         sql = node.compiled_code if hasattr(node, "compiled_code") else node.raw_code
+         has_aggregations = self._has_aggregations(sql)
+         has_joins = self._has_joins(sql)
+
+         # Calculate the complexity score
+         complexity_score = self._calculate_complexity(
+             source_count=source_count,
+             connection_count=connection_count,
+             has_aggregations=has_aggregations,
+             has_joins=has_joins,
+         )
+
+         return WorkloadEstimate(
+             estimated_rows=estimated_rows,
+             source_count=source_count,
+             connection_count=connection_count,
+             has_aggregations=has_aggregations,
+             has_joins=has_joins,
+             complexity_score=complexity_score,
+         )
+
+     def _estimate_row_count(self, source_refs: set) -> int:
+         """
+         Estimate the total row count across source tables.
+
+         Uses catalog metadata if available, otherwise falls back to heuristics.
+
+         :param source_refs: Set of source unique_ids
+         :returns: Estimated row count
+         """
+         total_rows = 0
+
+         for source_id in source_refs:
+             source = self.manifest.sources.get(source_id)
+             if not source:
+                 # Unknown source, use a conservative estimate
+                 total_rows += 100000
+                 continue
+
+             # Check if we have catalog metadata with row counts
+             # Note: this would come from `dbt docs generate`.
+             # For now, use a heuristic based on naming.
+             if (
+                 "fact" in source.identifier.lower()
+                 or "events" in source.identifier.lower()
+             ):
+                 # Fact tables tend to be larger
+                 total_rows += 1000000
+             elif (
+                 "dim" in source.identifier.lower()
+                 or "lookup" in source.identifier.lower()
+             ):
+                 # Dimension tables tend to be smaller
+                 total_rows += 10000
+             else:
+                 # Default estimate
+                 total_rows += 100000
+
+         return total_rows
+
+     def _has_aggregations(self, sql: str) -> bool:
+         """Check whether the SQL contains aggregations."""
+         sql_upper = sql.upper()
+         return any(
+             keyword in sql_upper
+             for keyword in [
+                 " GROUP BY ",
+                 " SUM(",
+                 " COUNT(",
+                 " AVG(",
+                 " MIN(",
+                 " MAX(",
+                 " HAVING ",
+             ]
+         )
+
+     def _has_joins(self, sql: str) -> bool:
+         """Check whether the SQL contains joins."""
+         sql_upper = sql.upper()
+         return any(
+             keyword in sql_upper
+             for keyword in [
+                 " JOIN ",
+                 " INNER JOIN ",
+                 " LEFT JOIN ",
+                 " RIGHT JOIN ",
+                 " FULL JOIN ",
+                 " CROSS JOIN ",
+             ]
+         )
+
+     def _calculate_complexity(
+         self,
+         source_count: int,
+         connection_count: int,
+         has_aggregations: bool,
+         has_joins: bool,
+     ) -> float:
+         """
+         Calculate a query complexity score (0.0 to 1.0).
+
+         :returns: Complexity score
+         """
+         score = 0.0
+
+         # Source count contributes up to 0.3
+         score += min(source_count / 10.0, 0.3)
+
+         # Multiple connections increase complexity (up to 0.2)
+         score += min(connection_count / 5.0, 0.2)
+
+         # Aggregations add complexity
+         if has_aggregations:
+             score += 0.2
+
+         # Joins add complexity
+         if has_joins:
+             score += 0.3
+
+         return min(score, 1.0)
+
+     def _apply_selection_logic(self, estimate: WorkloadEstimate) -> str:
+         """
+         Apply the selection logic based on the workload estimate.
+
+         v0.3.0: Selects between spark-local and spark-cluster only.
+
+         :param estimate: WorkloadEstimate
+         :returns: "spark-local" or "spark-cluster"
+         """
+         # Rule 1: Large data → prefer the cluster (if available)
+         if estimate.estimated_data_mb > self.cluster_threshold_mb:
+             # Check if a cluster is configured
+             if self._cluster_available():
+                 return "spark-cluster"
+             else:
+                 # Note: warning about large data on the local engine is the
+                 # caller's job; we just return the engine name.
+                 return "spark-local"
+
+         # Rule 2: Everything else → spark-local (default)
+         # spark-local handles most workloads (<10GB) well
+         return "spark-local"
+
+     def _cluster_available(self) -> bool:
+         """
+         Check whether a Spark cluster is configured.
+
+         :returns: True if a cluster compute engine is available
+         """
+         if not self.compute_registry:
+             return False
+
+         # Check if any cluster computes are registered (other than spark-local)
+         clusters = self.compute_registry.list()
+         for cluster in clusters:
+             if cluster.type == "spark" and cluster.name != "spark-local":
+                 # Check that it is actually a cluster (not a local master)
+                 config = cluster.config
+                 if "master" in config:
+                     master = config.get("master", "")
+                     if not master.startswith("local"):
+                         return True
+                 elif "host" in config or "cluster_id" in config:
+                     # Databricks or another remote cluster
+                     return True
+
+         return False
+
+     def get_recommendation_reason(
+         self, node: ManifestNode, analysis_result: QueryAnalysisResult
+     ) -> str:
+         """
+         Get a human-readable explanation for the engine selection.
+
+         :param node: The node
+         :param analysis_result: Query analysis result
+         :returns: Explanation string
+         """
+         estimate = self._estimate_workload(node, analysis_result)
+         engine = self._apply_selection_logic(estimate)
+
+         reasons = []
+
+         if estimate.estimated_data_mb > self.cluster_threshold_mb:
+             reasons.append(
+                 f"Large dataset ({estimate.estimated_data_mb:.0f} MB / {estimate.estimated_data_mb / 1024:.1f} GB)"
+             )
+             if engine == "spark-local":
+                 reasons.append(
+                     "No cluster configured (consider registering a Spark cluster)"
+                 )
+         else:
+             reasons.append(
+                 f"Small/medium workload ({estimate.estimated_data_mb:.0f} MB, {estimate.source_count} sources)"
+             )
+
+         if estimate.source_count > 5:
+             reasons.append(f"Many sources ({estimate.source_count})")
+
+         if estimate.complexity_score > 0.7:
+             reasons.append(f"High complexity (score: {estimate.complexity_score:.2f})")
+
+         reason_str = "; ".join(reasons)
+         return f"Selected {engine}: {reason_str}"
dbt/compute/strategies/__init__.py
@@ -0,0 +1,54 @@
+ """
+ Spark Connection Strategies
+
+ This module provides different strategies for connecting to Spark clusters.
+ Uses the strategy pattern for flexible platform support.
+
+ v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
+ v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+ """
+
+ from dbt.compute.strategies.base import BaseConnectionStrategy
+ from dbt.compute.strategies.local import LocalStrategy
+
+ # Strategies are imported lazily to avoid import errors when
+ # optional dependencies are not installed
+
+
+ def get_emr_strategy():
+     """
+     Lazily import and return EMRStrategy.
+
+     :returns: EMRStrategy class
+     """
+     from dbt.compute.strategies.emr import EMRStrategy
+
+     return EMRStrategy
+
+
+ def get_dataproc_strategy():
+     """
+     Lazily import and return DataprocStrategy.
+
+     :returns: DataprocStrategy class
+     """
+     from dbt.compute.strategies.dataproc import DataprocStrategy
+
+     return DataprocStrategy
+
+
+ def get_standalone_strategy():
+     """
+     Lazily import and return StandaloneStrategy.
+
+     :returns: StandaloneStrategy class
+     """
+     from dbt.compute.strategies.standalone import StandaloneStrategy
+
+     return StandaloneStrategy
+
+
+ __all__ = [
+     "BaseConnectionStrategy",
+     "LocalStrategy",
+     "get_emr_strategy",
+     "get_dataproc_strategy",
+     "get_standalone_strategy",
+ ]
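
A short sketch of how these lazy getters might be dispatched on — an assumption, since the platform strings below are illustrative rather than a documented contract:

    # Hypothetical resolver; the deferred import only runs for the platform requested.
    from dbt.compute import strategies

    def resolve_strategy_class(platform: str):
        if platform == "local":
            return strategies.LocalStrategy
        if platform == "emr":
            return strategies.get_emr_strategy()
        if platform == "dataproc":
            return strategies.get_dataproc_strategy()
        if platform == "standalone":
            return strategies.get_standalone_strategy()
        raise ValueError(f"Unknown Spark platform: {platform}")

    cls = resolve_strategy_class("standalone")
    # Assumes the subclass keeps the base constructor signature (config, app_name).
    strategy = cls(config={"master": "spark://10.0.0.5:7077"}, app_name="DVT-Compute")

Keeping EMR and Dataproc behind function calls means their optional cloud dependencies are imported only when that platform is actually selected.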
dbt/compute/strategies/base.py
@@ -0,0 +1,165 @@
+ """
+ Base Connection Strategy for Spark Engines
+
+ Defines the abstract interface for different Spark connection strategies.
+ Uses composition over inheritance for flexible platform support.
+
+ v0.5.98: Added JAR provisioning and connectivity testing methods.
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional, Set, Tuple
+
+ try:
+     from pyspark.sql import SparkSession
+
+     PYSPARK_AVAILABLE = True
+ except ImportError:
+     PYSPARK_AVAILABLE = False
+     SparkSession = None
+
+
+ class BaseConnectionStrategy(ABC):
+     """
+     Abstract base class for Spark connection strategies.
+
+     Different strategies implement different ways to connect to Spark:
+     - LocalStrategy: Embedded PySpark (in-process)
+     - DatabricksStrategy: Databricks Connect (remote cluster)
+     - EMRStrategy: AWS EMR cluster
+     - DataprocStrategy: GCP Dataproc
+     - StandaloneStrategy: Self-managed Spark clusters
+     """
+
+     def __init__(self, config: Dict[str, Any], app_name: str = "DVT-Compute"):
+         """
+         Initialize the connection strategy.
+
+         :param config: Strategy-specific configuration
+         :param app_name: Spark application name
+         """
+         self.config = config
+         self.app_name = app_name
+
+     @abstractmethod
+     def get_spark_session(self) -> SparkSession:
+         """
+         Create and return a SparkSession.
+
+         :returns: Initialized SparkSession
+         :raises DbtRuntimeError: If session creation fails
+         """
+         pass
+
+     @abstractmethod
+     def validate_config(self) -> None:
+         """
+         Validate strategy-specific configuration.
+
+         :raises DbtRuntimeError: If the configuration is invalid
+         """
+         pass
+
+     def estimate_cost(self, duration_minutes: float) -> float:
+         """
+         Estimate the cost of running on this platform.
+
+         The default implementation returns 0.0 (free). Override for cloud platforms.
+
+         :param duration_minutes: Estimated query duration in minutes
+         :returns: Estimated cost in USD
+         """
+         return 0.0
+
+     @abstractmethod
+     def close(self, spark: Optional[SparkSession]) -> None:
+         """
+         Clean up the Spark session.
+
+         :param spark: SparkSession to clean up (may be None)
+         """
+         pass
+
+     def get_platform_name(self) -> str:
+         """
+         Get a human-readable platform name.
+
+         :returns: Platform name (e.g., "local", "databricks", "emr")
+         """
+         return self.__class__.__name__.replace("Strategy", "").lower()
+
+     def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+         """
+         Get Spark configuration for JDBC JAR provisioning.
+
+         The default implementation returns an empty dict. Override in subclasses
+         to provide platform-specific JAR configuration.
+
+         Local platforms use spark.jars (local file paths).
+         Remote platforms use spark.jars.packages (Maven coordinates).
+
+         :param adapter_types: Set of adapter types that need JDBC drivers
+         :returns: Dictionary of Spark config keys/values (e.g., {"spark.jars": "..."})
+         """
+         return {}
+
+     def test_connectivity(self) -> Tuple[bool, str]:
+         """
+         Test basic connectivity to the Spark cluster.
+
+         Creates a session, runs a simple query, and returns the status.
+         Override for platform-specific connectivity testing.
+
+         :returns: Tuple of (success, message)
+         """
+         try:
+             spark = self.get_spark_session()
+             # Run a simple SQL query to verify connectivity
+             spark.sql("SELECT 1 AS test").collect()
+             return (True, "Session created and SQL test passed")
+         except Exception as e:
+             return (False, str(e))
+
+     def test_jdbc_connectivity(
+         self,
+         jdbc_url: str,
+         properties: Dict[str, str],
+         table_or_query: str = "(SELECT 1 AS test) AS t",
+     ) -> Tuple[bool, str]:
+         """
+         Test JDBC connectivity through the Spark cluster.
+
+         Creates a session and attempts to read from a JDBC source.
+         This verifies that the JDBC drivers are properly configured.
+
+         :param jdbc_url: JDBC connection URL
+         :param properties: JDBC connection properties (user, password, driver)
+         :param table_or_query: Table name, or SQL query wrapped in parentheses
+         :returns: Tuple of (success, message)
+         """
+         try:
+             spark = self.get_spark_session()
+
+             # Attempt a JDBC read
+             df = (
+                 spark.read.format("jdbc")
+                 .option("url", jdbc_url)
+                 .option("dbtable", table_or_query)
+                 .options(**properties)
+                 .load()
+             )
+
+             # Force evaluation
+             row_count = df.count()
+             return (True, f"JDBC read successful ({row_count} rows)")
+         except Exception as e:
+             error_msg = str(e)
+             # Provide helpful error messages for common issues
+             if "ClassNotFoundException" in error_msg:
+                 return (False, f"JDBC driver not found: {error_msg}")
+             elif "No suitable driver" in error_msg:
+                 return (False, f"JDBC driver not loaded: {error_msg}")
+             elif "authentication" in error_msg.lower() or "password" in error_msg.lower():
+                 return (False, f"Authentication failed: {error_msg}")
+             else:
+                 return (False, f"JDBC test failed: {error_msg}")