dvt-core 0.58.6__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2403 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +50 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
  74. dbt/compute/engines/spark_engine.py +642 -0
  75. dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
  76. dbt/compute/federated_executor.py +1080 -0
  77. dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
  78. dbt/compute/filter_pushdown.py +273 -0
  79. dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
  80. dbt/compute/jar_provisioning.py +255 -0
  81. dbt/compute/java_compat.cpython-311-darwin.so +0 -0
  82. dbt/compute/java_compat.py +689 -0
  83. dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
  84. dbt/compute/jdbc_utils.py +678 -0
  85. dbt/compute/metadata/__init__.py +40 -0
  86. dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
  87. dbt/compute/metadata/adapters_registry.py +370 -0
  88. dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
  89. dbt/compute/metadata/registry.py +674 -0
  90. dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
  91. dbt/compute/metadata/store.py +1499 -0
  92. dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
  93. dbt/compute/smart_selector.py +377 -0
  94. dbt/compute/strategies/__init__.py +55 -0
  95. dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
  96. dbt/compute/strategies/base.py +165 -0
  97. dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
  98. dbt/compute/strategies/dataproc.py +207 -0
  99. dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
  100. dbt/compute/strategies/emr.py +203 -0
  101. dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
  102. dbt/compute/strategies/local.py +443 -0
  103. dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
  104. dbt/compute/strategies/standalone.py +262 -0
  105. dbt/config/__init__.py +4 -0
  106. dbt/config/catalogs.py +94 -0
  107. dbt/config/compute.cpython-311-darwin.so +0 -0
  108. dbt/config/compute.py +513 -0
  109. dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
  110. dbt/config/dvt_profile.py +342 -0
  111. dbt/config/profile.py +422 -0
  112. dbt/config/project.py +873 -0
  113. dbt/config/project_utils.py +28 -0
  114. dbt/config/renderer.py +231 -0
  115. dbt/config/runtime.py +553 -0
  116. dbt/config/selectors.py +208 -0
  117. dbt/config/utils.py +77 -0
  118. dbt/constants.py +28 -0
  119. dbt/context/__init__.py +0 -0
  120. dbt/context/base.py +745 -0
  121. dbt/context/configured.py +135 -0
  122. dbt/context/context_config.py +382 -0
  123. dbt/context/docs.py +82 -0
  124. dbt/context/exceptions_jinja.py +178 -0
  125. dbt/context/macro_resolver.py +195 -0
  126. dbt/context/macros.py +171 -0
  127. dbt/context/manifest.py +72 -0
  128. dbt/context/providers.py +2249 -0
  129. dbt/context/query_header.py +13 -0
  130. dbt/context/secret.py +58 -0
  131. dbt/context/target.py +74 -0
  132. dbt/contracts/__init__.py +0 -0
  133. dbt/contracts/files.py +413 -0
  134. dbt/contracts/graph/__init__.py +0 -0
  135. dbt/contracts/graph/manifest.py +1904 -0
  136. dbt/contracts/graph/metrics.py +97 -0
  137. dbt/contracts/graph/model_config.py +70 -0
  138. dbt/contracts/graph/node_args.py +42 -0
  139. dbt/contracts/graph/nodes.py +1806 -0
  140. dbt/contracts/graph/semantic_manifest.py +232 -0
  141. dbt/contracts/graph/unparsed.py +811 -0
  142. dbt/contracts/project.py +417 -0
  143. dbt/contracts/results.py +53 -0
  144. dbt/contracts/selection.py +23 -0
  145. dbt/contracts/sql.py +85 -0
  146. dbt/contracts/state.py +68 -0
  147. dbt/contracts/util.py +46 -0
  148. dbt/deprecations.py +348 -0
  149. dbt/deps/__init__.py +0 -0
  150. dbt/deps/base.py +152 -0
  151. dbt/deps/git.py +195 -0
  152. dbt/deps/local.py +79 -0
  153. dbt/deps/registry.py +130 -0
  154. dbt/deps/resolver.py +149 -0
  155. dbt/deps/tarball.py +120 -0
  156. dbt/docs/source/_ext/dbt_click.py +119 -0
  157. dbt/docs/source/conf.py +32 -0
  158. dbt/env_vars.py +64 -0
  159. dbt/event_time/event_time.py +40 -0
  160. dbt/event_time/sample_window.py +60 -0
  161. dbt/events/__init__.py +15 -0
  162. dbt/events/base_types.py +36 -0
  163. dbt/events/core_types_pb2.py +2 -0
  164. dbt/events/logging.py +108 -0
  165. dbt/events/types.py +2516 -0
  166. dbt/exceptions.py +1486 -0
  167. dbt/flags.py +89 -0
  168. dbt/graph/__init__.py +11 -0
  169. dbt/graph/cli.py +249 -0
  170. dbt/graph/graph.py +172 -0
  171. dbt/graph/queue.py +214 -0
  172. dbt/graph/selector.py +374 -0
  173. dbt/graph/selector_methods.py +975 -0
  174. dbt/graph/selector_spec.py +222 -0
  175. dbt/graph/thread_pool.py +18 -0
  176. dbt/hooks.py +21 -0
  177. dbt/include/README.md +49 -0
  178. dbt/include/__init__.py +3 -0
  179. dbt/include/data/adapters_registry.duckdb +0 -0
  180. dbt/include/data/build_registry.py +242 -0
  181. dbt/include/data/csv/adapter_queries.csv +33 -0
  182. dbt/include/data/csv/syntax_rules.csv +9 -0
  183. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  184. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  185. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  186. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  187. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  188. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  189. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  190. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  191. dbt/include/starter_project/.gitignore +4 -0
  192. dbt/include/starter_project/README.md +15 -0
  193. dbt/include/starter_project/__init__.py +3 -0
  194. dbt/include/starter_project/analyses/.gitkeep +0 -0
  195. dbt/include/starter_project/dbt_project.yml +36 -0
  196. dbt/include/starter_project/macros/.gitkeep +0 -0
  197. dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  198. dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  199. dbt/include/starter_project/models/example/schema.yml +21 -0
  200. dbt/include/starter_project/seeds/.gitkeep +0 -0
  201. dbt/include/starter_project/snapshots/.gitkeep +0 -0
  202. dbt/include/starter_project/tests/.gitkeep +0 -0
  203. dbt/internal_deprecations.py +26 -0
  204. dbt/jsonschemas/__init__.py +3 -0
  205. dbt/jsonschemas/jsonschemas.py +309 -0
  206. dbt/jsonschemas/project/0.0.110.json +4717 -0
  207. dbt/jsonschemas/project/0.0.85.json +2015 -0
  208. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  209. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  210. dbt/jsonschemas/resources/latest.json +6773 -0
  211. dbt/links.py +4 -0
  212. dbt/materializations/__init__.py +0 -0
  213. dbt/materializations/incremental/__init__.py +0 -0
  214. dbt/materializations/incremental/microbatch.py +236 -0
  215. dbt/mp_context.py +8 -0
  216. dbt/node_types.py +37 -0
  217. dbt/parser/__init__.py +23 -0
  218. dbt/parser/analysis.py +21 -0
  219. dbt/parser/base.py +548 -0
  220. dbt/parser/common.py +266 -0
  221. dbt/parser/docs.py +52 -0
  222. dbt/parser/fixtures.py +51 -0
  223. dbt/parser/functions.py +30 -0
  224. dbt/parser/generic_test.py +100 -0
  225. dbt/parser/generic_test_builders.py +333 -0
  226. dbt/parser/hooks.py +118 -0
  227. dbt/parser/macros.py +137 -0
  228. dbt/parser/manifest.py +2204 -0
  229. dbt/parser/models.py +573 -0
  230. dbt/parser/partial.py +1178 -0
  231. dbt/parser/read_files.py +445 -0
  232. dbt/parser/schema_generic_tests.py +422 -0
  233. dbt/parser/schema_renderer.py +111 -0
  234. dbt/parser/schema_yaml_readers.py +935 -0
  235. dbt/parser/schemas.py +1466 -0
  236. dbt/parser/search.py +149 -0
  237. dbt/parser/seeds.py +28 -0
  238. dbt/parser/singular_test.py +20 -0
  239. dbt/parser/snapshots.py +44 -0
  240. dbt/parser/sources.py +558 -0
  241. dbt/parser/sql.py +62 -0
  242. dbt/parser/unit_tests.py +621 -0
  243. dbt/plugins/__init__.py +20 -0
  244. dbt/plugins/contracts.py +9 -0
  245. dbt/plugins/exceptions.py +2 -0
  246. dbt/plugins/manager.py +163 -0
  247. dbt/plugins/manifest.py +21 -0
  248. dbt/profiler.py +20 -0
  249. dbt/py.typed +1 -0
  250. dbt/query_analyzer.cpython-311-darwin.so +0 -0
  251. dbt/query_analyzer.py +410 -0
  252. dbt/runners/__init__.py +2 -0
  253. dbt/runners/exposure_runner.py +7 -0
  254. dbt/runners/no_op_runner.py +45 -0
  255. dbt/runners/saved_query_runner.py +7 -0
  256. dbt/selected_resources.py +8 -0
  257. dbt/task/__init__.py +0 -0
  258. dbt/task/base.py +503 -0
  259. dbt/task/build.py +197 -0
  260. dbt/task/clean.py +56 -0
  261. dbt/task/clone.py +161 -0
  262. dbt/task/compile.py +150 -0
  263. dbt/task/compute.cpython-311-darwin.so +0 -0
  264. dbt/task/compute.py +458 -0
  265. dbt/task/debug.py +505 -0
  266. dbt/task/deps.py +280 -0
  267. dbt/task/docs/__init__.py +3 -0
  268. dbt/task/docs/api/__init__.py +23 -0
  269. dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
  270. dbt/task/docs/api/catalog.py +204 -0
  271. dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
  272. dbt/task/docs/api/lineage.py +234 -0
  273. dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
  274. dbt/task/docs/api/profile.py +204 -0
  275. dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
  276. dbt/task/docs/api/spark.py +186 -0
  277. dbt/task/docs/generate.py +947 -0
  278. dbt/task/docs/index.html +250 -0
  279. dbt/task/docs/serve.cpython-311-darwin.so +0 -0
  280. dbt/task/docs/serve.py +174 -0
  281. dbt/task/dvt_output.py +362 -0
  282. dbt/task/dvt_run.py +204 -0
  283. dbt/task/freshness.py +322 -0
  284. dbt/task/function.py +121 -0
  285. dbt/task/group_lookup.py +46 -0
  286. dbt/task/init.cpython-311-darwin.so +0 -0
  287. dbt/task/init.py +604 -0
  288. dbt/task/java.cpython-311-darwin.so +0 -0
  289. dbt/task/java.py +316 -0
  290. dbt/task/list.py +236 -0
  291. dbt/task/metadata.cpython-311-darwin.so +0 -0
  292. dbt/task/metadata.py +804 -0
  293. dbt/task/printer.py +175 -0
  294. dbt/task/profile.cpython-311-darwin.so +0 -0
  295. dbt/task/profile.py +1307 -0
  296. dbt/task/profile_serve.py +615 -0
  297. dbt/task/retract.py +438 -0
  298. dbt/task/retry.py +175 -0
  299. dbt/task/run.py +1387 -0
  300. dbt/task/run_operation.py +141 -0
  301. dbt/task/runnable.py +758 -0
  302. dbt/task/seed.py +103 -0
  303. dbt/task/show.py +149 -0
  304. dbt/task/snapshot.py +56 -0
  305. dbt/task/spark.cpython-311-darwin.so +0 -0
  306. dbt/task/spark.py +414 -0
  307. dbt/task/sql.py +110 -0
  308. dbt/task/target_sync.cpython-311-darwin.so +0 -0
  309. dbt/task/target_sync.py +766 -0
  310. dbt/task/test.py +464 -0
  311. dbt/tests/fixtures/__init__.py +1 -0
  312. dbt/tests/fixtures/project.py +620 -0
  313. dbt/tests/util.py +651 -0
  314. dbt/tracking.py +529 -0
  315. dbt/utils/__init__.py +3 -0
  316. dbt/utils/artifact_upload.py +151 -0
  317. dbt/utils/utils.py +408 -0
  318. dbt/version.py +270 -0
  319. dvt_cli/__init__.py +72 -0
  320. dvt_core-0.58.6.dist-info/METADATA +288 -0
  321. dvt_core-0.58.6.dist-info/RECORD +324 -0
  322. dvt_core-0.58.6.dist-info/WHEEL +5 -0
  323. dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
  324. dvt_core-0.58.6.dist-info/top_level.txt +2 -0
@@ -0,0 +1,377 @@
1
+ """
2
+ Smart Compute Engine Selector
3
+
4
+ Selects compute engine based on DVT compute rules (NOT size-based).
5
+
6
+ v0.56.0: Refactored to follow DVT compute rules:
7
+ 1. CLI --target-compute override (highest priority)
8
+ 2. Model-level config {{ config(compute='...') }}
9
+ 3. Default from computes.yml target_compute
10
+ 4. Pushdown when model and all inputs are in same target (no Spark needed)
11
+
12
+ Selection is deterministic based on configuration, not data characteristics.
13
+ """
14
+
15
+ from dataclasses import dataclass
16
+ from enum import Enum
17
+ from typing import Any, Optional, Set
18
+
19
+ from dbt.contracts.graph.manifest import Manifest
20
+ from dbt.contracts.graph.nodes import ManifestNode
21
+ from dbt.query_analyzer import QueryAnalysisResult
22
+ from dbt_common.exceptions import DbtRuntimeError
23
+
24
+
25
class ExecutionStrategy(Enum):
    """How a node's query gets executed."""

    # Run directly on the target adapter over its own connection.
    PUSHDOWN = "pushdown"
    # Route through Spark when sources span more than one target.
    FEDERATED = "federated"
30
+
31
+
32
@dataclass
class WorkloadEstimate:
    """Informational workload profile for a query (not used for selection)."""

    estimated_rows: int  # estimated total rows to process
    source_count: int  # number of source tables
    connection_count: int  # number of distinct connections
    has_aggregations: bool  # query contains GROUP BY or aggregate calls
    has_joins: bool  # query contains JOIN operations
    complexity_score: float  # 0.0-1.0; higher means more complex

    @property
    def estimated_data_mb(self) -> float:
        """Approximate data volume in MB, assuming ~100 bytes per row."""
        estimated_bytes = self.estimated_rows * 100
        return estimated_bytes / (1024 * 1024)
47
+
48
+
49
class SmartComputeSelector:
    """
    Selects compute engine based on DVT compute rules.

    v0.56.0: Rule-based selection (NO size-based logic).

    Selection hierarchy (highest to lowest priority):
    1. CLI --target-compute override
    2. Model config: {{ config(compute='spark-cluster') }}
    3. Default from computes.yml target_compute

    Execution strategy:
    - PUSHDOWN: When model and all inputs are in same target
    - FEDERATED: When sources span multiple targets (requires Spark)
    """

    # Engine used when neither CLI, model config, nor a registry default applies.
    _FALLBACK_COMPUTE = "spark-local"

    # Aggregation markers, checked against whitespace-normalized uppercase SQL;
    # every needle is anchored by a leading space added by _normalize_sql().
    _AGGREGATION_KEYWORDS = (
        " GROUP BY ",
        " SUM(",
        " COUNT(",
        " AVG(",
        " MIN(",
        " MAX(",
        " HAVING ",
    )

    def __init__(
        self,
        manifest: "Manifest",
        compute_registry: Optional[Any] = None,
        cli_target_compute: Optional[str] = None,
    ):
        """
        Initialize smart selector.

        :param manifest: The dbt manifest
        :param compute_registry: ComputeRegistry instance for compute configuration
        :param cli_target_compute: CLI --target-compute override (highest priority)
        """
        self.manifest = manifest
        self.compute_registry = compute_registry
        self.cli_target_compute = cli_target_compute

    def select_engine(
        self,
        node: "ManifestNode",
        analysis_result: "QueryAnalysisResult",
        cli_override: Optional[str] = None,
    ) -> str:
        """
        Select compute engine based on DVT rules.

        v0.56.0: Rule-based selection (no size-based logic).

        Priority:
        1. cli_override parameter (passed at call time)
        2. self.cli_target_compute (passed at init time)
        3. Model config: {{ config(compute='...') }}
        4. Default from computes.yml target_compute

        :param node: The node to execute
        :param analysis_result: Query analysis result
        :param cli_override: CLI --target-compute override
        :returns: Compute engine name (e.g., "spark-local", "spark-cluster")
        :raises DbtRuntimeError: If specified compute doesn't exist
        """
        strategy = self._determine_execution_strategy(node, analysis_result)

        # Pushdown executes on the target adapter itself; no Spark compute needed.
        if strategy == ExecutionStrategy.PUSHDOWN:
            return "pushdown"

        # For federated execution, select a Spark compute engine.
        return self._select_compute_for_federation(node, cli_override)

    def _determine_execution_strategy(
        self, node: "ManifestNode", analysis_result: "QueryAnalysisResult"
    ) -> "ExecutionStrategy":
        """
        Determine whether to use pushdown or federation.

        DVT Rule: Pushdown when model and ALL inputs are in same target.

        :param node: The node to analyze
        :param analysis_result: Query analysis result
        :returns: ExecutionStrategy (PUSHDOWN or FEDERATED)
        """
        node_target = self._get_node_target(node)
        source_connections = analysis_result.source_connections

        # No external inputs at all: pure computation can run on the target.
        if not source_connections:
            return ExecutionStrategy.PUSHDOWN

        # Exactly one source connection matching the node's target:
        # everything lives in one place, so push the query down.
        if len(source_connections) == 1:
            only_connection = next(iter(source_connections))
            if only_connection == node_target:
                return ExecutionStrategy.PUSHDOWN

        # Multiple connections, or a single connection different from the
        # node's target - the query must be federated through Spark.
        return ExecutionStrategy.FEDERATED

    def _get_node_target(self, node: "ManifestNode") -> str:
        """
        Get the target connection for a node.

        :param node: The manifest node
        :returns: Target connection name; falls back to "default", which in
            DVT corresponds to the profiles.yml default target
        """
        configured = getattr(getattr(node, "config", None), "target", None)
        return configured if configured else "default"

    def _select_compute_for_federation(
        self, node: "ManifestNode", cli_override: Optional[str] = None
    ) -> str:
        """
        Select compute engine for federated execution.

        Priority:
        1. cli_override parameter (passed at call time)
        2. self.cli_target_compute (passed at init time)
        3. Model config: {{ config(compute='...') }}
        4. Default from computes.yml target_compute

        :param node: The node to execute
        :param cli_override: CLI --target-compute override
        :returns: Compute engine name
        :raises DbtRuntimeError: If specified compute doesn't exist
        """
        # Priorities 1 and 2: call-time override, then init-time override.
        compute_name = cli_override or self.cli_target_compute

        # Priority 3: model-level config.
        if not compute_name:
            compute_name = getattr(getattr(node, "config", None), "compute", None)

        # Priority 4: default from computes.yml. Bug fix: the previous
        # if/elif chain skipped this branch whenever the node exposed a
        # `compute` config attribute that happened to be None, silently
        # ignoring the registry default.
        if not compute_name and self.compute_registry:
            compute_name = self.compute_registry.target_compute

        # Final fallback when no registry default is available.
        if not compute_name:
            compute_name = self._FALLBACK_COMPUTE

        # Validate the compute engine exists
        if self.compute_registry and not self.compute_registry.exists(compute_name):
            available = [c.name for c in self.compute_registry.list()]
            raise DbtRuntimeError(
                f"Compute engine '{compute_name}' not found. "
                f"Available engines: {', '.join(available)}"
            )

        return compute_name

    def _estimate_workload(
        self, node: "ManifestNode", analysis_result: "QueryAnalysisResult"
    ) -> "WorkloadEstimate":
        """
        Estimate workload characteristics for a node.

        Note: Used for informational purposes only, NOT for compute selection.

        :param node: The node to analyze
        :param analysis_result: Query analysis result
        :returns: WorkloadEstimate
        """
        source_count = len(analysis_result.source_refs)
        connection_count = len(analysis_result.source_connections)

        # Estimate row count (informational only)
        estimated_rows = self._estimate_row_count(analysis_result.source_refs)

        # Fall back to raw_code when compiled_code is absent OR None — a node
        # that has not been compiled yet may carry compiled_code=None, which
        # previously crashed the keyword scans below.
        sql = getattr(node, "compiled_code", None) or node.raw_code
        has_aggregations = self._has_aggregations(sql)
        has_joins = self._has_joins(sql)

        complexity_score = self._calculate_complexity(
            source_count=source_count,
            connection_count=connection_count,
            has_aggregations=has_aggregations,
            has_joins=has_joins,
        )

        return WorkloadEstimate(
            estimated_rows=estimated_rows,
            source_count=source_count,
            connection_count=connection_count,
            has_aggregations=has_aggregations,
            has_joins=has_joins,
            complexity_score=complexity_score,
        )

    def _estimate_row_count(self, source_refs: set) -> int:
        """
        Estimate total row count from source tables.

        Note: Informational only. The heuristic is purely name-based:
        "fact"/"events" tables are assumed large, "dim"/"lookup" small,
        everything else (and unknown sources) medium.

        :param source_refs: Set of source unique_ids
        :returns: Estimated row count
        """
        total_rows = 0

        for source_id in source_refs:
            source = self.manifest.sources.get(source_id)
            if not source:
                total_rows += 100000
                continue

            name = source.identifier.lower()
            if "fact" in name or "events" in name:
                total_rows += 1000000
            elif "dim" in name or "lookup" in name:
                total_rows += 10000
            else:
                total_rows += 100000

        return total_rows

    @staticmethod
    def _normalize_sql(sql: str) -> str:
        """Uppercase *sql* and collapse every whitespace run (spaces, tabs,
        newlines) into a single space, padding both ends so keyword needles
        can anchor on a surrounding space."""
        return " " + " ".join(sql.upper().split()) + " "

    def _has_aggregations(self, sql: str) -> bool:
        """Check if SQL contains aggregations.

        Bug fix: whitespace is normalized first, so keywords adjacent to
        newlines or tabs (common in compiled SQL) are now detected; the old
        check required literal single spaces around each keyword.
        """
        normalized = self._normalize_sql(sql)
        return any(keyword in normalized for keyword in self._AGGREGATION_KEYWORDS)

    def _has_joins(self, sql: str) -> bool:
        """Check if SQL contains joins.

        A single " JOIN " needle suffices: INNER/LEFT/RIGHT/FULL/CROSS JOIN
        all contain " JOIN " after whitespace normalization.
        """
        return " JOIN " in self._normalize_sql(sql)

    def _calculate_complexity(
        self,
        source_count: int,
        connection_count: int,
        has_aggregations: bool,
        has_joins: bool,
    ) -> float:
        """Calculate query complexity score (0.0 to 1.0).

        Weights: sources contribute up to 0.3, connections up to 0.2,
        aggregations 0.2, joins 0.3. Informational only.
        """
        score = 0.0
        score += min(source_count / 10.0, 0.3)
        score += min(connection_count / 5.0, 0.2)
        if has_aggregations:
            score += 0.2
        if has_joins:
            score += 0.3
        return min(score, 1.0)

    def get_execution_strategy(
        self, node: "ManifestNode", analysis_result: "QueryAnalysisResult"
    ) -> "ExecutionStrategy":
        """
        Get the execution strategy for a node (public API).

        :param node: The node
        :param analysis_result: Query analysis result
        :returns: ExecutionStrategy enum
        """
        return self._determine_execution_strategy(node, analysis_result)

    def get_recommendation_reason(
        self, node: "ManifestNode", analysis_result: "QueryAnalysisResult"
    ) -> str:
        """
        Get human-readable explanation for engine selection.

        :param node: The node
        :param analysis_result: Query analysis result
        :returns: Explanation string
        """
        strategy = self._determine_execution_strategy(node, analysis_result)

        if strategy == ExecutionStrategy.PUSHDOWN:
            return "Pushdown: All sources in same target connection - executing directly"

        # Federated execution
        engine = self._select_compute_for_federation(node)
        estimate = self._estimate_workload(node, analysis_result)

        reasons = [f"Cross-target query ({estimate.connection_count} connections)"]

        model_compute = getattr(getattr(node, "config", None), "compute", None)
        if self.cli_target_compute:
            reasons.append(f"CLI override: --target-compute {self.cli_target_compute}")
        elif model_compute:
            reasons.append(f"Model config: compute='{model_compute}'")
        else:
            reasons.append("Using default from computes.yml")

        reason_str = "; ".join(reasons)
        return f"Federated ({engine}): {reason_str}"
@@ -0,0 +1,55 @@
1
+ """
2
+ Spark Connection Strategies
3
+
4
+ This module provides different strategies for connecting to Spark clusters.
5
+ Uses the strategy pattern for flexible platform support.
6
+
7
+ v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
8
+ v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
9
+ """
10
+
11
+ from dbt.compute.strategies.base import BaseConnectionStrategy
12
+ from dbt.compute.strategies.local import LocalStrategy, cleanup_all_spark_sessions
13
+
14
+ # Strategies are imported lazily to avoid import errors when
15
+ # optional dependencies are not installed
16
+
17
+
18
def get_emr_strategy():
    """Return the EMRStrategy class, importing it on first use.

    The import happens here rather than at module load so environments
    without the EMR dependencies can still import this package.

    :returns: EMRStrategy class
    """
    from dbt.compute.strategies.emr import EMRStrategy

    return EMRStrategy
26
+
27
+
28
def get_dataproc_strategy():
    """Return the DataprocStrategy class, importing it on first use.

    The import happens here rather than at module load so environments
    without the Dataproc dependencies can still import this package.

    :returns: DataprocStrategy class
    """
    from dbt.compute.strategies.dataproc import DataprocStrategy

    return DataprocStrategy
36
+
37
+
38
def get_standalone_strategy():
    """Return the StandaloneStrategy class, importing it on first use.

    The import happens here rather than at module load so environments
    without standalone-cluster dependencies can still import this package.

    :returns: StandaloneStrategy class
    """
    from dbt.compute.strategies.standalone import StandaloneStrategy

    return StandaloneStrategy
46
+
47
+
48
# Explicit public API. Only the base class and the always-importable local
# strategy are exported directly; the cloud strategies are reachable solely
# through their lazy getter functions above.
__all__ = [
    "BaseConnectionStrategy",
    "LocalStrategy",
    "cleanup_all_spark_sessions",
    "get_emr_strategy",
    "get_dataproc_strategy",
    "get_standalone_strategy",
]
@@ -0,0 +1,165 @@
1
+ """
2
+ Base Connection Strategy for Spark Engines
3
+
4
+ Defines the abstract interface for different Spark connection strategies.
5
+ Uses composition over inheritance for flexible platform support.
6
+
7
+ v0.5.98: Added JAR provisioning and connectivity testing methods.
8
+ """
9
+
10
+ from abc import ABC, abstractmethod
11
+ from typing import Any, Dict, Optional, Set, Tuple
12
+
13
# PySpark is an optional dependency. Probe the import once at module load and
# record the outcome so callers can check PYSPARK_AVAILABLE instead of
# handling ImportError themselves.
try:
    from pyspark.sql import SparkSession

    PYSPARK_AVAILABLE = True
except ImportError:
    PYSPARK_AVAILABLE = False
    # Placeholder so annotations referencing SparkSession still evaluate
    # when pyspark is not installed.
    SparkSession = None
20
+
21
+
22
class BaseConnectionStrategy(ABC):
    """
    Abstract base class for Spark connection strategies.

    Different strategies implement different ways to connect to Spark:
    - LocalStrategy: Embedded PySpark (in-process)
    - EMRStrategy: AWS EMR cluster
    - DataprocStrategy: GCP Dataproc
    - StandaloneStrategy: Self-managed Spark clusters

    (DatabricksStrategy was removed in v0.51.2: serverless cannot read
    external JDBC sources.)
    """

    def __init__(self, config: Dict[str, Any], app_name: str = "DVT-Compute"):
        """
        Initialize connection strategy.

        :param config: Strategy-specific configuration
        :param app_name: Spark application name
        """
        self.config = config
        self.app_name = app_name

    @abstractmethod
    def get_spark_session(self) -> "SparkSession":
        """
        Create and return a SparkSession.

        :returns: Initialized SparkSession
        :raises DbtRuntimeError: If session creation fails
        """

    @abstractmethod
    def validate_config(self) -> None:
        """
        Validate strategy-specific configuration.

        :raises DbtRuntimeError: If configuration is invalid
        """

    def estimate_cost(self, duration_minutes: float) -> float:
        """
        Estimate cost for running on this platform.

        Default implementation returns 0.0 (free). Override for cloud platforms.

        :param duration_minutes: Estimated query duration in minutes
        :returns: Estimated cost in USD
        """
        return 0.0

    @abstractmethod
    def close(self, spark: Optional["SparkSession"]) -> None:
        """
        Clean up Spark session.

        :param spark: SparkSession to clean up (may be None)
        """

    def get_platform_name(self) -> str:
        """
        Get human-readable platform name.

        Derived from the class name, e.g. LocalStrategy -> "local".

        :returns: Platform name (e.g., "local", "emr", "dataproc")
        """
        return self.__class__.__name__.replace("Strategy", "").lower()

    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
        """
        Get Spark configuration for JDBC JAR provisioning.

        Default implementation returns empty dict. Override in subclasses
        to provide platform-specific JAR configuration.

        Local platforms use spark.jars (local file paths).
        Remote platforms use spark.jars.packages (Maven coordinates).

        :param adapter_types: Set of adapter types that need JDBC drivers
        :returns: Dictionary of Spark config keys/values (e.g., {"spark.jars": "..."})
        """
        return {}

    def test_connectivity(self) -> Tuple[bool, str]:
        """
        Test basic connectivity to the Spark cluster.

        Creates a session, runs a simple query, and returns status.
        Override for platform-specific connectivity testing.

        NOTE(review): the session created here is intentionally not closed -
        session lifetime is the caller's/strategy's concern via close().

        :returns: Tuple of (success, message)
        """
        try:
            spark = self.get_spark_session()
            # Run a simple SQL query to verify connectivity
            spark.sql("SELECT 1 AS test").collect()
            return (True, "Session created and SQL test passed")
        except Exception as e:
            return (False, str(e))

    def test_jdbc_connectivity(
        self,
        jdbc_url: str,
        properties: Dict[str, str],
        table_or_query: str = "(SELECT 1 AS test) AS t",
    ) -> Tuple[bool, str]:
        """
        Test JDBC connectivity through the Spark cluster.

        Creates a session and attempts to read from a JDBC source.
        This verifies that JDBC drivers are properly configured.

        :param jdbc_url: JDBC connection URL
        :param properties: JDBC connection properties (user, password, driver)
        :param table_or_query: Table name or SQL query wrapped in parentheses
        :returns: Tuple of (success, message)
        """
        try:
            spark = self.get_spark_session()

            # Attempt JDBC read
            df = (
                spark.read.format("jdbc")
                .option("url", jdbc_url)
                .option("dbtable", table_or_query)
                .options(**properties)
                .load()
            )

            # Force evaluation
            row_count = df.count()
            return (True, f"JDBC read successful ({row_count} rows)")
        except Exception as e:
            error_msg = str(e)
            lowered = error_msg.lower()
            # Provide helpful error messages for common issues.
            # Java class names are case-exact, so match those verbatim.
            if "ClassNotFoundException" in error_msg:
                return (False, f"JDBC driver not found: {error_msg}")
            elif "No suitable driver" in error_msg:
                return (False, f"JDBC driver not loaded: {error_msg}")
            # Bug fix: previously the capitalized needle "Authentication" was
            # searched inside error_msg.lower(), which could never match, so
            # auth failures always fell through to the generic message.
            elif "authentication" in lowered or "password" in lowered:
                return (False, f"Authentication failed: {error_msg}")
            else:
                return (False, f"JDBC test failed: {error_msg}")