dvt-core 0.59.0a51 (dvt_core-0.59.0a51-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. dbt/__init__.py +7 -0
  2. dbt/_pydantic_shim.py +26 -0
  3. dbt/artifacts/__init__.py +0 -0
  4. dbt/artifacts/exceptions/__init__.py +1 -0
  5. dbt/artifacts/exceptions/schemas.py +31 -0
  6. dbt/artifacts/resources/__init__.py +116 -0
  7. dbt/artifacts/resources/base.py +67 -0
  8. dbt/artifacts/resources/types.py +93 -0
  9. dbt/artifacts/resources/v1/analysis.py +10 -0
  10. dbt/artifacts/resources/v1/catalog.py +23 -0
  11. dbt/artifacts/resources/v1/components.py +274 -0
  12. dbt/artifacts/resources/v1/config.py +277 -0
  13. dbt/artifacts/resources/v1/documentation.py +11 -0
  14. dbt/artifacts/resources/v1/exposure.py +51 -0
  15. dbt/artifacts/resources/v1/function.py +52 -0
  16. dbt/artifacts/resources/v1/generic_test.py +31 -0
  17. dbt/artifacts/resources/v1/group.py +21 -0
  18. dbt/artifacts/resources/v1/hook.py +11 -0
  19. dbt/artifacts/resources/v1/macro.py +29 -0
  20. dbt/artifacts/resources/v1/metric.py +172 -0
  21. dbt/artifacts/resources/v1/model.py +145 -0
  22. dbt/artifacts/resources/v1/owner.py +10 -0
  23. dbt/artifacts/resources/v1/saved_query.py +111 -0
  24. dbt/artifacts/resources/v1/seed.py +41 -0
  25. dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  26. dbt/artifacts/resources/v1/semantic_model.py +314 -0
  27. dbt/artifacts/resources/v1/singular_test.py +14 -0
  28. dbt/artifacts/resources/v1/snapshot.py +91 -0
  29. dbt/artifacts/resources/v1/source_definition.py +84 -0
  30. dbt/artifacts/resources/v1/sql_operation.py +10 -0
  31. dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
  32. dbt/artifacts/schemas/__init__.py +0 -0
  33. dbt/artifacts/schemas/base.py +191 -0
  34. dbt/artifacts/schemas/batch_results.py +24 -0
  35. dbt/artifacts/schemas/catalog/__init__.py +11 -0
  36. dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  37. dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
  38. dbt/artifacts/schemas/freshness/__init__.py +1 -0
  39. dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  40. dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
  41. dbt/artifacts/schemas/manifest/__init__.py +2 -0
  42. dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  43. dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
  44. dbt/artifacts/schemas/results.py +147 -0
  45. dbt/artifacts/schemas/run/__init__.py +2 -0
  46. dbt/artifacts/schemas/run/v5/__init__.py +0 -0
  47. dbt/artifacts/schemas/run/v5/run.py +184 -0
  48. dbt/artifacts/schemas/upgrades/__init__.py +4 -0
  49. dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  50. dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  51. dbt/artifacts/utils/validation.py +153 -0
  52. dbt/cli/__init__.py +1 -0
  53. dbt/cli/context.py +17 -0
  54. dbt/cli/exceptions.py +57 -0
  55. dbt/cli/flags.py +560 -0
  56. dbt/cli/main.py +2660 -0
  57. dbt/cli/option_types.py +121 -0
  58. dbt/cli/options.py +80 -0
  59. dbt/cli/params.py +844 -0
  60. dbt/cli/requires.py +490 -0
  61. dbt/cli/resolvers.py +60 -0
  62. dbt/cli/types.py +40 -0
  63. dbt/clients/__init__.py +0 -0
  64. dbt/clients/checked_load.py +83 -0
  65. dbt/clients/git.py +164 -0
  66. dbt/clients/jinja.py +206 -0
  67. dbt/clients/jinja_static.py +245 -0
  68. dbt/clients/registry.py +192 -0
  69. dbt/clients/yaml_helper.py +68 -0
  70. dbt/compilation.py +876 -0
  71. dbt/compute/__init__.py +14 -0
  72. dbt/compute/engines/__init__.py +12 -0
  73. dbt/compute/engines/spark_engine.py +642 -0
  74. dbt/compute/federated_executor.py +1080 -0
  75. dbt/compute/filter_pushdown.py +273 -0
  76. dbt/compute/jar_provisioning.py +273 -0
  77. dbt/compute/java_compat.py +689 -0
  78. dbt/compute/jdbc_utils.py +1252 -0
  79. dbt/compute/metadata/__init__.py +63 -0
  80. dbt/compute/metadata/adapters_registry.py +370 -0
  81. dbt/compute/metadata/catalog_store.py +1036 -0
  82. dbt/compute/metadata/registry.py +674 -0
  83. dbt/compute/metadata/store.py +1020 -0
  84. dbt/compute/smart_selector.py +377 -0
  85. dbt/compute/spark_logger.py +272 -0
  86. dbt/compute/strategies/__init__.py +55 -0
  87. dbt/compute/strategies/base.py +165 -0
  88. dbt/compute/strategies/dataproc.py +207 -0
  89. dbt/compute/strategies/emr.py +203 -0
  90. dbt/compute/strategies/local.py +472 -0
  91. dbt/compute/strategies/standalone.py +262 -0
  92. dbt/config/__init__.py +4 -0
  93. dbt/config/catalogs.py +94 -0
  94. dbt/config/compute.py +513 -0
  95. dbt/config/dvt_profile.py +408 -0
  96. dbt/config/profile.py +422 -0
  97. dbt/config/project.py +888 -0
  98. dbt/config/project_utils.py +48 -0
  99. dbt/config/renderer.py +231 -0
  100. dbt/config/runtime.py +564 -0
  101. dbt/config/selectors.py +208 -0
  102. dbt/config/utils.py +77 -0
  103. dbt/constants.py +28 -0
  104. dbt/context/__init__.py +0 -0
  105. dbt/context/base.py +745 -0
  106. dbt/context/configured.py +135 -0
  107. dbt/context/context_config.py +382 -0
  108. dbt/context/docs.py +82 -0
  109. dbt/context/exceptions_jinja.py +178 -0
  110. dbt/context/macro_resolver.py +195 -0
  111. dbt/context/macros.py +171 -0
  112. dbt/context/manifest.py +72 -0
  113. dbt/context/providers.py +2249 -0
  114. dbt/context/query_header.py +13 -0
  115. dbt/context/secret.py +58 -0
  116. dbt/context/target.py +74 -0
  117. dbt/contracts/__init__.py +0 -0
  118. dbt/contracts/files.py +413 -0
  119. dbt/contracts/graph/__init__.py +0 -0
  120. dbt/contracts/graph/manifest.py +1904 -0
  121. dbt/contracts/graph/metrics.py +97 -0
  122. dbt/contracts/graph/model_config.py +70 -0
  123. dbt/contracts/graph/node_args.py +42 -0
  124. dbt/contracts/graph/nodes.py +1806 -0
  125. dbt/contracts/graph/semantic_manifest.py +232 -0
  126. dbt/contracts/graph/unparsed.py +811 -0
  127. dbt/contracts/project.py +419 -0
  128. dbt/contracts/results.py +53 -0
  129. dbt/contracts/selection.py +23 -0
  130. dbt/contracts/sql.py +85 -0
  131. dbt/contracts/state.py +68 -0
  132. dbt/contracts/util.py +46 -0
  133. dbt/deprecations.py +348 -0
  134. dbt/deps/__init__.py +0 -0
  135. dbt/deps/base.py +152 -0
  136. dbt/deps/git.py +195 -0
  137. dbt/deps/local.py +79 -0
  138. dbt/deps/registry.py +130 -0
  139. dbt/deps/resolver.py +149 -0
  140. dbt/deps/tarball.py +120 -0
  141. dbt/docs/source/_ext/dbt_click.py +119 -0
  142. dbt/docs/source/conf.py +32 -0
  143. dbt/env_vars.py +64 -0
  144. dbt/event_time/event_time.py +40 -0
  145. dbt/event_time/sample_window.py +60 -0
  146. dbt/events/__init__.py +15 -0
  147. dbt/events/base_types.py +36 -0
  148. dbt/events/core_types_pb2.py +2 -0
  149. dbt/events/logging.py +108 -0
  150. dbt/events/types.py +2516 -0
  151. dbt/exceptions.py +1486 -0
  152. dbt/flags.py +89 -0
  153. dbt/graph/__init__.py +11 -0
  154. dbt/graph/cli.py +249 -0
  155. dbt/graph/graph.py +172 -0
  156. dbt/graph/queue.py +214 -0
  157. dbt/graph/selector.py +374 -0
  158. dbt/graph/selector_methods.py +975 -0
  159. dbt/graph/selector_spec.py +222 -0
  160. dbt/graph/thread_pool.py +18 -0
  161. dbt/hooks.py +21 -0
  162. dbt/include/README.md +49 -0
  163. dbt/include/__init__.py +3 -0
  164. dbt/include/data/adapters_registry.duckdb +0 -0
  165. dbt/include/data/build_comprehensive_registry.py +1254 -0
  166. dbt/include/data/build_registry.py +242 -0
  167. dbt/include/data/csv/adapter_queries.csv +33 -0
  168. dbt/include/data/csv/syntax_rules.csv +9 -0
  169. dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
  170. dbt/include/data/csv/type_mappings_databricks.csv +30 -0
  171. dbt/include/data/csv/type_mappings_mysql.csv +40 -0
  172. dbt/include/data/csv/type_mappings_oracle.csv +30 -0
  173. dbt/include/data/csv/type_mappings_postgres.csv +56 -0
  174. dbt/include/data/csv/type_mappings_redshift.csv +33 -0
  175. dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
  176. dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
  177. dbt/include/dvt_starter_project/README.md +15 -0
  178. dbt/include/dvt_starter_project/__init__.py +3 -0
  179. dbt/include/dvt_starter_project/analyses/PLACEHOLDER +0 -0
  180. dbt/include/dvt_starter_project/dvt_project.yml +39 -0
  181. dbt/include/dvt_starter_project/logs/PLACEHOLDER +0 -0
  182. dbt/include/dvt_starter_project/macros/PLACEHOLDER +0 -0
  183. dbt/include/dvt_starter_project/models/example/my_first_dbt_model.sql +27 -0
  184. dbt/include/dvt_starter_project/models/example/my_second_dbt_model.sql +6 -0
  185. dbt/include/dvt_starter_project/models/example/schema.yml +21 -0
  186. dbt/include/dvt_starter_project/seeds/PLACEHOLDER +0 -0
  187. dbt/include/dvt_starter_project/snapshots/PLACEHOLDER +0 -0
  188. dbt/include/dvt_starter_project/tests/PLACEHOLDER +0 -0
  189. dbt/internal_deprecations.py +26 -0
  190. dbt/jsonschemas/__init__.py +3 -0
  191. dbt/jsonschemas/jsonschemas.py +309 -0
  192. dbt/jsonschemas/project/0.0.110.json +4717 -0
  193. dbt/jsonschemas/project/0.0.85.json +2015 -0
  194. dbt/jsonschemas/resources/0.0.110.json +2636 -0
  195. dbt/jsonschemas/resources/0.0.85.json +2536 -0
  196. dbt/jsonschemas/resources/latest.json +6773 -0
  197. dbt/links.py +4 -0
  198. dbt/materializations/__init__.py +0 -0
  199. dbt/materializations/incremental/__init__.py +0 -0
  200. dbt/materializations/incremental/microbatch.py +236 -0
  201. dbt/mp_context.py +8 -0
  202. dbt/node_types.py +37 -0
  203. dbt/parser/__init__.py +23 -0
  204. dbt/parser/analysis.py +21 -0
  205. dbt/parser/base.py +548 -0
  206. dbt/parser/common.py +266 -0
  207. dbt/parser/docs.py +52 -0
  208. dbt/parser/fixtures.py +51 -0
  209. dbt/parser/functions.py +30 -0
  210. dbt/parser/generic_test.py +100 -0
  211. dbt/parser/generic_test_builders.py +333 -0
  212. dbt/parser/hooks.py +122 -0
  213. dbt/parser/macros.py +137 -0
  214. dbt/parser/manifest.py +2208 -0
  215. dbt/parser/models.py +573 -0
  216. dbt/parser/partial.py +1178 -0
  217. dbt/parser/read_files.py +445 -0
  218. dbt/parser/schema_generic_tests.py +422 -0
  219. dbt/parser/schema_renderer.py +111 -0
  220. dbt/parser/schema_yaml_readers.py +935 -0
  221. dbt/parser/schemas.py +1466 -0
  222. dbt/parser/search.py +149 -0
  223. dbt/parser/seeds.py +28 -0
  224. dbt/parser/singular_test.py +20 -0
  225. dbt/parser/snapshots.py +44 -0
  226. dbt/parser/sources.py +558 -0
  227. dbt/parser/sql.py +62 -0
  228. dbt/parser/unit_tests.py +621 -0
  229. dbt/plugins/__init__.py +20 -0
  230. dbt/plugins/contracts.py +9 -0
  231. dbt/plugins/exceptions.py +2 -0
  232. dbt/plugins/manager.py +163 -0
  233. dbt/plugins/manifest.py +21 -0
  234. dbt/profiler.py +20 -0
  235. dbt/py.typed +1 -0
  236. dbt/query_analyzer.py +410 -0
  237. dbt/runners/__init__.py +2 -0
  238. dbt/runners/exposure_runner.py +7 -0
  239. dbt/runners/no_op_runner.py +45 -0
  240. dbt/runners/saved_query_runner.py +7 -0
  241. dbt/selected_resources.py +8 -0
  242. dbt/task/__init__.py +0 -0
  243. dbt/task/base.py +506 -0
  244. dbt/task/build.py +197 -0
  245. dbt/task/clean.py +56 -0
  246. dbt/task/clone.py +161 -0
  247. dbt/task/compile.py +150 -0
  248. dbt/task/compute.py +458 -0
  249. dbt/task/debug.py +513 -0
  250. dbt/task/deps.py +280 -0
  251. dbt/task/docs/__init__.py +3 -0
  252. dbt/task/docs/api/__init__.py +23 -0
  253. dbt/task/docs/api/catalog.py +204 -0
  254. dbt/task/docs/api/lineage.py +234 -0
  255. dbt/task/docs/api/profile.py +204 -0
  256. dbt/task/docs/api/spark.py +186 -0
  257. dbt/task/docs/generate.py +1002 -0
  258. dbt/task/docs/index.html +250 -0
  259. dbt/task/docs/serve.py +174 -0
  260. dbt/task/dvt_output.py +509 -0
  261. dbt/task/dvt_run.py +282 -0
  262. dbt/task/dvt_seed.py +806 -0
  263. dbt/task/freshness.py +322 -0
  264. dbt/task/function.py +121 -0
  265. dbt/task/group_lookup.py +46 -0
  266. dbt/task/init.py +1022 -0
  267. dbt/task/java.py +316 -0
  268. dbt/task/list.py +236 -0
  269. dbt/task/metadata.py +804 -0
  270. dbt/task/migrate.py +714 -0
  271. dbt/task/printer.py +175 -0
  272. dbt/task/profile.py +1489 -0
  273. dbt/task/profile_serve.py +662 -0
  274. dbt/task/retract.py +441 -0
  275. dbt/task/retry.py +175 -0
  276. dbt/task/run.py +1647 -0
  277. dbt/task/run_operation.py +141 -0
  278. dbt/task/runnable.py +758 -0
  279. dbt/task/seed.py +103 -0
  280. dbt/task/show.py +149 -0
  281. dbt/task/snapshot.py +56 -0
  282. dbt/task/spark.py +414 -0
  283. dbt/task/sql.py +110 -0
  284. dbt/task/target_sync.py +814 -0
  285. dbt/task/test.py +464 -0
  286. dbt/tests/fixtures/__init__.py +1 -0
  287. dbt/tests/fixtures/project.py +620 -0
  288. dbt/tests/util.py +651 -0
  289. dbt/tracking.py +529 -0
  290. dbt/utils/__init__.py +3 -0
  291. dbt/utils/artifact_upload.py +151 -0
  292. dbt/utils/utils.py +408 -0
  293. dbt/version.py +271 -0
  294. dvt_cli/__init__.py +158 -0
  295. dvt_core-0.59.0a51.dist-info/METADATA +288 -0
  296. dvt_core-0.59.0a51.dist-info/RECORD +299 -0
  297. dvt_core-0.59.0a51.dist-info/WHEEL +5 -0
  298. dvt_core-0.59.0a51.dist-info/entry_points.txt +2 -0
  299. dvt_core-0.59.0a51.dist-info/top_level.txt +2 -0
dbt/compute/smart_selector.py
@@ -0,0 +1,377 @@
+"""
+Smart Compute Engine Selector
+
+Selects compute engine based on DVT compute rules (NOT size-based).
+
+v0.56.0: Refactored to follow DVT compute rules:
+1. CLI --target-compute override (highest priority)
+2. Model-level config {{ config(compute='...') }}
+3. Default from computes.yml target_compute
+4. Pushdown when model and all inputs are in same target (no Spark needed)
+
+Selection is deterministic based on configuration, not data characteristics.
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Optional, Set
+
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+from dbt_common.exceptions import DbtRuntimeError
+
+
+class ExecutionStrategy(Enum):
+    """Execution strategy for a node."""
+
+    PUSHDOWN = "pushdown"  # Execute directly on target adapter (same connection)
+    FEDERATED = "federated"  # Execute via Spark for cross-target queries
+
+
+@dataclass
+class WorkloadEstimate:
+    """Estimated workload characteristics for a query."""
+
+    estimated_rows: int  # Estimated total rows to process
+    source_count: int  # Number of source tables
+    connection_count: int  # Number of different connections
+    has_aggregations: bool  # Query contains GROUP BY or aggregations
+    has_joins: bool  # Query contains JOIN operations
+    complexity_score: float  # 0.0 to 1.0, higher = more complex
+
+    @property
+    def estimated_data_mb(self) -> float:
+        """Rough estimate of data size in MB (assuming ~100 bytes/row)."""
+        return (self.estimated_rows * 100) / (1024 * 1024)
+
+
+class SmartComputeSelector:
+    """
+    Selects compute engine based on DVT compute rules.
+
+    v0.56.0: Rule-based selection (NO size-based logic).
+
+    Selection hierarchy (highest to lowest priority):
+    1. CLI --target-compute override
+    2. Model config: {{ config(compute='spark-cluster') }}
+    3. Default from computes.yml target_compute
+
+    Execution strategy:
+    - PUSHDOWN: When model and all inputs are in same target
+    - FEDERATED: When sources span multiple targets (requires Spark)
+    """
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        compute_registry: Optional[Any] = None,
+        cli_target_compute: Optional[str] = None,
+    ):
+        """
+        Initialize smart selector.
+
+        :param manifest: The dbt manifest
+        :param compute_registry: ComputeRegistry instance for compute configuration
+        :param cli_target_compute: CLI --target-compute override (highest priority)
+        """
+        self.manifest = manifest
+        self.compute_registry = compute_registry
+        self.cli_target_compute = cli_target_compute
+
+    def select_engine(
+        self,
+        node: ManifestNode,
+        analysis_result: QueryAnalysisResult,
+        cli_override: Optional[str] = None,
+    ) -> str:
+        """
+        Select compute engine based on DVT rules.
+
+        v0.56.0: Rule-based selection (no size-based logic).
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param analysis_result: Query analysis result
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name (e.g., "spark-local", "spark-cluster")
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        # Determine execution strategy first
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        # For pushdown, no Spark compute needed
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "pushdown"
+
+        # For federated execution, select compute engine
+        return self._select_compute_for_federation(node, cli_override)
+
+    def _determine_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Determine whether to use pushdown or federation.
+
+        DVT Rule: Pushdown when model and ALL inputs are in same target.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy (PUSHDOWN or FEDERATED)
+        """
+        # Get target connection for this node
+        node_target = self._get_node_target(node)
+
+        # Get all source connections
+        source_connections = analysis_result.source_connections
+
+        # If no sources, can use pushdown (pure computation)
+        if not source_connections:
+            return ExecutionStrategy.PUSHDOWN
+
+        # Check if all sources are in the same connection as the target
+        if len(source_connections) == 1:
+            source_connection = next(iter(source_connections))
+            if source_connection == node_target:
+                # Same connection - use pushdown
+                return ExecutionStrategy.PUSHDOWN
+
+        # Multiple connections or different target - must federate
+        return ExecutionStrategy.FEDERATED
+
+    def _get_node_target(self, node: ManifestNode) -> str:
+        """
+        Get the target connection for a node.
+
+        :param node: The manifest node
+        :returns: Target connection name
+        """
+        # Check if node has explicit target config
+        if hasattr(node, "config") and hasattr(node.config, "target"):
+            if node.config.target:
+                return node.config.target
+
+        # Otherwise, use default target from manifest
+        # Note: In DVT, this comes from profiles.yml default target
+        return "default"
+
+    def _select_compute_for_federation(
+        self, node: ManifestNode, cli_override: Optional[str] = None
+    ) -> str:
+        """
+        Select compute engine for federated execution.
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        compute_name = None
+
+        # Priority 1: CLI override (call-time)
+        if cli_override:
+            compute_name = cli_override
+
+        # Priority 2: CLI override (init-time)
+        elif self.cli_target_compute:
+            compute_name = self.cli_target_compute
+
+        # Priority 3: Model-level config
+        elif hasattr(node, "config") and hasattr(node.config, "compute"):
+            if node.config.compute:
+                compute_name = node.config.compute
+
+        # Priority 4: Default from computes.yml
+        elif self.compute_registry:
+            compute_name = self.compute_registry.target_compute
+
+        # Fallback if no registry
+        if not compute_name:
+            compute_name = "spark-local"
+
+        # Validate the compute engine exists
+        if self.compute_registry and not self.compute_registry.exists(compute_name):
+            available = [c.name for c in self.compute_registry.list()]
+            raise DbtRuntimeError(
+                f"Compute engine '{compute_name}' not found. "
+                f"Available engines: {', '.join(available)}"
+            )
+
+        return compute_name
+
+    def _estimate_workload(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> WorkloadEstimate:
+        """
+        Estimate workload characteristics for a node.
+
+        Note: Used for informational purposes only, NOT for compute selection.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: WorkloadEstimate
+        """
+        # Count sources
+        source_count = len(analysis_result.source_refs)
+        connection_count = len(analysis_result.source_connections)
+
+        # Estimate row count (informational only)
+        estimated_rows = self._estimate_row_count(analysis_result.source_refs)
+
+        # Analyze SQL for complexity (informational only)
+        sql = node.compiled_code if hasattr(node, "compiled_code") else node.raw_code
+        has_aggregations = self._has_aggregations(sql)
+        has_joins = self._has_joins(sql)
+
+        # Calculate complexity score (informational only)
+        complexity_score = self._calculate_complexity(
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+        )
+
+        return WorkloadEstimate(
+            estimated_rows=estimated_rows,
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+            complexity_score=complexity_score,
+        )
+
+    def _estimate_row_count(self, source_refs: set) -> int:
+        """
+        Estimate total row count from source tables.
+
+        Note: Used for informational purposes only.
+
+        :param source_refs: Set of source unique_ids
+        :returns: Estimated row count
+        """
+        total_rows = 0
+
+        for source_id in source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                total_rows += 100000
+                continue
+
+            # Heuristic based on naming (informational only)
+            if (
+                "fact" in source.identifier.lower()
+                or "events" in source.identifier.lower()
+            ):
+                total_rows += 1000000
+            elif (
+                "dim" in source.identifier.lower()
+                or "lookup" in source.identifier.lower()
+            ):
+                total_rows += 10000
+            else:
+                total_rows += 100000
+
+        return total_rows
+
+    def _has_aggregations(self, sql: str) -> bool:
+        """Check if SQL contains aggregations."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " GROUP BY ",
+                " SUM(",
+                " COUNT(",
+                " AVG(",
+                " MIN(",
+                " MAX(",
+                " HAVING ",
+            ]
+        )
+
+    def _has_joins(self, sql: str) -> bool:
+        """Check if SQL contains joins."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " JOIN ",
+                " INNER JOIN ",
+                " LEFT JOIN ",
+                " RIGHT JOIN ",
+                " FULL JOIN ",
+                " CROSS JOIN ",
+            ]
+        )
+
+    def _calculate_complexity(
+        self,
+        source_count: int,
+        connection_count: int,
+        has_aggregations: bool,
+        has_joins: bool,
+    ) -> float:
+        """Calculate query complexity score (0.0 to 1.0)."""
+        score = 0.0
+        score += min(source_count / 10.0, 0.3)
+        score += min(connection_count / 5.0, 0.2)
+        if has_aggregations:
+            score += 0.2
+        if has_joins:
+            score += 0.3
+        return min(score, 1.0)
+
+    def get_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Get the execution strategy for a node (public API).
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy enum
+        """
+        return self._determine_execution_strategy(node, analysis_result)
+
+    def get_recommendation_reason(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Get human-readable explanation for engine selection.
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: Explanation string
+        """
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "Pushdown: All sources in same target connection - executing directly"
+
+        # Federated execution
+        engine = self._select_compute_for_federation(node)
+        estimate = self._estimate_workload(node, analysis_result)
+
+        reasons = []
+        reasons.append(f"Cross-target query ({estimate.connection_count} connections)")
+
+        if self.cli_target_compute:
+            reasons.append(f"CLI override: --target-compute {self.cli_target_compute}")
+        elif hasattr(node, "config") and hasattr(node.config, "compute") and node.config.compute:
+            reasons.append(f"Model config: compute='{node.config.compute}'")
+        else:
+            reasons.append("Using default from computes.yml")

+        reason_str = "; ".join(reasons)
+        return f"Federated ({engine}): {reason_str}"
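A minimal usage sketch for smart_selector.py above, using only the API shown in the diff. The manifest, compute_registry, node, and analysis_result objects are assumed to come from the surrounding dbt/DVT run (parser, compute registry, query analyzer) and are not constructed here:

    from dbt.compute.smart_selector import ExecutionStrategy, SmartComputeSelector

    # manifest, compute_registry, node, and analysis_result are hypothetical
    # objects produced elsewhere in the run.
    selector = SmartComputeSelector(
        manifest=manifest,
        compute_registry=compute_registry,
        cli_target_compute=None,  # no --target-compute override
    )

    strategy = selector.get_execution_strategy(node, analysis_result)
    if strategy == ExecutionStrategy.PUSHDOWN:
        engine = "pushdown"  # all inputs share the node's target; run on the target adapter
    else:
        # Cross-target query: resolve a Spark compute via the priority chain
        engine = selector.select_engine(node, analysis_result)

    print(selector.get_recommendation_reason(node, analysis_result))
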
dbt/compute/spark_logger.py
@@ -0,0 +1,272 @@
+# =============================================================================
+# DVT Spark Output Logger
+# =============================================================================
+# Captures Spark/compute output to log files for debugging while keeping
+# console clean with progress bars.
+#
+# DVT v0.59.0a36: New module for Spark output capture
+# =============================================================================
+
+from __future__ import annotations
+
+import sys
+import threading
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, TextIO
+
+
+class TeeWriter:
+    """
+    A writer that writes to both the original stream and a log file.
+
+    This allows us to capture Spark output to a log file while still
+    passing it through (though in practice we suppress console output
+    by using Rich's Live display which takes over the terminal).
+    """
+
+    def __init__(self, original: TextIO, log_file: TextIO, suppress_console: bool = True):
+        self.original = original
+        self.log_file = log_file
+        self.suppress_console = suppress_console
+        self._lock = threading.Lock()
+
+    def write(self, data: str) -> int:
+        """Write data to log file and optionally to original stream."""
+        with self._lock:
+            # Always write to log file
+            try:
+                self.log_file.write(data)
+                self.log_file.flush()
+            except Exception:
+                pass  # Don't break if log file write fails
+
+            # Write to original only if not suppressing console
+            if not self.suppress_console:
+                return self.original.write(data)
+
+            return len(data)
+
+    def flush(self) -> None:
+        """Flush both streams."""
+        with self._lock:
+            try:
+                self.log_file.flush()
+            except Exception:
+                pass
+            if not self.suppress_console:
+                self.original.flush()
+
+    def fileno(self) -> int:
+        """Return the file descriptor of the original stream."""
+        return self.original.fileno()
+
+    def isatty(self) -> bool:
+        """Return whether the original stream is a tty."""
+        return self.original.isatty()
+
+
+class SparkOutputLogger:
+    """
+    Captures Spark/compute stderr and stdout to a log file.
+
+    The log file is written to target/{compute_name}_log.txt and overwrites
+    each time a new session starts. Each session is separated by a clear
+    header with timestamp.
+
+    Usage:
+        logger = SparkOutputLogger.get_instance(target_dir="/path/to/target", compute_name="spark")
+        logger.start_session()
+        # ... Spark operations ...
+        logger.end_session()
+
+    The logger is a singleton per (target_dir, compute_name) combination.
+    """
+
+    _instances: dict[tuple[str, str], "SparkOutputLogger"] = {}
+    _global_lock = threading.Lock()
+
+    def __init__(self, target_dir: str, compute_name: str = "spark"):
+        """
+        Initialize the Spark output logger.
+
+        Args:
+            target_dir: Path to the dbt target directory
+            compute_name: Name of the compute engine (used in log filename)
+        """
+        self.target_dir = Path(target_dir)
+        self.compute_name = compute_name
+        self.log_path = self.target_dir / f"{compute_name}_log.txt"
+        self._log_file: Optional[TextIO] = None
+        self._original_stderr: Optional[TextIO] = None
+        self._original_stdout: Optional[TextIO] = None
+        self._tee_stderr: Optional[TeeWriter] = None
+        self._tee_stdout: Optional[TeeWriter] = None
+        self._session_active = False
+        self._lock = threading.Lock()
+
+    @classmethod
+    def get_instance(cls, target_dir: str, compute_name: str = "spark") -> "SparkOutputLogger":
+        """
+        Get or create a singleton instance for the given target_dir and compute_name.
+
+        Args:
+            target_dir: Path to the dbt target directory
+            compute_name: Name of the compute engine
+
+        Returns:
+            SparkOutputLogger instance
+        """
+        key = (str(target_dir), compute_name)
+        with cls._global_lock:
+            if key not in cls._instances:
+                cls._instances[key] = cls(target_dir, compute_name)
+            return cls._instances[key]
+
+    def start_session(self, suppress_console: bool = True) -> None:
+        """
+        Start a new logging session.
+
+        This overwrites the previous log file and writes a session header.
+        stderr and stdout are redirected to capture Spark output.
+
+        Args:
+            suppress_console: If True, suppress output to console (default: True)
+        """
+        with self._lock:
+            if self._session_active:
+                return  # Already active
+
+            try:
+                # Ensure target directory exists
+                self.target_dir.mkdir(parents=True, exist_ok=True)
+
+                # Open log file (overwrite mode)
+                self._log_file = open(self.log_path, 'w', encoding='utf-8')
+
+                # Write session header
+                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                self._log_file.write("=" * 80 + "\n")
+                self._log_file.write(f" DVT {self.compute_name.upper()} LOG\n")
+                self._log_file.write(f" Session started: {timestamp}\n")
+                self._log_file.write("=" * 80 + "\n\n")
+                self._log_file.flush()
+
+                # Save original streams
+                self._original_stderr = sys.stderr
+                self._original_stdout = sys.stdout
+
+                # Create tee writers
+                self._tee_stderr = TeeWriter(
+                    self._original_stderr,
+                    self._log_file,
+                    suppress_console=suppress_console,
+                )
+                self._tee_stdout = TeeWriter(
+                    self._original_stdout,
+                    self._log_file,
+                    suppress_console=suppress_console,
+                )
+
+                # Redirect stderr and stdout
+                sys.stderr = self._tee_stderr  # type: ignore
+                sys.stdout = self._tee_stdout  # type: ignore
+
+                self._session_active = True
+
+            except Exception as e:
+                # Don't break the application if logging fails
+                self._cleanup()
+                # Optionally log the error
+                try:
+                    if self._original_stderr:
+                        self._original_stderr.write(f"[DVT] Warning: Could not start Spark logging: {e}\n")
+                except Exception:
+                    pass
+
+    def write_separator(self, label: str = "") -> None:
+        """
+        Write a separator line to the log file.
+
+        Useful for marking different phases of Spark execution.
+
+        Args:
+            label: Optional label for the separator
+        """
+        with self._lock:
+            if self._log_file:
+                try:
+                    timestamp = datetime.now().strftime("%H:%M:%S")
+                    if label:
+                        self._log_file.write(f"\n--- [{timestamp}] {label} ---\n\n")
+                    else:
+                        self._log_file.write(f"\n--- [{timestamp}] ---\n\n")
+                    self._log_file.flush()
+                except Exception:
+                    pass
+
+    def end_session(self) -> None:
+        """
+        End the logging session.
+
+        Restores original stderr and stdout, closes the log file.
+        """
+        with self._lock:
+            if not self._session_active:
+                return
+
+            try:
+                # Write session footer
+                if self._log_file:
+                    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    self._log_file.write("\n")
+                    self._log_file.write("=" * 80 + "\n")
+                    self._log_file.write(f" Session ended: {timestamp}\n")
+                    self._log_file.write("=" * 80 + "\n")
+
+            except Exception:
+                pass
+
+            self._cleanup()
+
+    def _cleanup(self) -> None:
+        """Restore original streams and close log file."""
+        # Restore original streams
+        if self._original_stderr:
+            sys.stderr = self._original_stderr
+            self._original_stderr = None
+
+        if self._original_stdout:
+            sys.stdout = self._original_stdout
+            self._original_stdout = None
+
+        # Close log file
+        if self._log_file:
+            try:
+                self._log_file.close()
+            except Exception:
+                pass
+            self._log_file = None
+
+        self._tee_stderr = None
+        self._tee_stdout = None
+        self._session_active = False
+
+    def __del__(self):
+        """Ensure cleanup on deletion."""
+        self._cleanup()
+
+
+# Convenience function for getting the logger
+def get_spark_logger(target_dir: str, compute_name: str = "spark") -> SparkOutputLogger:
+    """
+    Get a Spark output logger for the given target directory.
+
+    Args:
+        target_dir: Path to the dbt target directory
+        compute_name: Name of the compute engine (default: "spark")
+
+    Returns:
+        SparkOutputLogger instance
+    """
+    return SparkOutputLogger.get_instance(target_dir, compute_name)
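The SparkOutputLogger docstring above already shows the basic call sequence; the sketch below only adds a try/finally so the redirected stdout/stderr are always restored even if the Spark work raises. The "target" directory and the separator label are illustrative values:

    from dbt.compute.spark_logger import get_spark_logger

    logger = get_spark_logger(target_dir="target", compute_name="spark")
    logger.start_session(suppress_console=True)  # output now goes to target/spark_log.txt
    try:
        logger.write_separator("model: my_first_dbt_model")
        # ... run Spark / JDBC work here ...
    finally:
        logger.end_session()  # restores sys.stdout / sys.stderr and closes the log file
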
dbt/compute/strategies/__init__.py
@@ -0,0 +1,55 @@
+"""
+Spark Connection Strategies
+
+This module provides different strategies for connecting to Spark clusters.
+Uses the strategy pattern for flexible platform support.
+
+v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
+v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+"""
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt.compute.strategies.local import LocalStrategy, cleanup_all_spark_sessions
+
+# Strategies are imported lazily to avoid import errors when
+# optional dependencies are not installed
+
+
+def get_emr_strategy():
+    """
+    Lazily import and return EMRStrategy.
+
+    :returns: EMRStrategy class
+    """
+    from dbt.compute.strategies.emr import EMRStrategy
+    return EMRStrategy
+
+
+def get_dataproc_strategy():
+    """
+    Lazily import and return DataprocStrategy.
+
+    :returns: DataprocStrategy class
+    """
+    from dbt.compute.strategies.dataproc import DataprocStrategy
+    return DataprocStrategy
+
+
+def get_standalone_strategy():
+    """
+    Lazily import and return StandaloneStrategy.
+
+    :returns: StandaloneStrategy class
+    """
+    from dbt.compute.strategies.standalone import StandaloneStrategy
+    return StandaloneStrategy
+
+
+__all__ = [
+    "BaseConnectionStrategy",
+    "LocalStrategy",
+    "cleanup_all_spark_sessions",
+    "get_emr_strategy",
+    "get_dataproc_strategy",
+    "get_standalone_strategy",
+]
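A short sketch of how the lazy accessors in strategies/__init__.py might be used, so that merely importing the package never pulls in optional EMR or Dataproc dependencies. The ImportError fallback is an assumption about how a caller could handle a missing backend, not behavior defined by this module:

    from dbt.compute.strategies import LocalStrategy, get_emr_strategy

    try:
        EMRStrategy = get_emr_strategy()  # the emr module is imported only at this call
    except ImportError:
        EMRStrategy = None  # optional EMR dependencies are not installed

    strategy_cls = EMRStrategy or LocalStrategy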