dvt-core 1.11.0b4 (dvt_core-1.11.0b4-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dvt-core might be problematic.
Files changed (261)
  1. dvt/__init__.py +7 -0
  2. dvt/_pydantic_shim.py +26 -0
  3. dvt/adapters/__init__.py +16 -0
  4. dvt/adapters/multi_adapter_manager.py +268 -0
  5. dvt/artifacts/__init__.py +0 -0
  6. dvt/artifacts/exceptions/__init__.py +1 -0
  7. dvt/artifacts/exceptions/schemas.py +31 -0
  8. dvt/artifacts/resources/__init__.py +116 -0
  9. dvt/artifacts/resources/base.py +68 -0
  10. dvt/artifacts/resources/types.py +93 -0
  11. dvt/artifacts/resources/v1/analysis.py +10 -0
  12. dvt/artifacts/resources/v1/catalog.py +23 -0
  13. dvt/artifacts/resources/v1/components.py +275 -0
  14. dvt/artifacts/resources/v1/config.py +282 -0
  15. dvt/artifacts/resources/v1/documentation.py +11 -0
  16. dvt/artifacts/resources/v1/exposure.py +52 -0
  17. dvt/artifacts/resources/v1/function.py +53 -0
  18. dvt/artifacts/resources/v1/generic_test.py +32 -0
  19. dvt/artifacts/resources/v1/group.py +22 -0
  20. dvt/artifacts/resources/v1/hook.py +11 -0
  21. dvt/artifacts/resources/v1/macro.py +30 -0
  22. dvt/artifacts/resources/v1/metric.py +173 -0
  23. dvt/artifacts/resources/v1/model.py +146 -0
  24. dvt/artifacts/resources/v1/owner.py +10 -0
  25. dvt/artifacts/resources/v1/saved_query.py +112 -0
  26. dvt/artifacts/resources/v1/seed.py +42 -0
  27. dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  28. dvt/artifacts/resources/v1/semantic_model.py +315 -0
  29. dvt/artifacts/resources/v1/singular_test.py +14 -0
  30. dvt/artifacts/resources/v1/snapshot.py +92 -0
  31. dvt/artifacts/resources/v1/source_definition.py +85 -0
  32. dvt/artifacts/resources/v1/sql_operation.py +10 -0
  33. dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
  34. dvt/artifacts/schemas/__init__.py +0 -0
  35. dvt/artifacts/schemas/base.py +191 -0
  36. dvt/artifacts/schemas/batch_results.py +24 -0
  37. dvt/artifacts/schemas/catalog/__init__.py +12 -0
  38. dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  39. dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
  40. dvt/artifacts/schemas/freshness/__init__.py +1 -0
  41. dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  42. dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
  43. dvt/artifacts/schemas/manifest/__init__.py +2 -0
  44. dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  45. dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
  46. dvt/artifacts/schemas/results.py +148 -0
  47. dvt/artifacts/schemas/run/__init__.py +2 -0
  48. dvt/artifacts/schemas/run/v5/__init__.py +0 -0
  49. dvt/artifacts/schemas/run/v5/run.py +184 -0
  50. dvt/artifacts/schemas/upgrades/__init__.py +4 -0
  51. dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  52. dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  53. dvt/artifacts/utils/validation.py +153 -0
  54. dvt/cli/__init__.py +1 -0
  55. dvt/cli/context.py +16 -0
  56. dvt/cli/exceptions.py +56 -0
  57. dvt/cli/flags.py +558 -0
  58. dvt/cli/main.py +971 -0
  59. dvt/cli/option_types.py +121 -0
  60. dvt/cli/options.py +79 -0
  61. dvt/cli/params.py +803 -0
  62. dvt/cli/requires.py +478 -0
  63. dvt/cli/resolvers.py +32 -0
  64. dvt/cli/types.py +40 -0
  65. dvt/clients/__init__.py +0 -0
  66. dvt/clients/checked_load.py +82 -0
  67. dvt/clients/git.py +164 -0
  68. dvt/clients/jinja.py +206 -0
  69. dvt/clients/jinja_static.py +245 -0
  70. dvt/clients/registry.py +192 -0
  71. dvt/clients/yaml_helper.py +68 -0
  72. dvt/compilation.py +833 -0
  73. dvt/compute/__init__.py +26 -0
  74. dvt/compute/base.py +288 -0
  75. dvt/compute/engines/__init__.py +13 -0
  76. dvt/compute/engines/duckdb_engine.py +368 -0
  77. dvt/compute/engines/spark_engine.py +273 -0
  78. dvt/compute/query_analyzer.py +212 -0
  79. dvt/compute/router.py +483 -0
  80. dvt/config/__init__.py +4 -0
  81. dvt/config/catalogs.py +95 -0
  82. dvt/config/compute_config.py +406 -0
  83. dvt/config/profile.py +411 -0
  84. dvt/config/profiles_v2.py +464 -0
  85. dvt/config/project.py +893 -0
  86. dvt/config/renderer.py +232 -0
  87. dvt/config/runtime.py +491 -0
  88. dvt/config/selectors.py +209 -0
  89. dvt/config/utils.py +78 -0
  90. dvt/connectors/.gitignore +6 -0
  91. dvt/connectors/README.md +306 -0
  92. dvt/connectors/catalog.yml +217 -0
  93. dvt/connectors/download_connectors.py +300 -0
  94. dvt/constants.py +29 -0
  95. dvt/context/__init__.py +0 -0
  96. dvt/context/base.py +746 -0
  97. dvt/context/configured.py +136 -0
  98. dvt/context/context_config.py +350 -0
  99. dvt/context/docs.py +82 -0
  100. dvt/context/exceptions_jinja.py +179 -0
  101. dvt/context/macro_resolver.py +195 -0
  102. dvt/context/macros.py +171 -0
  103. dvt/context/manifest.py +73 -0
  104. dvt/context/providers.py +2198 -0
  105. dvt/context/query_header.py +14 -0
  106. dvt/context/secret.py +59 -0
  107. dvt/context/target.py +74 -0
  108. dvt/contracts/__init__.py +0 -0
  109. dvt/contracts/files.py +413 -0
  110. dvt/contracts/graph/__init__.py +0 -0
  111. dvt/contracts/graph/manifest.py +1904 -0
  112. dvt/contracts/graph/metrics.py +98 -0
  113. dvt/contracts/graph/model_config.py +71 -0
  114. dvt/contracts/graph/node_args.py +42 -0
  115. dvt/contracts/graph/nodes.py +1806 -0
  116. dvt/contracts/graph/semantic_manifest.py +233 -0
  117. dvt/contracts/graph/unparsed.py +812 -0
  118. dvt/contracts/project.py +417 -0
  119. dvt/contracts/results.py +53 -0
  120. dvt/contracts/selection.py +23 -0
  121. dvt/contracts/sql.py +86 -0
  122. dvt/contracts/state.py +69 -0
  123. dvt/contracts/util.py +46 -0
  124. dvt/deprecations.py +347 -0
  125. dvt/deps/__init__.py +0 -0
  126. dvt/deps/base.py +153 -0
  127. dvt/deps/git.py +196 -0
  128. dvt/deps/local.py +80 -0
  129. dvt/deps/registry.py +131 -0
  130. dvt/deps/resolver.py +149 -0
  131. dvt/deps/tarball.py +121 -0
  132. dvt/docs/source/_ext/dbt_click.py +118 -0
  133. dvt/docs/source/conf.py +32 -0
  134. dvt/env_vars.py +64 -0
  135. dvt/event_time/event_time.py +40 -0
  136. dvt/event_time/sample_window.py +60 -0
  137. dvt/events/__init__.py +16 -0
  138. dvt/events/base_types.py +37 -0
  139. dvt/events/core_types_pb2.py +2 -0
  140. dvt/events/logging.py +109 -0
  141. dvt/events/types.py +2534 -0
  142. dvt/exceptions.py +1487 -0
  143. dvt/flags.py +89 -0
  144. dvt/graph/__init__.py +11 -0
  145. dvt/graph/cli.py +248 -0
  146. dvt/graph/graph.py +172 -0
  147. dvt/graph/queue.py +213 -0
  148. dvt/graph/selector.py +375 -0
  149. dvt/graph/selector_methods.py +976 -0
  150. dvt/graph/selector_spec.py +223 -0
  151. dvt/graph/thread_pool.py +18 -0
  152. dvt/hooks.py +21 -0
  153. dvt/include/README.md +49 -0
  154. dvt/include/__init__.py +3 -0
  155. dvt/include/global_project.py +4 -0
  156. dvt/include/starter_project/.gitignore +4 -0
  157. dvt/include/starter_project/README.md +15 -0
  158. dvt/include/starter_project/__init__.py +3 -0
  159. dvt/include/starter_project/analyses/.gitkeep +0 -0
  160. dvt/include/starter_project/dvt_project.yml +36 -0
  161. dvt/include/starter_project/macros/.gitkeep +0 -0
  162. dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  163. dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  164. dvt/include/starter_project/models/example/schema.yml +21 -0
  165. dvt/include/starter_project/seeds/.gitkeep +0 -0
  166. dvt/include/starter_project/snapshots/.gitkeep +0 -0
  167. dvt/include/starter_project/tests/.gitkeep +0 -0
  168. dvt/internal_deprecations.py +27 -0
  169. dvt/jsonschemas/__init__.py +3 -0
  170. dvt/jsonschemas/jsonschemas.py +309 -0
  171. dvt/jsonschemas/project/0.0.110.json +4717 -0
  172. dvt/jsonschemas/project/0.0.85.json +2015 -0
  173. dvt/jsonschemas/resources/0.0.110.json +2636 -0
  174. dvt/jsonschemas/resources/0.0.85.json +2536 -0
  175. dvt/jsonschemas/resources/latest.json +6773 -0
  176. dvt/links.py +4 -0
  177. dvt/materializations/__init__.py +0 -0
  178. dvt/materializations/incremental/__init__.py +0 -0
  179. dvt/materializations/incremental/microbatch.py +235 -0
  180. dvt/mp_context.py +8 -0
  181. dvt/node_types.py +37 -0
  182. dvt/parser/__init__.py +23 -0
  183. dvt/parser/analysis.py +21 -0
  184. dvt/parser/base.py +549 -0
  185. dvt/parser/common.py +267 -0
  186. dvt/parser/docs.py +52 -0
  187. dvt/parser/fixtures.py +51 -0
  188. dvt/parser/functions.py +30 -0
  189. dvt/parser/generic_test.py +100 -0
  190. dvt/parser/generic_test_builders.py +334 -0
  191. dvt/parser/hooks.py +119 -0
  192. dvt/parser/macros.py +137 -0
  193. dvt/parser/manifest.py +2204 -0
  194. dvt/parser/models.py +574 -0
  195. dvt/parser/partial.py +1179 -0
  196. dvt/parser/read_files.py +445 -0
  197. dvt/parser/schema_generic_tests.py +423 -0
  198. dvt/parser/schema_renderer.py +111 -0
  199. dvt/parser/schema_yaml_readers.py +936 -0
  200. dvt/parser/schemas.py +1467 -0
  201. dvt/parser/search.py +149 -0
  202. dvt/parser/seeds.py +28 -0
  203. dvt/parser/singular_test.py +20 -0
  204. dvt/parser/snapshots.py +44 -0
  205. dvt/parser/sources.py +557 -0
  206. dvt/parser/sql.py +63 -0
  207. dvt/parser/unit_tests.py +622 -0
  208. dvt/plugins/__init__.py +20 -0
  209. dvt/plugins/contracts.py +10 -0
  210. dvt/plugins/exceptions.py +2 -0
  211. dvt/plugins/manager.py +164 -0
  212. dvt/plugins/manifest.py +21 -0
  213. dvt/profiler.py +20 -0
  214. dvt/py.typed +1 -0
  215. dvt/runners/__init__.py +2 -0
  216. dvt/runners/exposure_runner.py +7 -0
  217. dvt/runners/no_op_runner.py +46 -0
  218. dvt/runners/saved_query_runner.py +7 -0
  219. dvt/selected_resources.py +8 -0
  220. dvt/task/__init__.py +0 -0
  221. dvt/task/base.py +504 -0
  222. dvt/task/build.py +197 -0
  223. dvt/task/clean.py +57 -0
  224. dvt/task/clone.py +162 -0
  225. dvt/task/compile.py +151 -0
  226. dvt/task/compute.py +366 -0
  227. dvt/task/debug.py +650 -0
  228. dvt/task/deps.py +280 -0
  229. dvt/task/docs/__init__.py +3 -0
  230. dvt/task/docs/generate.py +408 -0
  231. dvt/task/docs/index.html +250 -0
  232. dvt/task/docs/serve.py +28 -0
  233. dvt/task/freshness.py +323 -0
  234. dvt/task/function.py +122 -0
  235. dvt/task/group_lookup.py +46 -0
  236. dvt/task/init.py +374 -0
  237. dvt/task/list.py +237 -0
  238. dvt/task/printer.py +176 -0
  239. dvt/task/profiles.py +256 -0
  240. dvt/task/retry.py +175 -0
  241. dvt/task/run.py +1146 -0
  242. dvt/task/run_operation.py +142 -0
  243. dvt/task/runnable.py +802 -0
  244. dvt/task/seed.py +104 -0
  245. dvt/task/show.py +150 -0
  246. dvt/task/snapshot.py +57 -0
  247. dvt/task/sql.py +111 -0
  248. dvt/task/test.py +464 -0
  249. dvt/tests/fixtures/__init__.py +1 -0
  250. dvt/tests/fixtures/project.py +620 -0
  251. dvt/tests/util.py +651 -0
  252. dvt/tracking.py +529 -0
  253. dvt/utils/__init__.py +3 -0
  254. dvt/utils/artifact_upload.py +151 -0
  255. dvt/utils/utils.py +408 -0
  256. dvt/version.py +249 -0
  257. dvt_core-1.11.0b4.dist-info/METADATA +252 -0
  258. dvt_core-1.11.0b4.dist-info/RECORD +261 -0
  259. dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
  260. dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
  261. dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
dvt/compute/engines/spark_engine.py
@@ -0,0 +1,273 @@
+ """
+ Spark compute engine implementation.
+
+ This module provides DVT's Spark compute layer for large-scale processing
+ of heterogeneous data sources.
+
+ DVT uses dbt adapters for data extraction, eliminating the need for JDBC JARs:
+ - Data extracted via dbt adapters (Python)
+ - Converted to Arrow format for efficient transfer
+ - Loaded into Spark DataFrames
+ - No JDBC drivers or JAR management required
+ """
+
+ from datetime import datetime
+ from typing import Any, Optional, Tuple, Union
+
+ from dvt.compute.base import (
+     BaseComputeEngine,
+     ComputeResult,
+     ExecutionStrategy,
+     QueryExecutionPlan,
+ )
+ from dvt.config.compute_config import SparkClusterConfig, SparkLocalConfig
+ from dvt.events import fire_event
+ from dvt.events.types import Note
+
+ from dbt.adapters.exceptions import DbtRuntimeError
+
+ # PySpark import - will fail gracefully if not installed
+ try:
+     from pyspark.sql import SparkSession
+
+     PYSPARK_AVAILABLE = True
+ except ImportError:
+     PYSPARK_AVAILABLE = False
+     SparkSession = None
+
+
+ class SparkEngine(BaseComputeEngine):
+     """
+     Spark compute engine for DVT.
+
+     Supports both local (single-node) and cluster (distributed) modes.
+     """
+
+     def __init__(
+         self,
+         config: Union[SparkLocalConfig, SparkClusterConfig],
+         profile_registry: Any,
+         mode: str = "local",
+     ):
+         """
+         Initialize Spark engine.
+
+         Args:
+             config: Spark configuration (local or cluster)
+             profile_registry: Registry for resolving profile connections
+             mode: 'local' or 'cluster'
+         """
+         super().__init__(config=config.__dict__)
+         self.spark_config = config
+         self.profile_registry = profile_registry
+         self.mode = mode
+         self.spark: Optional[Any] = None  # SparkSession
+
+     def initialize(self) -> None:
+         """Initialize Spark session and load connectors."""
+         if not PYSPARK_AVAILABLE:
+             raise DbtRuntimeError("PySpark is not installed. Install with: pip install pyspark")
+
+         try:
+             fire_event(Note(msg=f"Initializing Spark engine ({self.mode} mode)"))
+
+             # Create Spark session builder
+             builder = SparkSession.builder.appName(self.spark_config.app_name)
+
+             # Set master
+             builder = builder.master(self.spark_config.master)
+
+             # Set memory and cores
+             if hasattr(self.spark_config, "memory"):
+                 builder = builder.config("spark.executor.memory", self.spark_config.memory)
+             if hasattr(self.spark_config, "driver_memory"):
+                 builder = builder.config("spark.driver.memory", self.spark_config.driver_memory)
+             if hasattr(self.spark_config, "executor_cores"):
+                 builder = builder.config(
+                     "spark.executor.cores", str(self.spark_config.executor_cores)
+                 )
+
+             # Apply additional config
+             for key, value in self.spark_config.config.items():
+                 builder = builder.config(key, value)
+
+             # Note: No JDBC JARs needed - DVT uses dbt adapters for data extraction
+
+             # Create session
+             self.spark = builder.getOrCreate()
+
+             # Set log level
+             self.spark.sparkContext.setLogLevel(self.spark_config.log_level)
+
+             self._initialized = True
+             fire_event(Note(msg="Spark engine initialized successfully"))
+
+         except Exception as e:
+             raise DbtRuntimeError(f"Failed to initialize Spark engine: {e}")
+
+     def shutdown(self) -> None:
+         """Shutdown Spark session."""
+         if self.spark:
+             try:
+                 self.spark.stop()
+                 fire_event(Note(msg="Spark engine shutdown"))
+             except Exception as e:
+                 fire_event(Note(msg=f"Error shutting down Spark: {e}"))
+             finally:
+                 self.spark = None
+                 self._initialized = False
+
+     def execute_query(
+         self,
+         sql: str,
+         execution_plan: QueryExecutionPlan,
+     ) -> ComputeResult:
+         """
+         Execute SQL query in Spark.
+
+         Args:
+             sql: SQL query to execute
+             execution_plan: Execution plan with source information
+
+         Returns:
+             ComputeResult
+         """
+         if not self._initialized or not self.spark:
+             return ComputeResult(
+                 success=False,
+                 error="Spark engine not initialized",
+             )
+
+         try:
+             start_time = datetime.now()
+
+             # Register source tables
+             for source in execution_plan.sources:
+                 self._register_source(source.profile_name, source.adapter_type, source.relation)
+
+             # Execute query
+             df = self.spark.sql(sql)
+
+             # Get row count (triggers execution)
+             rows_affected = df.count()
+
+             # Calculate execution time
+             execution_time = (datetime.now() - start_time).total_seconds() * 1000
+
+             return ComputeResult(
+                 success=True,
+                 rows_affected=rows_affected,
+                 execution_time_ms=execution_time,
+                 strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                 compute_engine_used=f"spark_{self.mode}",
+             )
+
+         except Exception as e:
+             return ComputeResult(
+                 success=False,
+                 error=str(e),
+                 strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                 compute_engine_used=f"spark_{self.mode}",
+             )
+
+     def _register_source(self, profile_name: str, adapter_type: str, relation: Any) -> None:
+         """
+         Register a source table in Spark.
+
+         TODO: Replace with dbt adapter-based extraction:
+         1. Get dbt adapter for profile
+         2. Execute SELECT * FROM table via adapter
+         3. Convert Agate table to Arrow
+         4. Create Spark DataFrame from Arrow
+         5. Register as temp view
+
+         Args:
+             profile_name: Profile name
+             adapter_type: Adapter type
+             relation: Relation object
+         """
+         # TODO: Implement dbt adapter-based extraction
+         # For now, this is a placeholder
+         fire_event(
+             Note(
+                 msg=f"TODO: Extract {relation} from {profile_name} via dbt adapter (not yet implemented)"
+             )
+         )
+
+     def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
+         """
+         Check if Spark can handle this execution plan.
+
+         Spark can handle any database with a dbt adapter (uses adapter-based extraction).
+
+         Args:
+             execution_plan: Execution plan
+
+         Returns:
+             True if Spark can handle it
+         """
+         # Spark can handle any source with a dbt adapter
+         # Data extracted via adapter, converted to Arrow, loaded into Spark
+         return True
+
+     def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
+         """
+         Estimate cost of executing with Spark.
+
+         Spark has higher overhead but scales better:
+         - Small data (< 1GB): High cost (DuckDB is better)
+         - Medium data (1-10GB): Medium cost
+         - Large data (> 10GB): Low cost (Spark shines here)
+
+         Args:
+             execution_plan: Execution plan
+
+         Returns:
+             Cost estimate
+         """
+         data_size_gb = execution_plan.estimated_data_size_mb / 1024
+
+         if self.mode == "local":
+             # Local mode has startup overhead
+             if data_size_gb < 1:
+                 return 80.0  # High cost for small data
+             elif data_size_gb < 10:
+                 return 40.0  # Medium cost
+             else:
+                 return 20.0  # Low cost for large data
+         else:
+             # Cluster mode has even more overhead but scales better
+             if data_size_gb < 10:
+                 return 100.0  # Very high cost for small/medium data
+             elif data_size_gb < 100:
+                 return 30.0  # Medium cost
+             else:
+                 return 10.0  # Very low cost for huge data
+
+     def get_engine_name(self) -> str:
+         """Get engine name."""
+         return f"spark_{self.mode}"
+
+     def test_connection(self) -> Tuple[bool, Optional[str]]:
+         """
+         Test if Spark is available and working.
+
+         Returns:
+             (success, error_message)
+         """
+         if not PYSPARK_AVAILABLE:
+             return (False, "PySpark not installed")
+
+         try:
+             # Try to create a session
+             spark = (
+                 SparkSession.builder.master("local[1]")
+                 .appName("dvt-test")
+                 .config("spark.ui.enabled", "false")
+                 .getOrCreate()
+             )
+             spark.sql("SELECT 1").collect()
+             spark.stop()
+             return (True, None)
+         except Exception as e:
+             return (False, str(e))
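
The _register_source method above only logs a TODO; the module docstring and that TODO describe the intended path (query the source through its dbt adapter, convert the result to Arrow, register it as a Spark temp view). The following is a minimal sketch of that flow, not dvt-core API: it assumes a dbt adapter whose execute(..., fetch=True) returns an agate table, a running SparkSession, and the illustrative helper name register_source_via_adapter.

    # Illustrative sketch only: dbt adapter rows -> Arrow -> Spark temp view.
    import pyarrow as pa

    def register_source_via_adapter(spark, adapter, relation, view_name):
        # 1. Pull rows through the dbt adapter; execute(..., fetch=True) returns
        #    (AdapterResponse, agate.Table).
        _, agate_table = adapter.execute(f"select * from {relation}", fetch=True)
        # 2. Convert the agate table to Arrow, column by column.
        arrow_table = pa.Table.from_pydict(
            {
                col: [row[col] for row in agate_table.rows]
                for col in agate_table.column_names
            }
        )
        # 3. Load into Spark and register as a temp view so the engine's
        #    execute_query can reference it in SQL.
        df = spark.createDataFrame(arrow_table.to_pandas())
        df.createOrReplaceTempView(view_name)

The pandas round-trip keeps the sketch short; a production implementation would likely stream Arrow batches directly instead of materializing a single in-memory table.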
dvt/compute/query_analyzer.py
@@ -0,0 +1,212 @@
+ """
+ Query analyzer for DVT execution routing.
+
+ This module analyzes SQL queries to extract source references and build
+ execution plans for the ExecutionRouter.
+ """
+
+ import re
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+ from dvt.compute.base import QueryExecutionPlan, SourceInfo
+ from dvt.contracts.graph.manifest import Manifest
+
+ from dbt_common.events.functions import fire_event
+ from dbt_common.events.types import Note
+ from dbt_common.exceptions import DbtRuntimeError
+
+
+ class QueryAnalyzer:
+     """
+     Analyzes SQL queries to extract source dependencies.
+
+     This analyzer:
+     - Parses Jinja source() calls to find source references
+     - Looks up source metadata from manifest
+     - Estimates data sizes for execution planning
+     - Builds SourceInfo objects for ExecutionRouter
+     """
+
+     def __init__(self, manifest: Manifest):
+         """
+         Initialize QueryAnalyzer.
+
+         Args:
+             manifest: Manifest with source definitions
+         """
+         self.manifest = manifest
+
+     def analyze_model_sql(self, sql: str, model_node: Optional[Any] = None) -> List[SourceInfo]:
+         """
+         Analyze SQL to extract source references.
+
+         Args:
+             sql: SQL query (may contain Jinja)
+             model_node: Optional model node for context
+
+         Returns:
+             List of SourceInfo objects
+         """
+         # Extract source references from SQL
+         source_refs = self._extract_source_references(sql)
+
+         # Build SourceInfo for each reference
+         source_infos: List[SourceInfo] = []
+         for source_name, table_name in source_refs:
+             source_info = self._build_source_info(source_name, table_name)
+             if source_info:
+                 source_infos.append(source_info)
+
+         return source_infos
+
+     def _extract_source_references(self, sql: str) -> Set[Tuple[str, str]]:
+         """
+         Extract source() references from SQL.
+
+         Looks for patterns like:
+         - {{ source('schema_name', 'table_name') }}
+         - {{source("schema_name", "table_name")}}
+         - {{ source( 'schema_name' , 'table_name' ) }}
+
+         Args:
+             sql: SQL query with Jinja
+
+         Returns:
+             Set of (source_name, table_name) tuples
+         """
+         sources: Set[Tuple[str, str]] = set()
+
+         # Pattern to match source() calls
+         # Handles single or double quotes, optional whitespace
+         pattern = r"{{\s*source\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)\s*}}"
+
+         matches = re.finditer(pattern, sql, re.IGNORECASE)
+         for match in matches:
+             source_name = match.group(1)
+             table_name = match.group(2)
+             sources.add((source_name, table_name))
+
+         return sources
+
+     def _build_source_info(self, source_name: str, table_name: str) -> Optional[SourceInfo]:
+         """
+         Build SourceInfo from source reference.
+
+         Args:
+             source_name: Source schema name
+             table_name: Table name
+
+         Returns:
+             SourceInfo or None if source not found
+         """
+         # Resolve source in manifest
+         source = self.manifest.resolve_source(
+             source_name=source_name,
+             table_name=table_name,
+             current_project=None,
+             node_package=None,
+         )
+
+         if not source or not hasattr(source, "unique_id"):
+             fire_event(
+                 Note(msg=f"Warning: Source '{source_name}.{table_name}' not found in manifest")
+             )
+             return None
+
+         # Extract profile name (DVT-specific)
+         profile_name = getattr(source, "profile", None)
+         if not profile_name:
+             # No profile specified - use default target profile
+             fire_event(
+                 Note(
+                     msg=f"Source '{source_name}.{table_name}' has no profile, "
+                     "will use default target"
+                 )
+             )
+             profile_name = "default"
+
+         # Determine adapter type
+         # TODO: Look up adapter type from profile
+         adapter_type = "unknown"
+
+         # Estimate data size
+         # TODO: Implement actual size estimation
+         # For now, use placeholder values
+         estimated_size_mb = None
+         estimated_rows = None
+
+         # Build SourceInfo
+         source_info = SourceInfo(
+             source_name=f"{source_name}.{table_name}",
+             profile_name=profile_name,
+             adapter_type=adapter_type,
+             database=getattr(source, "database", None),
+             schema=getattr(source, "schema", source_name),
+             identifier=getattr(source, "identifier", table_name),
+             estimated_size_mb=estimated_size_mb,
+             estimated_rows=estimated_rows,
+         )
+
+         return source_info
+
+     def build_execution_plan_for_model(
+         self, sql: str, model_node: Optional[Any] = None
+     ) -> QueryExecutionPlan:
+         """
+         Build complete execution plan for a model.
+
+         This is a convenience method that:
+         1. Analyzes SQL to extract sources
+         2. Builds SourceInfo objects
+         3. Creates QueryExecutionPlan
+
+         Args:
+             sql: SQL query
+             model_node: Optional model node
+
+         Returns:
+             QueryExecutionPlan ready for strategy selection
+         """
+         # Analyze sources
+         sources = self.analyze_model_sql(sql, model_node)
+
+         # Calculate metrics
+         unique_profiles = {s.profile_name for s in sources}
+         unique_adapters = {s.adapter_type for s in sources}
+         is_homogeneous = len(unique_profiles) <= 1 and len(unique_adapters) <= 1
+
+         # Estimate data size
+         total_size_mb = sum(s.estimated_size_mb or 0 for s in sources)
+         total_rows = sum(s.estimated_rows or 0 for s in sources)
+
+         # Create execution plan
+         from dvt.compute.base import ExecutionStrategy
+
+         plan = QueryExecutionPlan(
+             strategy=ExecutionStrategy.AUTO,
+             sources=sources,
+             is_homogeneous=is_homogeneous,
+             estimated_data_size_mb=total_size_mb if total_size_mb > 0 else None,
+             estimated_rows=total_rows if total_rows > 0 else None,
+         )
+
+         # Set pushdown target if homogeneous
+         if is_homogeneous and sources:
+             plan.pushdown_target = sources[0].profile_name
+
+         return plan
+
+
+ def analyze_query_sources(sql: str, manifest: Manifest) -> List[SourceInfo]:
+     """
+     Convenience function to analyze query sources.
+
+     Args:
+         sql: SQL query
+         manifest: Manifest
+
+     Returns:
+         List of SourceInfo objects
+     """
+     analyzer = QueryAnalyzer(manifest)
+     return analyzer.analyze_model_sql(sql)
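
For reference, a short usage sketch of the analyzer above. The model SQL and the manifest variable are made up for illustration; the printed fields correspond to the QueryExecutionPlan built by build_execution_plan_for_model.

    # Illustrative usage (manifest is assumed to be an already-loaded Manifest).
    from dvt.compute.query_analyzer import QueryAnalyzer

    sql = """
    select o.id, c.name
    from {{ source('crm', 'customers') }} as c
    join {{ source('erp', 'orders') }} as o on o.customer_id = c.id
    """

    analyzer = QueryAnalyzer(manifest)
    plan = analyzer.build_execution_plan_for_model(sql)

    # If the two sources resolve to different profiles, is_homogeneous is False,
    # pushdown_target stays unset, and the router falls back to the compute layer.
    print(plan.is_homogeneous, [s.source_name for s in plan.sources])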