dvt-core 1.11.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dvt-core might be problematic. Click here for more details.

Files changed (261) hide show
  1. dvt/__init__.py +7 -0
  2. dvt/_pydantic_shim.py +26 -0
  3. dvt/adapters/__init__.py +16 -0
  4. dvt/adapters/multi_adapter_manager.py +268 -0
  5. dvt/artifacts/__init__.py +0 -0
  6. dvt/artifacts/exceptions/__init__.py +1 -0
  7. dvt/artifacts/exceptions/schemas.py +31 -0
  8. dvt/artifacts/resources/__init__.py +116 -0
  9. dvt/artifacts/resources/base.py +68 -0
  10. dvt/artifacts/resources/types.py +93 -0
  11. dvt/artifacts/resources/v1/analysis.py +10 -0
  12. dvt/artifacts/resources/v1/catalog.py +23 -0
  13. dvt/artifacts/resources/v1/components.py +275 -0
  14. dvt/artifacts/resources/v1/config.py +282 -0
  15. dvt/artifacts/resources/v1/documentation.py +11 -0
  16. dvt/artifacts/resources/v1/exposure.py +52 -0
  17. dvt/artifacts/resources/v1/function.py +53 -0
  18. dvt/artifacts/resources/v1/generic_test.py +32 -0
  19. dvt/artifacts/resources/v1/group.py +22 -0
  20. dvt/artifacts/resources/v1/hook.py +11 -0
  21. dvt/artifacts/resources/v1/macro.py +30 -0
  22. dvt/artifacts/resources/v1/metric.py +173 -0
  23. dvt/artifacts/resources/v1/model.py +146 -0
  24. dvt/artifacts/resources/v1/owner.py +10 -0
  25. dvt/artifacts/resources/v1/saved_query.py +112 -0
  26. dvt/artifacts/resources/v1/seed.py +42 -0
  27. dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
  28. dvt/artifacts/resources/v1/semantic_model.py +315 -0
  29. dvt/artifacts/resources/v1/singular_test.py +14 -0
  30. dvt/artifacts/resources/v1/snapshot.py +92 -0
  31. dvt/artifacts/resources/v1/source_definition.py +85 -0
  32. dvt/artifacts/resources/v1/sql_operation.py +10 -0
  33. dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
  34. dvt/artifacts/schemas/__init__.py +0 -0
  35. dvt/artifacts/schemas/base.py +191 -0
  36. dvt/artifacts/schemas/batch_results.py +24 -0
  37. dvt/artifacts/schemas/catalog/__init__.py +12 -0
  38. dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
  39. dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
  40. dvt/artifacts/schemas/freshness/__init__.py +1 -0
  41. dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
  42. dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
  43. dvt/artifacts/schemas/manifest/__init__.py +2 -0
  44. dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
  45. dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
  46. dvt/artifacts/schemas/results.py +148 -0
  47. dvt/artifacts/schemas/run/__init__.py +2 -0
  48. dvt/artifacts/schemas/run/v5/__init__.py +0 -0
  49. dvt/artifacts/schemas/run/v5/run.py +184 -0
  50. dvt/artifacts/schemas/upgrades/__init__.py +4 -0
  51. dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
  52. dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
  53. dvt/artifacts/utils/validation.py +153 -0
  54. dvt/cli/__init__.py +1 -0
  55. dvt/cli/context.py +16 -0
  56. dvt/cli/exceptions.py +56 -0
  57. dvt/cli/flags.py +558 -0
  58. dvt/cli/main.py +971 -0
  59. dvt/cli/option_types.py +121 -0
  60. dvt/cli/options.py +79 -0
  61. dvt/cli/params.py +803 -0
  62. dvt/cli/requires.py +478 -0
  63. dvt/cli/resolvers.py +32 -0
  64. dvt/cli/types.py +40 -0
  65. dvt/clients/__init__.py +0 -0
  66. dvt/clients/checked_load.py +82 -0
  67. dvt/clients/git.py +164 -0
  68. dvt/clients/jinja.py +206 -0
  69. dvt/clients/jinja_static.py +245 -0
  70. dvt/clients/registry.py +192 -0
  71. dvt/clients/yaml_helper.py +68 -0
  72. dvt/compilation.py +833 -0
  73. dvt/compute/__init__.py +26 -0
  74. dvt/compute/base.py +288 -0
  75. dvt/compute/engines/__init__.py +13 -0
  76. dvt/compute/engines/duckdb_engine.py +368 -0
  77. dvt/compute/engines/spark_engine.py +273 -0
  78. dvt/compute/query_analyzer.py +212 -0
  79. dvt/compute/router.py +483 -0
  80. dvt/config/__init__.py +4 -0
  81. dvt/config/catalogs.py +95 -0
  82. dvt/config/compute_config.py +406 -0
  83. dvt/config/profile.py +411 -0
  84. dvt/config/profiles_v2.py +464 -0
  85. dvt/config/project.py +893 -0
  86. dvt/config/renderer.py +232 -0
  87. dvt/config/runtime.py +491 -0
  88. dvt/config/selectors.py +209 -0
  89. dvt/config/utils.py +78 -0
  90. dvt/connectors/.gitignore +6 -0
  91. dvt/connectors/README.md +306 -0
  92. dvt/connectors/catalog.yml +217 -0
  93. dvt/connectors/download_connectors.py +300 -0
  94. dvt/constants.py +29 -0
  95. dvt/context/__init__.py +0 -0
  96. dvt/context/base.py +746 -0
  97. dvt/context/configured.py +136 -0
  98. dvt/context/context_config.py +350 -0
  99. dvt/context/docs.py +82 -0
  100. dvt/context/exceptions_jinja.py +179 -0
  101. dvt/context/macro_resolver.py +195 -0
  102. dvt/context/macros.py +171 -0
  103. dvt/context/manifest.py +73 -0
  104. dvt/context/providers.py +2198 -0
  105. dvt/context/query_header.py +14 -0
  106. dvt/context/secret.py +59 -0
  107. dvt/context/target.py +74 -0
  108. dvt/contracts/__init__.py +0 -0
  109. dvt/contracts/files.py +413 -0
  110. dvt/contracts/graph/__init__.py +0 -0
  111. dvt/contracts/graph/manifest.py +1904 -0
  112. dvt/contracts/graph/metrics.py +98 -0
  113. dvt/contracts/graph/model_config.py +71 -0
  114. dvt/contracts/graph/node_args.py +42 -0
  115. dvt/contracts/graph/nodes.py +1806 -0
  116. dvt/contracts/graph/semantic_manifest.py +233 -0
  117. dvt/contracts/graph/unparsed.py +812 -0
  118. dvt/contracts/project.py +417 -0
  119. dvt/contracts/results.py +53 -0
  120. dvt/contracts/selection.py +23 -0
  121. dvt/contracts/sql.py +86 -0
  122. dvt/contracts/state.py +69 -0
  123. dvt/contracts/util.py +46 -0
  124. dvt/deprecations.py +347 -0
  125. dvt/deps/__init__.py +0 -0
  126. dvt/deps/base.py +153 -0
  127. dvt/deps/git.py +196 -0
  128. dvt/deps/local.py +80 -0
  129. dvt/deps/registry.py +131 -0
  130. dvt/deps/resolver.py +149 -0
  131. dvt/deps/tarball.py +121 -0
  132. dvt/docs/source/_ext/dbt_click.py +118 -0
  133. dvt/docs/source/conf.py +32 -0
  134. dvt/env_vars.py +64 -0
  135. dvt/event_time/event_time.py +40 -0
  136. dvt/event_time/sample_window.py +60 -0
  137. dvt/events/__init__.py +16 -0
  138. dvt/events/base_types.py +37 -0
  139. dvt/events/core_types_pb2.py +2 -0
  140. dvt/events/logging.py +109 -0
  141. dvt/events/types.py +2534 -0
  142. dvt/exceptions.py +1487 -0
  143. dvt/flags.py +89 -0
  144. dvt/graph/__init__.py +11 -0
  145. dvt/graph/cli.py +248 -0
  146. dvt/graph/graph.py +172 -0
  147. dvt/graph/queue.py +213 -0
  148. dvt/graph/selector.py +375 -0
  149. dvt/graph/selector_methods.py +976 -0
  150. dvt/graph/selector_spec.py +223 -0
  151. dvt/graph/thread_pool.py +18 -0
  152. dvt/hooks.py +21 -0
  153. dvt/include/README.md +49 -0
  154. dvt/include/__init__.py +3 -0
  155. dvt/include/global_project.py +4 -0
  156. dvt/include/starter_project/.gitignore +4 -0
  157. dvt/include/starter_project/README.md +15 -0
  158. dvt/include/starter_project/__init__.py +3 -0
  159. dvt/include/starter_project/analyses/.gitkeep +0 -0
  160. dvt/include/starter_project/dvt_project.yml +36 -0
  161. dvt/include/starter_project/macros/.gitkeep +0 -0
  162. dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
  163. dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
  164. dvt/include/starter_project/models/example/schema.yml +21 -0
  165. dvt/include/starter_project/seeds/.gitkeep +0 -0
  166. dvt/include/starter_project/snapshots/.gitkeep +0 -0
  167. dvt/include/starter_project/tests/.gitkeep +0 -0
  168. dvt/internal_deprecations.py +27 -0
  169. dvt/jsonschemas/__init__.py +3 -0
  170. dvt/jsonschemas/jsonschemas.py +309 -0
  171. dvt/jsonschemas/project/0.0.110.json +4717 -0
  172. dvt/jsonschemas/project/0.0.85.json +2015 -0
  173. dvt/jsonschemas/resources/0.0.110.json +2636 -0
  174. dvt/jsonschemas/resources/0.0.85.json +2536 -0
  175. dvt/jsonschemas/resources/latest.json +6773 -0
  176. dvt/links.py +4 -0
  177. dvt/materializations/__init__.py +0 -0
  178. dvt/materializations/incremental/__init__.py +0 -0
  179. dvt/materializations/incremental/microbatch.py +235 -0
  180. dvt/mp_context.py +8 -0
  181. dvt/node_types.py +37 -0
  182. dvt/parser/__init__.py +23 -0
  183. dvt/parser/analysis.py +21 -0
  184. dvt/parser/base.py +549 -0
  185. dvt/parser/common.py +267 -0
  186. dvt/parser/docs.py +52 -0
  187. dvt/parser/fixtures.py +51 -0
  188. dvt/parser/functions.py +30 -0
  189. dvt/parser/generic_test.py +100 -0
  190. dvt/parser/generic_test_builders.py +334 -0
  191. dvt/parser/hooks.py +119 -0
  192. dvt/parser/macros.py +137 -0
  193. dvt/parser/manifest.py +2204 -0
  194. dvt/parser/models.py +574 -0
  195. dvt/parser/partial.py +1179 -0
  196. dvt/parser/read_files.py +445 -0
  197. dvt/parser/schema_generic_tests.py +423 -0
  198. dvt/parser/schema_renderer.py +111 -0
  199. dvt/parser/schema_yaml_readers.py +936 -0
  200. dvt/parser/schemas.py +1467 -0
  201. dvt/parser/search.py +149 -0
  202. dvt/parser/seeds.py +28 -0
  203. dvt/parser/singular_test.py +20 -0
  204. dvt/parser/snapshots.py +44 -0
  205. dvt/parser/sources.py +557 -0
  206. dvt/parser/sql.py +63 -0
  207. dvt/parser/unit_tests.py +622 -0
  208. dvt/plugins/__init__.py +20 -0
  209. dvt/plugins/contracts.py +10 -0
  210. dvt/plugins/exceptions.py +2 -0
  211. dvt/plugins/manager.py +164 -0
  212. dvt/plugins/manifest.py +21 -0
  213. dvt/profiler.py +20 -0
  214. dvt/py.typed +1 -0
  215. dvt/runners/__init__.py +2 -0
  216. dvt/runners/exposure_runner.py +7 -0
  217. dvt/runners/no_op_runner.py +46 -0
  218. dvt/runners/saved_query_runner.py +7 -0
  219. dvt/selected_resources.py +8 -0
  220. dvt/task/__init__.py +0 -0
  221. dvt/task/base.py +504 -0
  222. dvt/task/build.py +197 -0
  223. dvt/task/clean.py +57 -0
  224. dvt/task/clone.py +162 -0
  225. dvt/task/compile.py +151 -0
  226. dvt/task/compute.py +366 -0
  227. dvt/task/debug.py +650 -0
  228. dvt/task/deps.py +280 -0
  229. dvt/task/docs/__init__.py +3 -0
  230. dvt/task/docs/generate.py +408 -0
  231. dvt/task/docs/index.html +250 -0
  232. dvt/task/docs/serve.py +28 -0
  233. dvt/task/freshness.py +323 -0
  234. dvt/task/function.py +122 -0
  235. dvt/task/group_lookup.py +46 -0
  236. dvt/task/init.py +374 -0
  237. dvt/task/list.py +237 -0
  238. dvt/task/printer.py +176 -0
  239. dvt/task/profiles.py +256 -0
  240. dvt/task/retry.py +175 -0
  241. dvt/task/run.py +1146 -0
  242. dvt/task/run_operation.py +142 -0
  243. dvt/task/runnable.py +802 -0
  244. dvt/task/seed.py +104 -0
  245. dvt/task/show.py +150 -0
  246. dvt/task/snapshot.py +57 -0
  247. dvt/task/sql.py +111 -0
  248. dvt/task/test.py +464 -0
  249. dvt/tests/fixtures/__init__.py +1 -0
  250. dvt/tests/fixtures/project.py +620 -0
  251. dvt/tests/util.py +651 -0
  252. dvt/tracking.py +529 -0
  253. dvt/utils/__init__.py +3 -0
  254. dvt/utils/artifact_upload.py +151 -0
  255. dvt/utils/utils.py +408 -0
  256. dvt/version.py +249 -0
  257. dvt_core-1.11.0b4.dist-info/METADATA +252 -0
  258. dvt_core-1.11.0b4.dist-info/RECORD +261 -0
  259. dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
  260. dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
  261. dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,26 @@
1
+ """
2
+ DVT Compute Layer
3
+
4
+ This package provides the compute engine abstraction for processing
5
+ heterogeneous data sources.
6
+ """
7
+
8
+ from dvt.compute.base import (
9
+ BaseComputeEngine,
10
+ ComputeResult,
11
+ QueryExecutionPlan,
12
+ SourceInfo,
13
+ )
14
+ from dvt.compute.query_analyzer import QueryAnalyzer, analyze_query_sources
15
+ from dvt.compute.router import ExecutionRouter, ExecutionStrategy
16
+
17
+ __all__ = [
18
+ "BaseComputeEngine",
19
+ "ComputeResult",
20
+ "QueryExecutionPlan",
21
+ "SourceInfo",
22
+ "ExecutionRouter",
23
+ "ExecutionStrategy",
24
+ "QueryAnalyzer",
25
+ "analyze_query_sources",
26
+ ]
dvt/compute/base.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Base compute engine abstraction.
3
+
4
+ This module defines the interface that all compute engines (DuckDB, Spark)
5
+ must implement.
6
+ """
7
+
8
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Tuple

from dbt.adapters.base import BaseRelation
15
+
16
+
17
class ExecutionStrategy(Enum):
    """How DVT should run a given query.

    Members:
        PUSHDOWN: delegate the query to the source database itself.
        COMPUTE_LAYER: run the query inside a DVT compute engine
            (DuckDB or Spark).
        AUTO: let DVT's router choose a strategy.
    """

    # Run directly on the source database.
    PUSHDOWN = "pushdown"
    # Run inside DuckDB / Spark.
    COMPUTE_LAYER = "compute_layer"
    # Defer the decision to DVT.
    AUTO = "auto"
23
+
24
+
25
@dataclass
class SourceInfo:
    """Describes one data source that a query reads from.

    Size/row estimates are optional hints used by the execution router
    when choosing between pushdown and a compute engine.
    """

    # dbt/DVT profile the source belongs to.
    profile_name: str
    # Adapter type, e.g. "postgres" or "mysql".
    adapter_type: str
    # Relation (database/schema/identifier) being read.
    relation: BaseRelation
    # Optional cardinality hint; None when unknown.
    estimated_rows: Optional[int] = None
    # Optional size hint in megabytes; None when unknown.
    estimated_size_mb: Optional[float] = None
34
+
35
+
36
@dataclass
class QueryExecutionPlan:
    """Describes how DVT will execute a query.

    Captures the chosen strategy (pushdown vs compute layer), the sources
    involved, size estimates, and a human-readable rationale.
    """

    strategy: ExecutionStrategy
    compute_engine: Optional[str] = None  # 'duckdb', 'spark_local', 'spark_cluster'
    sources: List[SourceInfo] = field(default_factory=list)
    is_homogeneous: bool = True  # All sources same adapter
    estimated_data_size_mb: float = 0.0
    estimated_rows: int = 0
    pushdown_target: Optional[str] = None  # Which adapter to push down to
    reason: str = ""  # Explanation of strategy choice

    def is_pushdown_possible(self) -> bool:
        """Return True when every source shares one adapter and one profile."""
        # Pushdown only works if all sources resolve to a single connection.
        return self.is_homogeneous and len({src.profile_name for src in self.sources}) == 1

    def get_unique_adapters(self) -> Set[str]:
        """Return the distinct adapter types across all sources."""
        return {src.adapter_type for src in self.sources}

    def get_unique_profiles(self) -> Set[str]:
        """Return the distinct profile names across all sources."""
        return {src.profile_name for src in self.sources}
64
+
65
+
66
@dataclass
class ComputeResult:
    """Outcome of executing a query through a compute engine.

    On failure, ``success`` is False and ``error`` carries the message;
    ``metadata`` always records when the execution happened.
    """

    success: bool
    rows_affected: int = 0
    execution_time_ms: float = 0.0
    strategy_used: Optional[ExecutionStrategy] = None
    compute_engine_used: Optional[str] = None
    error: Optional[str] = None
    warnings: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Stamp the result with an execution timestamp if not already set."""
        # Preserve a caller-supplied timestamp; only fill in the default.
        if "executed_at" not in self.metadata:
            self.metadata["executed_at"] = datetime.now().isoformat()
82
+
83
+
84
class BaseComputeEngine(ABC):
    """Abstract interface shared by every DVT compute engine.

    Concrete engines (DuckDB, Spark, pushdown) implement lifecycle,
    execution, capability and cost-estimation hooks. Engines may also be
    used as context managers, which initializes on entry and shuts down
    on exit.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store engine-specific configuration.

        Args:
            config: Engine-specific configuration mapping.
        """
        self.config = config
        # Flipped to True by initialize(); back to False on shutdown().
        self._initialized = False

    @abstractmethod
    def initialize(self) -> None:
        """Set up the engine (connections, extensions, ...).

        Called exactly once before any query is executed.
        """
        ...

    @abstractmethod
    def shutdown(self) -> None:
        """Tear down the engine: release resources, close connections."""
        ...

    @abstractmethod
    def execute_query(
        self,
        sql: str,
        execution_plan: QueryExecutionPlan,
    ) -> ComputeResult:
        """Run *sql* according to *execution_plan*.

        Args:
            sql: SQL query to execute.
            execution_plan: Plan describing sources and strategy.

        Returns:
            ComputeResult describing success/failure and timing.
        """
        ...

    @abstractmethod
    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
        """Return True when this engine is able to execute *execution_plan*."""
        ...

    @abstractmethod
    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
        """Estimate the relative cost of running *execution_plan* here.

        Lower is better; units are arbitrary but comparable across engines.
        """
        ...

    @abstractmethod
    def get_engine_name(self) -> str:
        """Return this engine's short identifier."""
        ...

    @abstractmethod
    def test_connection(self) -> Tuple[bool, Optional[str]]:
        """Probe engine availability.

        Returns:
            (success, error_message) — error_message is None on success.
        """
        ...

    def is_initialized(self) -> bool:
        """Return whether initialize() has completed."""
        return self._initialized

    def __enter__(self):
        """Initialize on entry (if needed) and return the engine."""
        if not self.is_initialized():
            self.initialize()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Shut the engine down; never suppress the in-flight exception."""
        self.shutdown()
        return False
195
+
196
+
197
class PushdownEngine(BaseComputeEngine):
    """
    Special compute engine for pushdown execution.

    This doesn't actually compute anything - it delegates to the
    source adapter directly.
    """

    def __init__(self, adapter_factory):
        """
        Initialize pushdown engine.

        Args:
            adapter_factory: Factory to get adapters by profile name
        """
        super().__init__(config={})
        self.adapter_factory = adapter_factory

    def initialize(self) -> None:
        """Initialize (no-op for pushdown)."""
        self._initialized = True

    def shutdown(self) -> None:
        """Shutdown (no-op for pushdown)."""
        self._initialized = False

    def execute_query(
        self,
        sql: str,
        execution_plan: QueryExecutionPlan,
    ) -> ComputeResult:
        """
        Execute query via pushdown to source adapter.

        Args:
            sql: SQL query to execute
            execution_plan: Execution plan; must carry ``pushdown_target``

        Returns:
            ComputeResult with status, timing, and the target engine name.
            Failures are reported via the result, not raised, so the
            router can fall back to another engine.
        """
        if not execution_plan.pushdown_target:
            return ComputeResult(
                success=False,
                error="Pushdown target not specified in execution plan",
            )

        try:
            # Monotonic clock for elapsed time: wall-clock (datetime.now)
            # can jump when the system clock is adjusted (NTP, DST).
            start_time = time.perf_counter()

            # Get adapter for pushdown target
            adapter = self.adapter_factory.get_adapter(execution_plan.pushdown_target)

            # Execute on adapter; fetch=False because we only need status.
            result = adapter.execute(sql, fetch=False)

            execution_time = (time.perf_counter() - start_time) * 1000

            return ComputeResult(
                success=True,
                # Not every adapter response exposes a row count.
                rows_affected=getattr(result, "rows_affected", 0),
                execution_time_ms=execution_time,
                strategy_used=ExecutionStrategy.PUSHDOWN,
                compute_engine_used=execution_plan.pushdown_target,
            )

        except Exception as e:
            return ComputeResult(
                success=False,
                error=str(e),
                strategy_used=ExecutionStrategy.PUSHDOWN,
            )

    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
        """Check if pushdown is possible."""
        return execution_plan.is_pushdown_possible()

    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
        """Pushdown has lowest cost (no data movement)."""
        if execution_plan.is_pushdown_possible():
            return 1.0  # Lowest cost
        else:
            return float("inf")  # Impossible

    def get_engine_name(self) -> str:
        """Get engine name."""
        return "pushdown"

    def test_connection(self) -> Tuple[bool, Optional[str]]:
        """Test pushdown (always available)."""
        return (True, None)
@@ -0,0 +1,13 @@
1
+ """
2
+ DVT Compute Engines
3
+
4
+ This package contains implementations of compute engines (DuckDB, Spark).
5
+ """
6
+
7
+ from dvt.compute.engines.duckdb_engine import DuckDBEngine
8
+ from dvt.compute.engines.spark_engine import SparkEngine
9
+
10
+ __all__ = [
11
+ "DuckDBEngine",
12
+ "SparkEngine",
13
+ ]
@@ -0,0 +1,368 @@
1
+ """
2
+ DuckDB compute engine implementation.
3
+
4
+ This module provides DVT's DuckDB compute layer for processing heterogeneous
5
+ data sources.
6
+ """
7
+
8
+ import os
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Optional, Tuple
12
+
13
+ from dvt.compute.base import (
14
+ BaseComputeEngine,
15
+ ComputeResult,
16
+ ExecutionStrategy,
17
+ QueryExecutionPlan,
18
+ )
19
+ from dvt.config.compute_config import DuckDBConfig
20
+ from dvt.events import fire_event
21
+ from dvt.events.types import Note
22
+
23
+ from dbt.adapters.exceptions import DbtRuntimeError
24
+
25
+ # DuckDB import - will fail gracefully if not installed
26
+ try:
27
+ import duckdb
28
+
29
+ DUCKDB_AVAILABLE = True
30
+ except ImportError:
31
+ DUCKDB_AVAILABLE = False
32
+ duckdb = None
33
+
34
+
35
class DuckDBEngine(BaseComputeEngine):
    """
    DuckDB compute engine for DVT.

    Uses DuckDB's ability to directly query multiple database types
    via scanners (postgres_scanner, mysql_scanner, etc.) and cloud
    storage (httpfs extension for S3).
    """

    def __init__(self, config: DuckDBConfig, profile_registry: Any):
        """
        Initialize DuckDB engine.

        Args:
            config: DuckDB configuration
            profile_registry: Registry for resolving profile connections
        """
        super().__init__(config=config.__dict__)
        self.duckdb_config = config
        self.profile_registry = profile_registry
        self.connection: Optional[Any] = None  # duckdb.DuckDBPyConnection
        # Profiles already ATTACHed to the current connection; prevents
        # re-attaching (and re-warning) on every query.
        self._attached_profiles: set[str] = set()

    def initialize(self) -> None:
        """Initialize DuckDB connection and load extensions.

        Raises:
            DbtRuntimeError: if duckdb is not installed or setup fails.
        """
        if not DUCKDB_AVAILABLE:
            raise DbtRuntimeError("DuckDB is not installed. Install with: pip install duckdb")

        try:
            fire_event(Note(msg="Initializing DuckDB compute engine"))

            # Create in-memory connection
            self.connection = duckdb.connect(":memory:")

            # Configure DuckDB settings
            self.connection.execute(f"SET memory_limit='{self.duckdb_config.memory_limit}'")
            self.connection.execute(f"SET threads={self.duckdb_config.threads}")
            self.connection.execute(f"SET max_memory='{self.duckdb_config.max_memory}'")
            self.connection.execute(f"SET temp_directory='{self.duckdb_config.temp_directory}'")

            # Enable/disable features
            if not self.duckdb_config.enable_optimizer:
                self.connection.execute("SET enable_optimizer=false")
            if self.duckdb_config.enable_profiling:
                self.connection.execute("SET enable_profiling=true")
            if self.duckdb_config.enable_progress_bar:
                self.connection.execute("SET enable_progress_bar=true")

            # Install and load extensions. A failing extension is non-fatal:
            # only the matching adapter type becomes unavailable.
            for ext in self.duckdb_config.extensions:
                try:
                    fire_event(Note(msg=f"Installing DuckDB extension: {ext}"))
                    self.connection.execute(f"INSTALL {ext}")
                    self.connection.execute(f"LOAD {ext}")
                except Exception as e:
                    fire_event(Note(msg=f"Failed to load extension {ext}: {e}"))

            # Configure S3 if specified
            if self.duckdb_config.s3:
                self._configure_s3()

            self._initialized = True
            fire_event(Note(msg="DuckDB engine initialized successfully"))

        except Exception as e:
            # Don't leak a half-configured connection when setup fails.
            self.shutdown()
            raise DbtRuntimeError(f"Failed to initialize DuckDB engine: {e}") from e

    def shutdown(self) -> None:
        """Shutdown DuckDB connection."""
        if self.connection:
            try:
                self.connection.close()
                fire_event(Note(msg="DuckDB engine shutdown"))
            except Exception as e:
                fire_event(Note(msg=f"Error shutting down DuckDB: {e}"))
            finally:
                # Always reset state, even if close() raised.
                self.connection = None
                self._initialized = False
                self._attached_profiles.clear()

    def execute_query(
        self,
        sql: str,
        execution_plan: QueryExecutionPlan,
    ) -> ComputeResult:
        """
        Execute SQL query in DuckDB.

        Args:
            sql: SQL query to execute
            execution_plan: Execution plan with source information

        Returns:
            ComputeResult; failures are reported via the result rather
            than raised, so the router can try another engine.
        """
        if not self._initialized or not self.connection:
            return ComputeResult(
                success=False,
                error="DuckDB engine not initialized",
            )

        try:
            start_time = datetime.now()

            # Attach source databases before the query references them.
            for source in execution_plan.sources:
                self._attach_profile(source.profile_name, source.adapter_type)

            # Execute query
            result = self.connection.execute(sql)

            # Get row count if available (fetchall drains the result set).
            try:
                rows_affected = len(result.fetchall()) if result else 0
            except Exception:
                rows_affected = 0

            # Calculate execution time
            execution_time = (datetime.now() - start_time).total_seconds() * 1000

            return ComputeResult(
                success=True,
                rows_affected=rows_affected,
                execution_time_ms=execution_time,
                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
                compute_engine_used="duckdb",
                metadata={
                    "attached_profiles": list(self._attached_profiles),
                },
            )

        except Exception as e:
            return ComputeResult(
                success=False,
                error=str(e),
                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
                compute_engine_used="duckdb",
            )

    def _attach_profile(self, profile_name: str, adapter_type: str) -> None:
        """
        Attach a profile to DuckDB for querying.

        Uses appropriate scanner based on adapter type:
        - postgres: postgres_scanner
        - mysql: mysql_scanner
        - s3: httpfs extension

        Args:
            profile_name: Profile name
            adapter_type: Adapter type (postgres, mysql, etc.)

        Raises:
            DbtRuntimeError: if the profile is unknown or attaching fails.
        """
        # Skip if already attached
        if profile_name in self._attached_profiles:
            return

        # Get profile configuration
        profile_config = self.profile_registry.get_or_create_profile(profile_name)
        if not profile_config:
            raise DbtRuntimeError(f"Profile '{profile_name}' not found")

        # Attach based on adapter type
        if adapter_type == "postgres":
            self._attach_postgres(profile_name, profile_config)
        elif adapter_type == "mysql":
            self._attach_mysql(profile_name, profile_config)
        elif adapter_type == "s3":
            self._configure_s3_for_profile(profile_name, profile_config)
        else:
            fire_event(
                Note(
                    msg=f"Warning: Adapter type '{adapter_type}' not yet supported in DuckDB engine"
                )
            )

        # Record even unsupported profiles so the warning fires only once.
        self._attached_profiles.add(profile_name)

    def _attach_postgres(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
        """Attach PostgreSQL database using postgres_scanner."""
        try:
            # Build connection string.
            # NOTE(review): credentials are interpolated directly into the
            # ATTACH statement; a password containing a quote would break
            # (or inject into) the SQL. Consider DuckDB secrets/escaping.
            conn_str = (
                f"host={profile_config.get('host')} "
                f"port={profile_config.get('port', 5432)} "
                f"dbname={profile_config.get('database')} "
                f"user={profile_config.get('user')} "
                f"password={profile_config.get('password')}"
            )

            # Attach database
            attach_sql = f"""
                ATTACH 'postgres:{conn_str}' AS {profile_name} (TYPE POSTGRES)
            """
            self.connection.execute(attach_sql)

            fire_event(Note(msg=f"Attached Postgres profile: {profile_name}"))

        except Exception as e:
            raise DbtRuntimeError(f"Failed to attach Postgres profile '{profile_name}': {e}") from e

    def _attach_mysql(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
        """Attach MySQL database using mysql_scanner."""
        try:
            # Build connection string (same quoting caveat as Postgres).
            conn_str = (
                f"host={profile_config.get('host')} "
                f"port={profile_config.get('port', 3306)} "
                f"database={profile_config.get('database')} "
                f"user={profile_config.get('user')} "
                f"password={profile_config.get('password')}"
            )

            # Attach database
            attach_sql = f"""
                ATTACH 'mysql:{conn_str}' AS {profile_name} (TYPE MYSQL)
            """
            self.connection.execute(attach_sql)

            fire_event(Note(msg=f"Attached MySQL profile: {profile_name}"))

        except Exception as e:
            raise DbtRuntimeError(f"Failed to attach MySQL profile '{profile_name}': {e}") from e

    def _configure_s3(self) -> None:
        """Configure S3 access for DuckDB."""
        if not self.duckdb_config.s3:
            return

        s3_config = self.duckdb_config.s3

        # Set S3 region
        if "region" in s3_config:
            self.connection.execute(f"SET s3_region='{s3_config['region']}'")

        # Set credentials from environment or config
        access_key = s3_config.get("access_key_id") or os.environ.get("AWS_ACCESS_KEY_ID")
        secret_key = s3_config.get("secret_access_key") or os.environ.get("AWS_SECRET_ACCESS_KEY")

        if access_key and secret_key:
            self.connection.execute(f"SET s3_access_key_id='{access_key}'")
            self.connection.execute(f"SET s3_secret_access_key='{secret_key}'")

        # Set other S3 options
        if "use_ssl" in s3_config:
            self.connection.execute(f"SET s3_use_ssl={str(s3_config['use_ssl']).lower()}")

        if "url_style" in s3_config:
            self.connection.execute(f"SET s3_url_style='{s3_config['url_style']}'")

        fire_event(Note(msg="Configured S3 access for DuckDB"))

    def _configure_s3_for_profile(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
        """Configure S3 access for specific profile."""
        # S3 configuration is global in DuckDB, but we can set profile-specific settings
        # For now, just mark as attached
        fire_event(Note(msg=f"S3 profile '{profile_name}' ready for querying"))

    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
        """
        Check if DuckDB can handle this execution plan.

        DuckDB can handle most queries, but:
        - Pushdown-only queries should go to pushdown engine
        - Very large datasets (> 1TB) should use Spark

        Args:
            execution_plan: Execution plan

        Returns:
            True if DuckDB can handle it
        """
        # DuckDB can handle up to ~1TB of data efficiently
        if execution_plan.estimated_data_size_mb > 1024 * 1024:  # 1TB
            return False

        # Check if all adapters are supported
        supported_adapters = {"postgres", "mysql", "s3", "duckdb"}
        for source in execution_plan.sources:
            if source.adapter_type not in supported_adapters:
                fire_event(
                    Note(msg=f"Adapter '{source.adapter_type}' not supported by DuckDB engine")
                )
                return False

        return True

    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
        """
        Estimate cost of executing with DuckDB.

        DuckDB is:
        - Very fast for small data (< 1GB)
        - Still good for medium data (1-100GB)
        - Gets slower for large data (> 100GB)

        Args:
            execution_plan: Execution plan

        Returns:
            Cost estimate
        """
        data_size_gb = execution_plan.estimated_data_size_mb / 1024

        if data_size_gb < 1:
            return 10.0  # Very low cost for small data
        elif data_size_gb < 10:
            return 20.0  # Low cost for medium-small data
        elif data_size_gb < 100:
            return 50.0  # Medium cost for medium data
        else:
            return 100.0  # High cost for large data (Spark might be better)

    def get_engine_name(self) -> str:
        """Get engine name."""
        return "duckdb"

    def test_connection(self) -> Tuple[bool, Optional[str]]:
        """
        Test if DuckDB is available and working.

        Returns:
            (success, error_message)
        """
        if not DUCKDB_AVAILABLE:
            return (False, "DuckDB not installed")

        try:
            # Try to create a connection
            conn = duckdb.connect(":memory:")
            conn.execute("SELECT 1")
            conn.close()
            return (True, None)
        except Exception as e:
            return (False, str(e))