dvt-core 1.11.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dvt-core might be problematic.
- dvt/__init__.py +7 -0
- dvt/_pydantic_shim.py +26 -0
- dvt/adapters/__init__.py +16 -0
- dvt/adapters/multi_adapter_manager.py +268 -0
- dvt/artifacts/__init__.py +0 -0
- dvt/artifacts/exceptions/__init__.py +1 -0
- dvt/artifacts/exceptions/schemas.py +31 -0
- dvt/artifacts/resources/__init__.py +116 -0
- dvt/artifacts/resources/base.py +68 -0
- dvt/artifacts/resources/types.py +93 -0
- dvt/artifacts/resources/v1/analysis.py +10 -0
- dvt/artifacts/resources/v1/catalog.py +23 -0
- dvt/artifacts/resources/v1/components.py +275 -0
- dvt/artifacts/resources/v1/config.py +282 -0
- dvt/artifacts/resources/v1/documentation.py +11 -0
- dvt/artifacts/resources/v1/exposure.py +52 -0
- dvt/artifacts/resources/v1/function.py +53 -0
- dvt/artifacts/resources/v1/generic_test.py +32 -0
- dvt/artifacts/resources/v1/group.py +22 -0
- dvt/artifacts/resources/v1/hook.py +11 -0
- dvt/artifacts/resources/v1/macro.py +30 -0
- dvt/artifacts/resources/v1/metric.py +173 -0
- dvt/artifacts/resources/v1/model.py +146 -0
- dvt/artifacts/resources/v1/owner.py +10 -0
- dvt/artifacts/resources/v1/saved_query.py +112 -0
- dvt/artifacts/resources/v1/seed.py +42 -0
- dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dvt/artifacts/resources/v1/semantic_model.py +315 -0
- dvt/artifacts/resources/v1/singular_test.py +14 -0
- dvt/artifacts/resources/v1/snapshot.py +92 -0
- dvt/artifacts/resources/v1/source_definition.py +85 -0
- dvt/artifacts/resources/v1/sql_operation.py +10 -0
- dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
- dvt/artifacts/schemas/__init__.py +0 -0
- dvt/artifacts/schemas/base.py +191 -0
- dvt/artifacts/schemas/batch_results.py +24 -0
- dvt/artifacts/schemas/catalog/__init__.py +12 -0
- dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
- dvt/artifacts/schemas/freshness/__init__.py +1 -0
- dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
- dvt/artifacts/schemas/manifest/__init__.py +2 -0
- dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
- dvt/artifacts/schemas/results.py +148 -0
- dvt/artifacts/schemas/run/__init__.py +2 -0
- dvt/artifacts/schemas/run/v5/__init__.py +0 -0
- dvt/artifacts/schemas/run/v5/run.py +184 -0
- dvt/artifacts/schemas/upgrades/__init__.py +4 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dvt/artifacts/utils/validation.py +153 -0
- dvt/cli/__init__.py +1 -0
- dvt/cli/context.py +16 -0
- dvt/cli/exceptions.py +56 -0
- dvt/cli/flags.py +558 -0
- dvt/cli/main.py +971 -0
- dvt/cli/option_types.py +121 -0
- dvt/cli/options.py +79 -0
- dvt/cli/params.py +803 -0
- dvt/cli/requires.py +478 -0
- dvt/cli/resolvers.py +32 -0
- dvt/cli/types.py +40 -0
- dvt/clients/__init__.py +0 -0
- dvt/clients/checked_load.py +82 -0
- dvt/clients/git.py +164 -0
- dvt/clients/jinja.py +206 -0
- dvt/clients/jinja_static.py +245 -0
- dvt/clients/registry.py +192 -0
- dvt/clients/yaml_helper.py +68 -0
- dvt/compilation.py +833 -0
- dvt/compute/__init__.py +26 -0
- dvt/compute/base.py +288 -0
- dvt/compute/engines/__init__.py +13 -0
- dvt/compute/engines/duckdb_engine.py +368 -0
- dvt/compute/engines/spark_engine.py +273 -0
- dvt/compute/query_analyzer.py +212 -0
- dvt/compute/router.py +483 -0
- dvt/config/__init__.py +4 -0
- dvt/config/catalogs.py +95 -0
- dvt/config/compute_config.py +406 -0
- dvt/config/profile.py +411 -0
- dvt/config/profiles_v2.py +464 -0
- dvt/config/project.py +893 -0
- dvt/config/renderer.py +232 -0
- dvt/config/runtime.py +491 -0
- dvt/config/selectors.py +209 -0
- dvt/config/utils.py +78 -0
- dvt/connectors/.gitignore +6 -0
- dvt/connectors/README.md +306 -0
- dvt/connectors/catalog.yml +217 -0
- dvt/connectors/download_connectors.py +300 -0
- dvt/constants.py +29 -0
- dvt/context/__init__.py +0 -0
- dvt/context/base.py +746 -0
- dvt/context/configured.py +136 -0
- dvt/context/context_config.py +350 -0
- dvt/context/docs.py +82 -0
- dvt/context/exceptions_jinja.py +179 -0
- dvt/context/macro_resolver.py +195 -0
- dvt/context/macros.py +171 -0
- dvt/context/manifest.py +73 -0
- dvt/context/providers.py +2198 -0
- dvt/context/query_header.py +14 -0
- dvt/context/secret.py +59 -0
- dvt/context/target.py +74 -0
- dvt/contracts/__init__.py +0 -0
- dvt/contracts/files.py +413 -0
- dvt/contracts/graph/__init__.py +0 -0
- dvt/contracts/graph/manifest.py +1904 -0
- dvt/contracts/graph/metrics.py +98 -0
- dvt/contracts/graph/model_config.py +71 -0
- dvt/contracts/graph/node_args.py +42 -0
- dvt/contracts/graph/nodes.py +1806 -0
- dvt/contracts/graph/semantic_manifest.py +233 -0
- dvt/contracts/graph/unparsed.py +812 -0
- dvt/contracts/project.py +417 -0
- dvt/contracts/results.py +53 -0
- dvt/contracts/selection.py +23 -0
- dvt/contracts/sql.py +86 -0
- dvt/contracts/state.py +69 -0
- dvt/contracts/util.py +46 -0
- dvt/deprecations.py +347 -0
- dvt/deps/__init__.py +0 -0
- dvt/deps/base.py +153 -0
- dvt/deps/git.py +196 -0
- dvt/deps/local.py +80 -0
- dvt/deps/registry.py +131 -0
- dvt/deps/resolver.py +149 -0
- dvt/deps/tarball.py +121 -0
- dvt/docs/source/_ext/dbt_click.py +118 -0
- dvt/docs/source/conf.py +32 -0
- dvt/env_vars.py +64 -0
- dvt/event_time/event_time.py +40 -0
- dvt/event_time/sample_window.py +60 -0
- dvt/events/__init__.py +16 -0
- dvt/events/base_types.py +37 -0
- dvt/events/core_types_pb2.py +2 -0
- dvt/events/logging.py +109 -0
- dvt/events/types.py +2534 -0
- dvt/exceptions.py +1487 -0
- dvt/flags.py +89 -0
- dvt/graph/__init__.py +11 -0
- dvt/graph/cli.py +248 -0
- dvt/graph/graph.py +172 -0
- dvt/graph/queue.py +213 -0
- dvt/graph/selector.py +375 -0
- dvt/graph/selector_methods.py +976 -0
- dvt/graph/selector_spec.py +223 -0
- dvt/graph/thread_pool.py +18 -0
- dvt/hooks.py +21 -0
- dvt/include/README.md +49 -0
- dvt/include/__init__.py +3 -0
- dvt/include/global_project.py +4 -0
- dvt/include/starter_project/.gitignore +4 -0
- dvt/include/starter_project/README.md +15 -0
- dvt/include/starter_project/__init__.py +3 -0
- dvt/include/starter_project/analyses/.gitkeep +0 -0
- dvt/include/starter_project/dvt_project.yml +36 -0
- dvt/include/starter_project/macros/.gitkeep +0 -0
- dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dvt/include/starter_project/models/example/schema.yml +21 -0
- dvt/include/starter_project/seeds/.gitkeep +0 -0
- dvt/include/starter_project/snapshots/.gitkeep +0 -0
- dvt/include/starter_project/tests/.gitkeep +0 -0
- dvt/internal_deprecations.py +27 -0
- dvt/jsonschemas/__init__.py +3 -0
- dvt/jsonschemas/jsonschemas.py +309 -0
- dvt/jsonschemas/project/0.0.110.json +4717 -0
- dvt/jsonschemas/project/0.0.85.json +2015 -0
- dvt/jsonschemas/resources/0.0.110.json +2636 -0
- dvt/jsonschemas/resources/0.0.85.json +2536 -0
- dvt/jsonschemas/resources/latest.json +6773 -0
- dvt/links.py +4 -0
- dvt/materializations/__init__.py +0 -0
- dvt/materializations/incremental/__init__.py +0 -0
- dvt/materializations/incremental/microbatch.py +235 -0
- dvt/mp_context.py +8 -0
- dvt/node_types.py +37 -0
- dvt/parser/__init__.py +23 -0
- dvt/parser/analysis.py +21 -0
- dvt/parser/base.py +549 -0
- dvt/parser/common.py +267 -0
- dvt/parser/docs.py +52 -0
- dvt/parser/fixtures.py +51 -0
- dvt/parser/functions.py +30 -0
- dvt/parser/generic_test.py +100 -0
- dvt/parser/generic_test_builders.py +334 -0
- dvt/parser/hooks.py +119 -0
- dvt/parser/macros.py +137 -0
- dvt/parser/manifest.py +2204 -0
- dvt/parser/models.py +574 -0
- dvt/parser/partial.py +1179 -0
- dvt/parser/read_files.py +445 -0
- dvt/parser/schema_generic_tests.py +423 -0
- dvt/parser/schema_renderer.py +111 -0
- dvt/parser/schema_yaml_readers.py +936 -0
- dvt/parser/schemas.py +1467 -0
- dvt/parser/search.py +149 -0
- dvt/parser/seeds.py +28 -0
- dvt/parser/singular_test.py +20 -0
- dvt/parser/snapshots.py +44 -0
- dvt/parser/sources.py +557 -0
- dvt/parser/sql.py +63 -0
- dvt/parser/unit_tests.py +622 -0
- dvt/plugins/__init__.py +20 -0
- dvt/plugins/contracts.py +10 -0
- dvt/plugins/exceptions.py +2 -0
- dvt/plugins/manager.py +164 -0
- dvt/plugins/manifest.py +21 -0
- dvt/profiler.py +20 -0
- dvt/py.typed +1 -0
- dvt/runners/__init__.py +2 -0
- dvt/runners/exposure_runner.py +7 -0
- dvt/runners/no_op_runner.py +46 -0
- dvt/runners/saved_query_runner.py +7 -0
- dvt/selected_resources.py +8 -0
- dvt/task/__init__.py +0 -0
- dvt/task/base.py +504 -0
- dvt/task/build.py +197 -0
- dvt/task/clean.py +57 -0
- dvt/task/clone.py +162 -0
- dvt/task/compile.py +151 -0
- dvt/task/compute.py +366 -0
- dvt/task/debug.py +650 -0
- dvt/task/deps.py +280 -0
- dvt/task/docs/__init__.py +3 -0
- dvt/task/docs/generate.py +408 -0
- dvt/task/docs/index.html +250 -0
- dvt/task/docs/serve.py +28 -0
- dvt/task/freshness.py +323 -0
- dvt/task/function.py +122 -0
- dvt/task/group_lookup.py +46 -0
- dvt/task/init.py +374 -0
- dvt/task/list.py +237 -0
- dvt/task/printer.py +176 -0
- dvt/task/profiles.py +256 -0
- dvt/task/retry.py +175 -0
- dvt/task/run.py +1146 -0
- dvt/task/run_operation.py +142 -0
- dvt/task/runnable.py +802 -0
- dvt/task/seed.py +104 -0
- dvt/task/show.py +150 -0
- dvt/task/snapshot.py +57 -0
- dvt/task/sql.py +111 -0
- dvt/task/test.py +464 -0
- dvt/tests/fixtures/__init__.py +1 -0
- dvt/tests/fixtures/project.py +620 -0
- dvt/tests/util.py +651 -0
- dvt/tracking.py +529 -0
- dvt/utils/__init__.py +3 -0
- dvt/utils/artifact_upload.py +151 -0
- dvt/utils/utils.py +408 -0
- dvt/version.py +249 -0
- dvt_core-1.11.0b4.dist-info/METADATA +252 -0
- dvt_core-1.11.0b4.dist-info/RECORD +261 -0
- dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
- dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
- dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
dvt/compute/__init__.py
ADDED
@@ -0,0 +1,26 @@
+"""
+DVT Compute Layer
+
+This package provides the compute engine abstraction for processing
+heterogeneous data sources.
+"""
+
+from dvt.compute.base import (
+    BaseComputeEngine,
+    ComputeResult,
+    QueryExecutionPlan,
+    SourceInfo,
+)
+from dvt.compute.query_analyzer import QueryAnalyzer, analyze_query_sources
+from dvt.compute.router import ExecutionRouter, ExecutionStrategy
+
+__all__ = [
+    "BaseComputeEngine",
+    "ComputeResult",
+    "QueryExecutionPlan",
+    "SourceInfo",
+    "ExecutionRouter",
+    "ExecutionStrategy",
+    "QueryAnalyzer",
+    "analyze_query_sources",
+]
dvt/compute/base.py
ADDED
@@ -0,0 +1,288 @@
+"""
+Base compute engine abstraction.
+
+This module defines the interface that all compute engines (DuckDB, Spark)
+must implement.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from dbt.adapters.base import BaseRelation
+
+
+class ExecutionStrategy(Enum):
+    """Strategy for executing a query."""
+
+    PUSHDOWN = "pushdown"  # Execute on source database
+    COMPUTE_LAYER = "compute_layer"  # Execute in DuckDB/Spark
+    AUTO = "auto"  # Let DVT decide
+
+
+@dataclass
+class SourceInfo:
+    """Information about a data source referenced in a query."""
+
+    profile_name: str
+    adapter_type: str
+    relation: BaseRelation
+    estimated_rows: Optional[int] = None
+    estimated_size_mb: Optional[float] = None
+
+
+@dataclass
+class QueryExecutionPlan:
+    """
+    Execution plan for a query.
+
+    This describes how DVT will execute the query (pushdown vs compute layer).
+    """
+
+    strategy: ExecutionStrategy
+    compute_engine: Optional[str] = None  # 'duckdb', 'spark_local', 'spark_cluster'
+    sources: List[SourceInfo] = field(default_factory=list)
+    is_homogeneous: bool = True  # All sources same adapter
+    estimated_data_size_mb: float = 0.0
+    estimated_rows: int = 0
+    pushdown_target: Optional[str] = None  # Which adapter to push down to
+    reason: str = ""  # Explanation of strategy choice
+
+    def is_pushdown_possible(self) -> bool:
+        """Check if pushdown is possible."""
+        return self.is_homogeneous and len(set(s.profile_name for s in self.sources)) == 1
+
+    def get_unique_adapters(self) -> Set[str]:
+        """Get set of unique adapter types."""
+        return {s.adapter_type for s in self.sources}
+
+    def get_unique_profiles(self) -> Set[str]:
+        """Get set of unique profile names."""
+        return {s.profile_name for s in self.sources}
+
+
+@dataclass
+class ComputeResult:
+    """Result of compute engine execution."""
+
+    success: bool
+    rows_affected: int = 0
+    execution_time_ms: float = 0.0
+    strategy_used: Optional[ExecutionStrategy] = None
+    compute_engine_used: Optional[str] = None
+    error: Optional[str] = None
+    warnings: List[str] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Set execution timestamp."""
+        self.metadata.setdefault("executed_at", datetime.now().isoformat())
+
+
+class BaseComputeEngine(ABC):
+    """
+    Base class for compute engines.
+
+    All compute engines (DuckDB, Spark) must implement this interface.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """
+        Initialize compute engine.
+
+        Args:
+            config: Engine-specific configuration
+        """
+        self.config = config
+        self._initialized = False
+
+    @abstractmethod
+    def initialize(self) -> None:
+        """
+        Initialize the compute engine.
+
+        This is called once before any queries are executed.
+        Should set up connections, load extensions, etc.
+        """
+        pass
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """
+        Shutdown the compute engine.
+
+        Clean up resources, close connections, etc.
+        """
+        pass
+
+    @abstractmethod
+    def execute_query(
+        self,
+        sql: str,
+        execution_plan: QueryExecutionPlan,
+    ) -> ComputeResult:
+        """
+        Execute a SQL query using this compute engine.
+
+        Args:
+            sql: SQL query to execute
+            execution_plan: Execution plan with source information
+
+        Returns:
+            ComputeResult with execution status and metadata
+        """
+        pass
+
+    @abstractmethod
+    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
+        """
+        Check if this engine can handle the given execution plan.
+
+        Args:
+            execution_plan: Execution plan to check
+
+        Returns:
+            True if this engine can handle the plan
+        """
+        pass
+
+    @abstractmethod
+    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
+        """
+        Estimate cost of executing with this engine.
+
+        Lower cost = better choice.
+
+        Args:
+            execution_plan: Execution plan to estimate
+
+        Returns:
+            Cost estimate (arbitrary units, relative to other engines)
+        """
+        pass
+
+    @abstractmethod
+    def get_engine_name(self) -> str:
+        """Get name of this compute engine."""
+        pass
+
+    @abstractmethod
+    def test_connection(self) -> Tuple[bool, Optional[str]]:
+        """
+        Test if engine is available and working.
+
+        Returns:
+            (success, error_message)
+        """
+        pass
+
+    def is_initialized(self) -> bool:
+        """Check if engine is initialized."""
+        return self._initialized
+
+    def __enter__(self):
+        """Context manager entry."""
+        if not self._initialized:
+            self.initialize()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.shutdown()
+        return False
+
+
+class PushdownEngine(BaseComputeEngine):
+    """
+    Special compute engine for pushdown execution.
+
+    This doesn't actually compute anything - it delegates to the
+    source adapter directly.
+    """
+
+    def __init__(self, adapter_factory):
+        """
+        Initialize pushdown engine.
+
+        Args:
+            adapter_factory: Factory to get adapters by profile name
+        """
+        super().__init__(config={})
+        self.adapter_factory = adapter_factory
+
+    def initialize(self) -> None:
+        """Initialize (no-op for pushdown)."""
+        self._initialized = True
+
+    def shutdown(self) -> None:
+        """Shutdown (no-op for pushdown)."""
+        self._initialized = False
+
+    def execute_query(
+        self,
+        sql: str,
+        execution_plan: QueryExecutionPlan,
+    ) -> ComputeResult:
+        """
+        Execute query via pushdown to source adapter.
+
+        Args:
+            sql: SQL query to execute
+            execution_plan: Execution plan
+
+        Returns:
+            ComputeResult
+        """
+        if not execution_plan.pushdown_target:
+            return ComputeResult(
+                success=False,
+                error="Pushdown target not specified in execution plan",
+            )
+
+        try:
+            start_time = datetime.now()
+
+            # Get adapter for pushdown target
+            adapter = self.adapter_factory.get_adapter(execution_plan.pushdown_target)
+
+            # Execute on adapter
+            result = adapter.execute(sql, fetch=False)
+
+            # Calculate execution time
+            execution_time = (datetime.now() - start_time).total_seconds() * 1000
+
+            return ComputeResult(
+                success=True,
+                rows_affected=getattr(result, "rows_affected", 0),
+                execution_time_ms=execution_time,
+                strategy_used=ExecutionStrategy.PUSHDOWN,
+                compute_engine_used=execution_plan.pushdown_target,
+            )
+
+        except Exception as e:
+            return ComputeResult(
+                success=False,
+                error=str(e),
+                strategy_used=ExecutionStrategy.PUSHDOWN,
+            )
+
+    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
+        """Check if pushdown is possible."""
+        return execution_plan.is_pushdown_possible()
+
+    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
+        """Pushdown has lowest cost (no data movement)."""
+        if execution_plan.is_pushdown_possible():
+            return 1.0  # Lowest cost
+        else:
+            return float("inf")  # Impossible
+
+    def get_engine_name(self) -> str:
+        """Get engine name."""
+        return "pushdown"
+
+    def test_connection(self) -> Tuple[bool, Optional[str]]:
+        """Test pushdown (always available)."""
+        return (True, None)
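For orientation, here is a minimal sketch (not part of the wheel) of how the planning dataclasses above compose with PushdownEngine. The stub factory and the relation=None stand-in are illustrative assumptions; in DVT a real dbt BaseRelation and adapter factory would be supplied.

from dvt.compute.base import (
    ExecutionStrategy,
    PushdownEngine,
    QueryExecutionPlan,
    SourceInfo,
)

class StubAdapterFactory:
    # Hypothetical stand-in; the real factory resolves dbt adapters by profile name.
    def get_adapter(self, profile_name):
        raise NotImplementedError

# A single-profile, homogeneous plan, so pushdown is possible.
plan = QueryExecutionPlan(
    strategy=ExecutionStrategy.AUTO,
    sources=[
        SourceInfo(
            profile_name="pg_prod",
            adapter_type="postgres",
            relation=None,  # stand-in for a dbt BaseRelation
        )
    ],
    pushdown_target="pg_prod",
)

engine = PushdownEngine(adapter_factory=StubAdapterFactory())
assert plan.is_pushdown_possible()        # one profile, homogeneous
assert engine.can_handle(plan)
assert engine.estimate_cost(plan) == 1.0  # pushdown is cheapest: no data movement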
dvt/compute/engines/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""
+DVT Compute Engines
+
+This package contains implementations of compute engines (DuckDB, Spark).
+"""
+
+from dvt.compute.engines.duckdb_engine import DuckDBEngine
+from dvt.compute.engines.spark_engine import SparkEngine
+
+__all__ = [
+    "DuckDBEngine",
+    "SparkEngine",
+]
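The router that chooses between these engines lives in dvt/compute/router.py, which is not included in this excerpt. As a rough sketch of how the BaseComputeEngine interface supports that choice (hypothetical logic, not the package's actual router):

def pick_engine(engines, plan):
    # Keep only engines that report they can execute the plan,
    # then take the one with the lowest estimated cost.
    viable = [e for e in engines if e.can_handle(plan)]
    if not viable:
        raise RuntimeError("No compute engine can handle this execution plan")
    return min(viable, key=lambda e: e.estimate_cost(plan))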
dvt/compute/engines/duckdb_engine.py
ADDED
@@ -0,0 +1,368 @@
+"""
+DuckDB compute engine implementation.
+
+This module provides DVT's DuckDB compute layer for processing heterogeneous
+data sources.
+"""
+
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple
+
+from dvt.compute.base import (
+    BaseComputeEngine,
+    ComputeResult,
+    ExecutionStrategy,
+    QueryExecutionPlan,
+)
+from dvt.config.compute_config import DuckDBConfig
+from dvt.events import fire_event
+from dvt.events.types import Note
+
+from dbt.adapters.exceptions import DbtRuntimeError
+
+# DuckDB import - will fail gracefully if not installed
+try:
+    import duckdb
+
+    DUCKDB_AVAILABLE = True
+except ImportError:
+    DUCKDB_AVAILABLE = False
+    duckdb = None
+
+
+class DuckDBEngine(BaseComputeEngine):
+    """
+    DuckDB compute engine for DVT.
+
+    Uses DuckDB's ability to directly query multiple database types
+    via scanners (postgres_scanner, mysql_scanner, etc.) and cloud
+    storage (httpfs extension for S3).
+    """
+
+    def __init__(self, config: DuckDBConfig, profile_registry: Any):
+        """
+        Initialize DuckDB engine.
+
+        Args:
+            config: DuckDB configuration
+            profile_registry: Registry for resolving profile connections
+        """
+        super().__init__(config=config.__dict__)
+        self.duckdb_config = config
+        self.profile_registry = profile_registry
+        self.connection: Optional[Any] = None  # duckdb.DuckDBPyConnection
+        self._attached_profiles: set[str] = set()
+
+    def initialize(self) -> None:
+        """Initialize DuckDB connection and load extensions."""
+        if not DUCKDB_AVAILABLE:
+            raise DbtRuntimeError("DuckDB is not installed. Install with: pip install duckdb")
+
+        try:
+            fire_event(Note(msg="Initializing DuckDB compute engine"))
+
+            # Create in-memory connection
+            self.connection = duckdb.connect(":memory:")
+
+            # Configure DuckDB settings
+            self.connection.execute(f"SET memory_limit='{self.duckdb_config.memory_limit}'")
+            self.connection.execute(f"SET threads={self.duckdb_config.threads}")
+            self.connection.execute(f"SET max_memory='{self.duckdb_config.max_memory}'")
+            self.connection.execute(f"SET temp_directory='{self.duckdb_config.temp_directory}'")
+
+            # Enable/disable features
+            if not self.duckdb_config.enable_optimizer:
+                self.connection.execute("SET enable_optimizer=false")
+            if self.duckdb_config.enable_profiling:
+                self.connection.execute("SET enable_profiling=true")
+            if self.duckdb_config.enable_progress_bar:
+                self.connection.execute("SET enable_progress_bar=true")
+
+            # Install and load extensions
+            for ext in self.duckdb_config.extensions:
+                try:
+                    fire_event(Note(msg=f"Installing DuckDB extension: {ext}"))
+                    self.connection.execute(f"INSTALL {ext}")
+                    self.connection.execute(f"LOAD {ext}")
+                except Exception as e:
+                    fire_event(Note(msg=f"Failed to load extension {ext}: {e}"))
+
+            # Configure S3 if specified
+            if self.duckdb_config.s3:
+                self._configure_s3()
+
+            self._initialized = True
+            fire_event(Note(msg="DuckDB engine initialized successfully"))
+
+        except Exception as e:
+            raise DbtRuntimeError(f"Failed to initialize DuckDB engine: {e}")
+
+    def shutdown(self) -> None:
+        """Shutdown DuckDB connection."""
+        if self.connection:
+            try:
+                self.connection.close()
+                fire_event(Note(msg="DuckDB engine shutdown"))
+            except Exception as e:
+                fire_event(Note(msg=f"Error shutting down DuckDB: {e}"))
+            finally:
+                self.connection = None
+                self._initialized = False
+                self._attached_profiles.clear()
+
+    def execute_query(
+        self,
+        sql: str,
+        execution_plan: QueryExecutionPlan,
+    ) -> ComputeResult:
+        """
+        Execute SQL query in DuckDB.
+
+        Args:
+            sql: SQL query to execute
+            execution_plan: Execution plan with source information
+
+        Returns:
+            ComputeResult
+        """
+        if not self._initialized or not self.connection:
+            return ComputeResult(
+                success=False,
+                error="DuckDB engine not initialized",
+            )
+
+        try:
+            start_time = datetime.now()
+
+            # Attach source databases
+            for source in execution_plan.sources:
+                self._attach_profile(source.profile_name, source.adapter_type)
+
+            # Execute query
+            result = self.connection.execute(sql)
+
+            # Get row count if available
+            try:
+                rows_affected = len(result.fetchall()) if result else 0
+            except Exception:
+                rows_affected = 0
+
+            # Calculate execution time
+            execution_time = (datetime.now() - start_time).total_seconds() * 1000
+
+            return ComputeResult(
+                success=True,
+                rows_affected=rows_affected,
+                execution_time_ms=execution_time,
+                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                compute_engine_used="duckdb",
+                metadata={
+                    "attached_profiles": list(self._attached_profiles),
+                },
+            )
+
+        except Exception as e:
+            return ComputeResult(
+                success=False,
+                error=str(e),
+                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                compute_engine_used="duckdb",
+            )
+
+    def _attach_profile(self, profile_name: str, adapter_type: str) -> None:
+        """
+        Attach a profile to DuckDB for querying.
+
+        Uses appropriate scanner based on adapter type:
+        - postgres: postgres_scanner
+        - mysql: mysql_scanner
+        - s3: httpfs extension
+
+        Args:
+            profile_name: Profile name
+            adapter_type: Adapter type (postgres, mysql, etc.)
+        """
+        # Skip if already attached
+        if profile_name in self._attached_profiles:
+            return
+
+        # Get profile configuration
+        profile_config = self.profile_registry.get_or_create_profile(profile_name)
+        if not profile_config:
+            raise DbtRuntimeError(f"Profile '{profile_name}' not found")
+
+        # Attach based on adapter type
+        if adapter_type == "postgres":
+            self._attach_postgres(profile_name, profile_config)
+        elif adapter_type == "mysql":
+            self._attach_mysql(profile_name, profile_config)
+        elif adapter_type == "s3":
+            self._configure_s3_for_profile(profile_name, profile_config)
+        else:
+            fire_event(
+                Note(
+                    msg=f"Warning: Adapter type '{adapter_type}' not yet supported in DuckDB engine"
+                )
+            )
+
+        self._attached_profiles.add(profile_name)
+
+    def _attach_postgres(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
+        """Attach PostgreSQL database using postgres_scanner."""
+        try:
+            # Build connection string
+            conn_str = (
+                f"host={profile_config.get('host')} "
+                f"port={profile_config.get('port', 5432)} "
+                f"dbname={profile_config.get('database')} "
+                f"user={profile_config.get('user')} "
+                f"password={profile_config.get('password')}"
+            )
+
+            # Attach database
+            attach_sql = f"""
+                ATTACH 'postgres:{conn_str}' AS {profile_name} (TYPE POSTGRES)
+            """
+            self.connection.execute(attach_sql)
+
+            fire_event(Note(msg=f"Attached Postgres profile: {profile_name}"))
+
+        except Exception as e:
+            raise DbtRuntimeError(f"Failed to attach Postgres profile '{profile_name}': {e}")
+
+    def _attach_mysql(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
+        """Attach MySQL database using mysql_scanner."""
+        try:
+            # Build connection string
+            conn_str = (
+                f"host={profile_config.get('host')} "
+                f"port={profile_config.get('port', 3306)} "
+                f"database={profile_config.get('database')} "
+                f"user={profile_config.get('user')} "
+                f"password={profile_config.get('password')}"
+            )
+
+            # Attach database
+            attach_sql = f"""
+                ATTACH 'mysql:{conn_str}' AS {profile_name} (TYPE MYSQL)
+            """
+            self.connection.execute(attach_sql)
+
+            fire_event(Note(msg=f"Attached MySQL profile: {profile_name}"))
+
+        except Exception as e:
+            raise DbtRuntimeError(f"Failed to attach MySQL profile '{profile_name}': {e}")
+
+    def _configure_s3(self) -> None:
+        """Configure S3 access for DuckDB."""
+        if not self.duckdb_config.s3:
+            return
+
+        s3_config = self.duckdb_config.s3
+
+        # Set S3 region
+        if "region" in s3_config:
+            self.connection.execute(f"SET s3_region='{s3_config['region']}'")
+
+        # Set credentials from environment or config
+        access_key = s3_config.get("access_key_id") or os.environ.get("AWS_ACCESS_KEY_ID")
+        secret_key = s3_config.get("secret_access_key") or os.environ.get("AWS_SECRET_ACCESS_KEY")
+
+        if access_key and secret_key:
+            self.connection.execute(f"SET s3_access_key_id='{access_key}'")
+            self.connection.execute(f"SET s3_secret_access_key='{secret_key}'")
+
+        # Set other S3 options
+        if "use_ssl" in s3_config:
+            self.connection.execute(f"SET s3_use_ssl={str(s3_config['use_ssl']).lower()}")
+
+        if "url_style" in s3_config:
+            self.connection.execute(f"SET s3_url_style='{s3_config['url_style']}'")
+
+        fire_event(Note(msg="Configured S3 access for DuckDB"))
+
+    def _configure_s3_for_profile(self, profile_name: str, profile_config: Dict[str, Any]) -> None:
+        """Configure S3 access for specific profile."""
+        # S3 configuration is global in DuckDB, but we can set profile-specific settings
+        # For now, just mark as attached
+        fire_event(Note(msg=f"S3 profile '{profile_name}' ready for querying"))
+
+    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
+        """
+        Check if DuckDB can handle this execution plan.
+
+        DuckDB can handle most queries, but:
+        - Pushdown-only queries should go to pushdown engine
+        - Very large datasets (> 1TB) should use Spark
+
+        Args:
+            execution_plan: Execution plan
+
+        Returns:
+            True if DuckDB can handle it
+        """
+        # DuckDB can handle up to ~1TB of data efficiently
+        if execution_plan.estimated_data_size_mb > 1024 * 1024:  # 1TB
+            return False
+
+        # Check if all adapters are supported
+        supported_adapters = {"postgres", "mysql", "s3", "duckdb"}
+        for source in execution_plan.sources:
+            if source.adapter_type not in supported_adapters:
+                fire_event(
+                    Note(msg=f"Adapter '{source.adapter_type}' not supported by DuckDB engine")
+                )
+                return False
+
+        return True
+
+    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
+        """
+        Estimate cost of executing with DuckDB.
+
+        DuckDB is:
+        - Very fast for small data (< 1GB)
+        - Still good for medium data (1-100GB)
+        - Gets slower for large data (> 100GB)
+
+        Args:
+            execution_plan: Execution plan
+
+        Returns:
+            Cost estimate
+        """
+        data_size_gb = execution_plan.estimated_data_size_mb / 1024
+
+        if data_size_gb < 1:
+            return 10.0  # Very low cost for small data
+        elif data_size_gb < 10:
+            return 20.0  # Low cost for medium-small data
+        elif data_size_gb < 100:
+            return 50.0  # Medium cost for medium data
+        else:
+            return 100.0  # High cost for large data (Spark might be better)
+
+    def get_engine_name(self) -> str:
+        """Get engine name."""
+        return "duckdb"
+
+    def test_connection(self) -> Tuple[bool, Optional[str]]:
+        """
+        Test if DuckDB is available and working.
+
+        Returns:
+            (success, error_message)
+        """
+        if not DUCKDB_AVAILABLE:
+            return (False, "DuckDB not installed")
+
+        try:
+            # Try to create a connection
+            conn = duckdb.connect(":memory:")
+            conn.execute("SELECT 1")
+            conn.close()
+            return (True, None)
+        except Exception as e:
+            return (False, str(e))
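The ATTACH statement that _attach_postgres builds is plain DuckDB SQL. A standalone sketch of the same mechanism, with placeholder connection details and assuming the postgres extension can be installed in your environment:

import duckdb

con = duckdb.connect(":memory:")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# Attach a Postgres database under an alias (placeholder credentials),
# mirroring the ATTACH string the engine generates above.
con.execute(
    "ATTACH 'postgres:host=localhost port=5432 dbname=shop "
    "user=app password=secret' AS pg_prod (TYPE POSTGRES)"
)
# Attached catalogs are queryable (and joinable) like local schemas.
print(con.execute("SELECT count(*) FROM pg_prod.public.orders").fetchone())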