dvt_core-1.11.0b4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dvt-core has been flagged as potentially problematic.
- dvt/__init__.py +7 -0
- dvt/_pydantic_shim.py +26 -0
- dvt/adapters/__init__.py +16 -0
- dvt/adapters/multi_adapter_manager.py +268 -0
- dvt/artifacts/__init__.py +0 -0
- dvt/artifacts/exceptions/__init__.py +1 -0
- dvt/artifacts/exceptions/schemas.py +31 -0
- dvt/artifacts/resources/__init__.py +116 -0
- dvt/artifacts/resources/base.py +68 -0
- dvt/artifacts/resources/types.py +93 -0
- dvt/artifacts/resources/v1/analysis.py +10 -0
- dvt/artifacts/resources/v1/catalog.py +23 -0
- dvt/artifacts/resources/v1/components.py +275 -0
- dvt/artifacts/resources/v1/config.py +282 -0
- dvt/artifacts/resources/v1/documentation.py +11 -0
- dvt/artifacts/resources/v1/exposure.py +52 -0
- dvt/artifacts/resources/v1/function.py +53 -0
- dvt/artifacts/resources/v1/generic_test.py +32 -0
- dvt/artifacts/resources/v1/group.py +22 -0
- dvt/artifacts/resources/v1/hook.py +11 -0
- dvt/artifacts/resources/v1/macro.py +30 -0
- dvt/artifacts/resources/v1/metric.py +173 -0
- dvt/artifacts/resources/v1/model.py +146 -0
- dvt/artifacts/resources/v1/owner.py +10 -0
- dvt/artifacts/resources/v1/saved_query.py +112 -0
- dvt/artifacts/resources/v1/seed.py +42 -0
- dvt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dvt/artifacts/resources/v1/semantic_model.py +315 -0
- dvt/artifacts/resources/v1/singular_test.py +14 -0
- dvt/artifacts/resources/v1/snapshot.py +92 -0
- dvt/artifacts/resources/v1/source_definition.py +85 -0
- dvt/artifacts/resources/v1/sql_operation.py +10 -0
- dvt/artifacts/resources/v1/unit_test_definition.py +78 -0
- dvt/artifacts/schemas/__init__.py +0 -0
- dvt/artifacts/schemas/base.py +191 -0
- dvt/artifacts/schemas/batch_results.py +24 -0
- dvt/artifacts/schemas/catalog/__init__.py +12 -0
- dvt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dvt/artifacts/schemas/catalog/v1/catalog.py +60 -0
- dvt/artifacts/schemas/freshness/__init__.py +1 -0
- dvt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dvt/artifacts/schemas/freshness/v3/freshness.py +159 -0
- dvt/artifacts/schemas/manifest/__init__.py +2 -0
- dvt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dvt/artifacts/schemas/manifest/v12/manifest.py +212 -0
- dvt/artifacts/schemas/results.py +148 -0
- dvt/artifacts/schemas/run/__init__.py +2 -0
- dvt/artifacts/schemas/run/v5/__init__.py +0 -0
- dvt/artifacts/schemas/run/v5/run.py +184 -0
- dvt/artifacts/schemas/upgrades/__init__.py +4 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dvt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dvt/artifacts/utils/validation.py +153 -0
- dvt/cli/__init__.py +1 -0
- dvt/cli/context.py +16 -0
- dvt/cli/exceptions.py +56 -0
- dvt/cli/flags.py +558 -0
- dvt/cli/main.py +971 -0
- dvt/cli/option_types.py +121 -0
- dvt/cli/options.py +79 -0
- dvt/cli/params.py +803 -0
- dvt/cli/requires.py +478 -0
- dvt/cli/resolvers.py +32 -0
- dvt/cli/types.py +40 -0
- dvt/clients/__init__.py +0 -0
- dvt/clients/checked_load.py +82 -0
- dvt/clients/git.py +164 -0
- dvt/clients/jinja.py +206 -0
- dvt/clients/jinja_static.py +245 -0
- dvt/clients/registry.py +192 -0
- dvt/clients/yaml_helper.py +68 -0
- dvt/compilation.py +833 -0
- dvt/compute/__init__.py +26 -0
- dvt/compute/base.py +288 -0
- dvt/compute/engines/__init__.py +13 -0
- dvt/compute/engines/duckdb_engine.py +368 -0
- dvt/compute/engines/spark_engine.py +273 -0
- dvt/compute/query_analyzer.py +212 -0
- dvt/compute/router.py +483 -0
- dvt/config/__init__.py +4 -0
- dvt/config/catalogs.py +95 -0
- dvt/config/compute_config.py +406 -0
- dvt/config/profile.py +411 -0
- dvt/config/profiles_v2.py +464 -0
- dvt/config/project.py +893 -0
- dvt/config/renderer.py +232 -0
- dvt/config/runtime.py +491 -0
- dvt/config/selectors.py +209 -0
- dvt/config/utils.py +78 -0
- dvt/connectors/.gitignore +6 -0
- dvt/connectors/README.md +306 -0
- dvt/connectors/catalog.yml +217 -0
- dvt/connectors/download_connectors.py +300 -0
- dvt/constants.py +29 -0
- dvt/context/__init__.py +0 -0
- dvt/context/base.py +746 -0
- dvt/context/configured.py +136 -0
- dvt/context/context_config.py +350 -0
- dvt/context/docs.py +82 -0
- dvt/context/exceptions_jinja.py +179 -0
- dvt/context/macro_resolver.py +195 -0
- dvt/context/macros.py +171 -0
- dvt/context/manifest.py +73 -0
- dvt/context/providers.py +2198 -0
- dvt/context/query_header.py +14 -0
- dvt/context/secret.py +59 -0
- dvt/context/target.py +74 -0
- dvt/contracts/__init__.py +0 -0
- dvt/contracts/files.py +413 -0
- dvt/contracts/graph/__init__.py +0 -0
- dvt/contracts/graph/manifest.py +1904 -0
- dvt/contracts/graph/metrics.py +98 -0
- dvt/contracts/graph/model_config.py +71 -0
- dvt/contracts/graph/node_args.py +42 -0
- dvt/contracts/graph/nodes.py +1806 -0
- dvt/contracts/graph/semantic_manifest.py +233 -0
- dvt/contracts/graph/unparsed.py +812 -0
- dvt/contracts/project.py +417 -0
- dvt/contracts/results.py +53 -0
- dvt/contracts/selection.py +23 -0
- dvt/contracts/sql.py +86 -0
- dvt/contracts/state.py +69 -0
- dvt/contracts/util.py +46 -0
- dvt/deprecations.py +347 -0
- dvt/deps/__init__.py +0 -0
- dvt/deps/base.py +153 -0
- dvt/deps/git.py +196 -0
- dvt/deps/local.py +80 -0
- dvt/deps/registry.py +131 -0
- dvt/deps/resolver.py +149 -0
- dvt/deps/tarball.py +121 -0
- dvt/docs/source/_ext/dbt_click.py +118 -0
- dvt/docs/source/conf.py +32 -0
- dvt/env_vars.py +64 -0
- dvt/event_time/event_time.py +40 -0
- dvt/event_time/sample_window.py +60 -0
- dvt/events/__init__.py +16 -0
- dvt/events/base_types.py +37 -0
- dvt/events/core_types_pb2.py +2 -0
- dvt/events/logging.py +109 -0
- dvt/events/types.py +2534 -0
- dvt/exceptions.py +1487 -0
- dvt/flags.py +89 -0
- dvt/graph/__init__.py +11 -0
- dvt/graph/cli.py +248 -0
- dvt/graph/graph.py +172 -0
- dvt/graph/queue.py +213 -0
- dvt/graph/selector.py +375 -0
- dvt/graph/selector_methods.py +976 -0
- dvt/graph/selector_spec.py +223 -0
- dvt/graph/thread_pool.py +18 -0
- dvt/hooks.py +21 -0
- dvt/include/README.md +49 -0
- dvt/include/__init__.py +3 -0
- dvt/include/global_project.py +4 -0
- dvt/include/starter_project/.gitignore +4 -0
- dvt/include/starter_project/README.md +15 -0
- dvt/include/starter_project/__init__.py +3 -0
- dvt/include/starter_project/analyses/.gitkeep +0 -0
- dvt/include/starter_project/dvt_project.yml +36 -0
- dvt/include/starter_project/macros/.gitkeep +0 -0
- dvt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dvt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dvt/include/starter_project/models/example/schema.yml +21 -0
- dvt/include/starter_project/seeds/.gitkeep +0 -0
- dvt/include/starter_project/snapshots/.gitkeep +0 -0
- dvt/include/starter_project/tests/.gitkeep +0 -0
- dvt/internal_deprecations.py +27 -0
- dvt/jsonschemas/__init__.py +3 -0
- dvt/jsonschemas/jsonschemas.py +309 -0
- dvt/jsonschemas/project/0.0.110.json +4717 -0
- dvt/jsonschemas/project/0.0.85.json +2015 -0
- dvt/jsonschemas/resources/0.0.110.json +2636 -0
- dvt/jsonschemas/resources/0.0.85.json +2536 -0
- dvt/jsonschemas/resources/latest.json +6773 -0
- dvt/links.py +4 -0
- dvt/materializations/__init__.py +0 -0
- dvt/materializations/incremental/__init__.py +0 -0
- dvt/materializations/incremental/microbatch.py +235 -0
- dvt/mp_context.py +8 -0
- dvt/node_types.py +37 -0
- dvt/parser/__init__.py +23 -0
- dvt/parser/analysis.py +21 -0
- dvt/parser/base.py +549 -0
- dvt/parser/common.py +267 -0
- dvt/parser/docs.py +52 -0
- dvt/parser/fixtures.py +51 -0
- dvt/parser/functions.py +30 -0
- dvt/parser/generic_test.py +100 -0
- dvt/parser/generic_test_builders.py +334 -0
- dvt/parser/hooks.py +119 -0
- dvt/parser/macros.py +137 -0
- dvt/parser/manifest.py +2204 -0
- dvt/parser/models.py +574 -0
- dvt/parser/partial.py +1179 -0
- dvt/parser/read_files.py +445 -0
- dvt/parser/schema_generic_tests.py +423 -0
- dvt/parser/schema_renderer.py +111 -0
- dvt/parser/schema_yaml_readers.py +936 -0
- dvt/parser/schemas.py +1467 -0
- dvt/parser/search.py +149 -0
- dvt/parser/seeds.py +28 -0
- dvt/parser/singular_test.py +20 -0
- dvt/parser/snapshots.py +44 -0
- dvt/parser/sources.py +557 -0
- dvt/parser/sql.py +63 -0
- dvt/parser/unit_tests.py +622 -0
- dvt/plugins/__init__.py +20 -0
- dvt/plugins/contracts.py +10 -0
- dvt/plugins/exceptions.py +2 -0
- dvt/plugins/manager.py +164 -0
- dvt/plugins/manifest.py +21 -0
- dvt/profiler.py +20 -0
- dvt/py.typed +1 -0
- dvt/runners/__init__.py +2 -0
- dvt/runners/exposure_runner.py +7 -0
- dvt/runners/no_op_runner.py +46 -0
- dvt/runners/saved_query_runner.py +7 -0
- dvt/selected_resources.py +8 -0
- dvt/task/__init__.py +0 -0
- dvt/task/base.py +504 -0
- dvt/task/build.py +197 -0
- dvt/task/clean.py +57 -0
- dvt/task/clone.py +162 -0
- dvt/task/compile.py +151 -0
- dvt/task/compute.py +366 -0
- dvt/task/debug.py +650 -0
- dvt/task/deps.py +280 -0
- dvt/task/docs/__init__.py +3 -0
- dvt/task/docs/generate.py +408 -0
- dvt/task/docs/index.html +250 -0
- dvt/task/docs/serve.py +28 -0
- dvt/task/freshness.py +323 -0
- dvt/task/function.py +122 -0
- dvt/task/group_lookup.py +46 -0
- dvt/task/init.py +374 -0
- dvt/task/list.py +237 -0
- dvt/task/printer.py +176 -0
- dvt/task/profiles.py +256 -0
- dvt/task/retry.py +175 -0
- dvt/task/run.py +1146 -0
- dvt/task/run_operation.py +142 -0
- dvt/task/runnable.py +802 -0
- dvt/task/seed.py +104 -0
- dvt/task/show.py +150 -0
- dvt/task/snapshot.py +57 -0
- dvt/task/sql.py +111 -0
- dvt/task/test.py +464 -0
- dvt/tests/fixtures/__init__.py +1 -0
- dvt/tests/fixtures/project.py +620 -0
- dvt/tests/util.py +651 -0
- dvt/tracking.py +529 -0
- dvt/utils/__init__.py +3 -0
- dvt/utils/artifact_upload.py +151 -0
- dvt/utils/utils.py +408 -0
- dvt/version.py +249 -0
- dvt_core-1.11.0b4.dist-info/METADATA +252 -0
- dvt_core-1.11.0b4.dist-info/RECORD +261 -0
- dvt_core-1.11.0b4.dist-info/WHEEL +5 -0
- dvt_core-1.11.0b4.dist-info/entry_points.txt +2 -0
- dvt_core-1.11.0b4.dist-info/top_level.txt +1 -0
dvt/compute/engines/spark_engine.py
@@ -0,0 +1,273 @@
+"""
+Spark compute engine implementation.
+
+This module provides DVT's Spark compute layer for large-scale processing
+of heterogeneous data sources.
+
+DVT uses dbt adapters for data extraction, eliminating the need for JDBC JARs:
+- Data extracted via dbt adapters (Python)
+- Converted to Arrow format for efficient transfer
+- Loaded into Spark DataFrames
+- No JDBC drivers or JAR management required
+"""
+
+from datetime import datetime
+from typing import Any, Optional, Tuple, Union
+
+from dvt.compute.base import (
+    BaseComputeEngine,
+    ComputeResult,
+    ExecutionStrategy,
+    QueryExecutionPlan,
+)
+from dvt.config.compute_config import SparkClusterConfig, SparkLocalConfig
+from dvt.events import fire_event
+from dvt.events.types import Note
+
+from dbt.adapters.exceptions import DbtRuntimeError
+
+# PySpark import - will fail gracefully if not installed
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class SparkEngine(BaseComputeEngine):
+    """
+    Spark compute engine for DVT.
+
+    Supports both local (single-node) and cluster (distributed) modes.
+    """
+
+    def __init__(
+        self,
+        config: Union[SparkLocalConfig, SparkClusterConfig],
+        profile_registry: Any,
+        mode: str = "local",
+    ):
+        """
+        Initialize Spark engine.
+
+        Args:
+            config: Spark configuration (local or cluster)
+            profile_registry: Registry for resolving profile connections
+            mode: 'local' or 'cluster'
+        """
+        super().__init__(config=config.__dict__)
+        self.spark_config = config
+        self.profile_registry = profile_registry
+        self.mode = mode
+        self.spark: Optional[Any] = None  # SparkSession
+
+    def initialize(self) -> None:
+        """Initialize Spark session and load connectors."""
+        if not PYSPARK_AVAILABLE:
+            raise DbtRuntimeError("PySpark is not installed. Install with: pip install pyspark")
+
+        try:
+            fire_event(Note(msg=f"Initializing Spark engine ({self.mode} mode)"))
+
+            # Create Spark session builder
+            builder = SparkSession.builder.appName(self.spark_config.app_name)
+
+            # Set master
+            builder = builder.master(self.spark_config.master)
+
+            # Set memory and cores
+            if hasattr(self.spark_config, "memory"):
+                builder = builder.config("spark.executor.memory", self.spark_config.memory)
+            if hasattr(self.spark_config, "driver_memory"):
+                builder = builder.config("spark.driver.memory", self.spark_config.driver_memory)
+            if hasattr(self.spark_config, "executor_cores"):
+                builder = builder.config(
+                    "spark.executor.cores", str(self.spark_config.executor_cores)
+                )
+
+            # Apply additional config
+            for key, value in self.spark_config.config.items():
+                builder = builder.config(key, value)
+
+            # Note: No JDBC JARs needed - DVT uses dbt adapters for data extraction
+
+            # Create session
+            self.spark = builder.getOrCreate()
+
+            # Set log level
+            self.spark.sparkContext.setLogLevel(self.spark_config.log_level)
+
+            self._initialized = True
+            fire_event(Note(msg="Spark engine initialized successfully"))
+
+        except Exception as e:
+            raise DbtRuntimeError(f"Failed to initialize Spark engine: {e}")
+
+    def shutdown(self) -> None:
+        """Shutdown Spark session."""
+        if self.spark:
+            try:
+                self.spark.stop()
+                fire_event(Note(msg="Spark engine shutdown"))
+            except Exception as e:
+                fire_event(Note(msg=f"Error shutting down Spark: {e}"))
+            finally:
+                self.spark = None
+                self._initialized = False
+
+    def execute_query(
+        self,
+        sql: str,
+        execution_plan: QueryExecutionPlan,
+    ) -> ComputeResult:
+        """
+        Execute SQL query in Spark.
+
+        Args:
+            sql: SQL query to execute
+            execution_plan: Execution plan with source information
+
+        Returns:
+            ComputeResult
+        """
+        if not self._initialized or not self.spark:
+            return ComputeResult(
+                success=False,
+                error="Spark engine not initialized",
+            )
+
+        try:
+            start_time = datetime.now()
+
+            # Register source tables
+            for source in execution_plan.sources:
+                self._register_source(source.profile_name, source.adapter_type, source.relation)
+
+            # Execute query
+            df = self.spark.sql(sql)
+
+            # Get row count (triggers execution)
+            rows_affected = df.count()
+
+            # Calculate execution time
+            execution_time = (datetime.now() - start_time).total_seconds() * 1000
+
+            return ComputeResult(
+                success=True,
+                rows_affected=rows_affected,
+                execution_time_ms=execution_time,
+                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                compute_engine_used=f"spark_{self.mode}",
+            )
+
+        except Exception as e:
+            return ComputeResult(
+                success=False,
+                error=str(e),
+                strategy_used=ExecutionStrategy.COMPUTE_LAYER,
+                compute_engine_used=f"spark_{self.mode}",
+            )
+
+    def _register_source(self, profile_name: str, adapter_type: str, relation: Any) -> None:
+        """
+        Register a source table in Spark.
+
+        TODO: Replace with dbt adapter-based extraction:
+        1. Get dbt adapter for profile
+        2. Execute SELECT * FROM table via adapter
+        3. Convert Agate table to Arrow
+        4. Create Spark DataFrame from Arrow
+        5. Register as temp view
+
+        Args:
+            profile_name: Profile name
+            adapter_type: Adapter type
+            relation: Relation object
+        """
+        # TODO: Implement dbt adapter-based extraction
+        # For now, this is a placeholder
+        fire_event(
+            Note(
+                msg=f"TODO: Extract {relation} from {profile_name} via dbt adapter (not yet implemented)"
+            )
+        )
+
+    def can_handle(self, execution_plan: QueryExecutionPlan) -> bool:
+        """
+        Check if Spark can handle this execution plan.
+
+        Spark can handle any database with a dbt adapter (uses adapter-based extraction).
+
+        Args:
+            execution_plan: Execution plan
+
+        Returns:
+            True if Spark can handle it
+        """
+        # Spark can handle any source with a dbt adapter
+        # Data extracted via adapter, converted to Arrow, loaded into Spark
+        return True
+
+    def estimate_cost(self, execution_plan: QueryExecutionPlan) -> float:
+        """
+        Estimate cost of executing with Spark.
+
+        Spark has higher overhead but scales better:
+        - Small data (< 1GB): High cost (DuckDB is better)
+        - Medium data (1-10GB): Medium cost
+        - Large data (> 10GB): Low cost (Spark shines here)
+
+        Args:
+            execution_plan: Execution plan
+
+        Returns:
+            Cost estimate
+        """
+        data_size_gb = execution_plan.estimated_data_size_mb / 1024
+
+        if self.mode == "local":
+            # Local mode has startup overhead
+            if data_size_gb < 1:
+                return 80.0  # High cost for small data
+            elif data_size_gb < 10:
+                return 40.0  # Medium cost
+            else:
+                return 20.0  # Low cost for large data
+        else:
+            # Cluster mode has even more overhead but scales better
+            if data_size_gb < 10:
+                return 100.0  # Very high cost for small/medium data
+            elif data_size_gb < 100:
+                return 30.0  # Medium cost
+            else:
+                return 10.0  # Very low cost for huge data
+
+    def get_engine_name(self) -> str:
+        """Get engine name."""
+        return f"spark_{self.mode}"
+
+    def test_connection(self) -> Tuple[bool, Optional[str]]:
+        """
+        Test if Spark is available and working.
+
+        Returns:
+            (success, error_message)
+        """
+        if not PYSPARK_AVAILABLE:
+            return (False, "PySpark not installed")
+
+        try:
+            # Try to create a session
+            spark = (
+                SparkSession.builder.master("local[1]")
+                .appName("dvt-test")
+                .config("spark.ui.enabled", "false")
+                .getOrCreate()
+            )
+            spark.sql("SELECT 1").collect()
+            spark.stop()
+            return (True, None)
+        except Exception as e:
+            return (False, str(e))
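The `_register_source` placeholder above spells out the intended pipeline: pull the table through a dbt adapter, hand the rows over as Arrow, and register the result as a Spark temp view. A minimal sketch of the Arrow-to-Spark half of that flow, assuming a hypothetical `fetch_arrow_table` extraction helper (not part of DVT; only the PySpark and PyArrow calls are real APIs):

```python
# Sketch of the adapter -> Arrow -> Spark flow from _register_source's TODO.
# `fetch_arrow_table` is a hypothetical stand-in for DVT's eventual
# adapter-based extraction.
import pyarrow as pa
from pyspark.sql import SparkSession


def register_arrow_view(spark: SparkSession, table: pa.Table, view_name: str) -> None:
    """Load a pyarrow Table into Spark and expose it as a temp view."""
    # Arrow -> pandas -> Spark DataFrame; with
    # spark.sql.execution.arrow.pyspark.enabled=true Spark uses Arrow for
    # the pandas conversion instead of row-by-row serialization.
    df = spark.createDataFrame(table.to_pandas())
    df.createOrReplaceTempView(view_name)


if __name__ == "__main__":
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("arrow-view-demo")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .getOrCreate()
    )
    # Stand-in for: fetch_arrow_table(profile_name, "SELECT * FROM raw.orders")
    demo = pa.table({"id": [1, 2], "amount": [9.5, 3.25]})
    register_arrow_view(spark, demo, "raw_orders")
    spark.sql("SELECT count(*) AS n FROM raw_orders").show()
    spark.stop()
```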
dvt/compute/query_analyzer.py
@@ -0,0 +1,212 @@
+"""
+Query analyzer for DVT execution routing.
+
+This module analyzes SQL queries to extract source references and build
+execution plans for the ExecutionRouter.
+"""
+
+import re
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from dvt.compute.base import QueryExecutionPlan, SourceInfo
+from dvt.contracts.graph.manifest import Manifest
+
+from dbt_common.events.functions import fire_event
+from dbt_common.events.types import Note
+from dbt_common.exceptions import DbtRuntimeError
+
+
+class QueryAnalyzer:
+    """
+    Analyzes SQL queries to extract source dependencies.
+
+    This analyzer:
+    - Parses Jinja source() calls to find source references
+    - Looks up source metadata from manifest
+    - Estimates data sizes for execution planning
+    - Builds SourceInfo objects for ExecutionRouter
+    """
+
+    def __init__(self, manifest: Manifest):
+        """
+        Initialize QueryAnalyzer.
+
+        Args:
+            manifest: Manifest with source definitions
+        """
+        self.manifest = manifest
+
+    def analyze_model_sql(self, sql: str, model_node: Optional[Any] = None) -> List[SourceInfo]:
+        """
+        Analyze SQL to extract source references.
+
+        Args:
+            sql: SQL query (may contain Jinja)
+            model_node: Optional model node for context
+
+        Returns:
+            List of SourceInfo objects
+        """
+        # Extract source references from SQL
+        source_refs = self._extract_source_references(sql)
+
+        # Build SourceInfo for each reference
+        source_infos: List[SourceInfo] = []
+        for source_name, table_name in source_refs:
+            source_info = self._build_source_info(source_name, table_name)
+            if source_info:
+                source_infos.append(source_info)
+
+        return source_infos
+
+    def _extract_source_references(self, sql: str) -> Set[Tuple[str, str]]:
+        """
+        Extract source() references from SQL.
+
+        Looks for patterns like:
+        - {{ source('schema_name', 'table_name') }}
+        - {{source("schema_name", "table_name")}}
+        - {{ source( 'schema_name' , 'table_name' ) }}
+
+        Args:
+            sql: SQL query with Jinja
+
+        Returns:
+            Set of (source_name, table_name) tuples
+        """
+        sources: Set[Tuple[str, str]] = set()
+
+        # Pattern to match source() calls
+        # Handles single or double quotes, optional whitespace
+        pattern = r"{{\s*source\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)\s*}}"
+
+        matches = re.finditer(pattern, sql, re.IGNORECASE)
+        for match in matches:
+            source_name = match.group(1)
+            table_name = match.group(2)
+            sources.add((source_name, table_name))
+
+        return sources
+
+    def _build_source_info(self, source_name: str, table_name: str) -> Optional[SourceInfo]:
+        """
+        Build SourceInfo from source reference.
+
+        Args:
+            source_name: Source schema name
+            table_name: Table name
+
+        Returns:
+            SourceInfo or None if source not found
+        """
+        # Resolve source in manifest
+        source = self.manifest.resolve_source(
+            source_name=source_name,
+            table_name=table_name,
+            current_project=None,
+            node_package=None,
+        )
+
+        if not source or not hasattr(source, "unique_id"):
+            fire_event(
+                Note(msg=f"Warning: Source '{source_name}.{table_name}' not found in manifest")
+            )
+            return None
+
+        # Extract profile name (DVT-specific)
+        profile_name = getattr(source, "profile", None)
+        if not profile_name:
+            # No profile specified - use default target profile
+            fire_event(
+                Note(
+                    msg=f"Source '{source_name}.{table_name}' has no profile, "
+                    "will use default target"
+                )
+            )
+            profile_name = "default"
+
+        # Determine adapter type
+        # TODO: Look up adapter type from profile
+        adapter_type = "unknown"
+
+        # Estimate data size
+        # TODO: Implement actual size estimation
+        # For now, use placeholder values
+        estimated_size_mb = None
+        estimated_rows = None
+
+        # Build SourceInfo
+        source_info = SourceInfo(
+            source_name=f"{source_name}.{table_name}",
+            profile_name=profile_name,
+            adapter_type=adapter_type,
+            database=getattr(source, "database", None),
+            schema=getattr(source, "schema", source_name),
+            identifier=getattr(source, "identifier", table_name),
+            estimated_size_mb=estimated_size_mb,
+            estimated_rows=estimated_rows,
+        )
+
+        return source_info
+
+    def build_execution_plan_for_model(
+        self, sql: str, model_node: Optional[Any] = None
+    ) -> QueryExecutionPlan:
+        """
+        Build complete execution plan for a model.
+
+        This is a convenience method that:
+        1. Analyzes SQL to extract sources
+        2. Builds SourceInfo objects
+        3. Creates QueryExecutionPlan
+
+        Args:
+            sql: SQL query
+            model_node: Optional model node
+
+        Returns:
+            QueryExecutionPlan ready for strategy selection
+        """
+        # Analyze sources
+        sources = self.analyze_model_sql(sql, model_node)
+
+        # Calculate metrics
+        unique_profiles = {s.profile_name for s in sources}
+        unique_adapters = {s.adapter_type for s in sources}
+        is_homogeneous = len(unique_profiles) <= 1 and len(unique_adapters) <= 1
+
+        # Estimate data size
+        total_size_mb = sum(s.estimated_size_mb or 0 for s in sources)
+        total_rows = sum(s.estimated_rows or 0 for s in sources)
+
+        # Create execution plan
+        from dvt.compute.base import ExecutionStrategy
+
+        plan = QueryExecutionPlan(
+            strategy=ExecutionStrategy.AUTO,
+            sources=sources,
+            is_homogeneous=is_homogeneous,
+            estimated_data_size_mb=total_size_mb if total_size_mb > 0 else None,
+            estimated_rows=total_rows if total_rows > 0 else None,
+        )
+
+        # Set pushdown target if homogeneous
+        if is_homogeneous and sources:
+            plan.pushdown_target = sources[0].profile_name
+
+        return plan
+
+
+def analyze_query_sources(sql: str, manifest: Manifest) -> List[SourceInfo]:
+    """
+    Convenience function to analyze query sources.
+
+    Args:
+        sql: SQL query
+        manifest: Manifest
+
+    Returns:
+        List of SourceInfo objects
+    """
+    analyzer = QueryAnalyzer(manifest)
+    return analyzer.analyze_model_sql(sql)
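The regex in `_extract_source_references` can be exercised standalone; a quick check against the quoting variants its docstring lists, with the pattern copied verbatim from the file:

```python
import re

# Pattern copied from _extract_source_references: single or double quotes,
# optional whitespace inside the Jinja call.
PATTERN = r"{{\s*source\s*\(\s*['\"]([^'\"]+)['\"]\s*,\s*['\"]([^'\"]+)['\"]\s*\)\s*}}"

sql = """
select *
from {{ source('raw_shop', 'orders') }} o
join {{source("raw_crm", "customers")}} c on o.customer_id = c.id
"""

print(sorted(set(re.findall(PATTERN, sql))))
# [('raw_crm', 'customers'), ('raw_shop', 'orders')]
```

Note that a regex pass only sees literal source() calls; a reference produced indirectly (for example by a macro that expands to source(...)) would not be picked up until the Jinja is rendered. For callers, `analyze_query_sources(sql, manifest)` is the one-shot entry point wrapping `QueryAnalyzer.analyze_model_sql`.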