dvt-core 0.52.2 (cp310-cp310-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2039 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +804 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.py +624 -0
- dbt/compute/federated_executor.py +837 -0
- dbt/compute/filter_pushdown.cpython-310-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-310-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-310-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-310-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/smart_selector.cpython-310-darwin.so +0 -0
- dbt/compute/smart_selector.py +311 -0
- dbt/compute/strategies/__init__.py +54 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.py +364 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-310-darwin.so +0 -0
- dbt/config/compute.py +547 -0
- dbt/config/dvt_profile.cpython-310-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +346 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +247 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-310-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.py +454 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/generate.py +660 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.py +29 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.py +553 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/printer.py +175 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1306 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.py +759 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +268 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.52.2.dist-info/METADATA +286 -0
- dvt_core-0.52.2.dist-info/RECORD +275 -0
- dvt_core-0.52.2.dist-info/WHEEL +5 -0
- dvt_core-0.52.2.dist-info/entry_points.txt +2 -0
- dvt_core-0.52.2.dist-info/top_level.txt +2 -0
dbt/compute/smart_selector.py
@@ -0,0 +1,311 @@
+"""
+Smart Compute Engine Selector
+
+Automatically selects the optimal compute engine (Spark Local vs Spark Cluster) based on
+workload characteristics when user doesn't specify a preference.
+
+Selection criteria:
+- Estimated data size
+- Number of sources
+- Query complexity
+- Available resources
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+
+
+@dataclass
+class WorkloadEstimate:
+    """Estimated workload characteristics for a query."""
+
+    estimated_rows: int  # Estimated total rows to process
+    source_count: int  # Number of source tables
+    connection_count: int  # Number of different connections
+    has_aggregations: bool  # Query contains GROUP BY or aggregations
+    has_joins: bool  # Query contains JOIN operations
+    complexity_score: float  # 0.0 to 1.0, higher = more complex
+
+    @property
+    def estimated_data_mb(self) -> float:
+        """Rough estimate of data size in MB (assuming ~100 bytes/row)."""
+        return (self.estimated_rows * 100) / (1024 * 1024)
+
+
+class SmartComputeSelector:
+    """
+    Intelligently selects compute engine based on workload characteristics.
+
+    v0.3.0: Unified Spark architecture - selects between spark-local and spark-cluster.
+
+    Default thresholds:
+    - Small/medium workload (<10GB): spark-local
+    - Large workload (>10GB): spark-cluster (if configured)
+    """
+
+    # Default thresholds (can be configured)
+    CLUSTER_THRESHOLD_MB = 10000  # 10GB - threshold for cluster recommendation
+    CLUSTER_THRESHOLD_GB = 10  # Same in GB for clarity
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        cluster_threshold_mb: Optional[int] = None,
+        compute_registry: Optional[Any] = None,
+    ):
+        """
+        Initialize smart selector.
+
+        :param manifest: The dbt manifest
+        :param cluster_threshold_mb: Data size threshold for cluster (default: 10GB)
+        :param compute_registry: ComputeRegistry instance for checking cluster availability
+        """
+        self.manifest = manifest
+        self.cluster_threshold_mb = cluster_threshold_mb or self.CLUSTER_THRESHOLD_MB
+        self.compute_registry = compute_registry
+
+    def select_engine(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Select the optimal compute engine for a node.
+
+        v0.3.0: Returns "spark-local" or "spark-cluster"
+
+        :param node: The node to execute
+        :param analysis_result: Query analysis result
+        :returns: "spark-local" or "spark-cluster"
+        """
+        # Estimate workload
+        estimate = self._estimate_workload(node, analysis_result)
+
+        # Apply selection logic
+        return self._apply_selection_logic(estimate)
+
+    def _estimate_workload(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> WorkloadEstimate:
+        """
+        Estimate workload characteristics for a node.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: WorkloadEstimate
+        """
+        # Count sources
+        source_count = len(analysis_result.source_refs)
+        connection_count = len(analysis_result.source_connections)
+
+        # Estimate row count from sources
+        estimated_rows = self._estimate_row_count(analysis_result.source_refs)
+
+        # Analyze SQL for complexity
+        sql = node.compiled_code if hasattr(node, "compiled_code") else node.raw_code
+        has_aggregations = self._has_aggregations(sql)
+        has_joins = self._has_joins(sql)
+
+        # Calculate complexity score
+        complexity_score = self._calculate_complexity(
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+        )
+
+        return WorkloadEstimate(
+            estimated_rows=estimated_rows,
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+            complexity_score=complexity_score,
+        )
+
+    def _estimate_row_count(self, source_refs: set) -> int:
+        """
+        Estimate total row count from source tables.
+
+        Uses catalog metadata if available, otherwise uses heuristics.
+
+        :param source_refs: Set of source unique_ids
+        :returns: Estimated row count
+        """
+        total_rows = 0
+
+        for source_id in source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                # Unknown source, use conservative estimate
+                total_rows += 100000
+                continue
+
+            # Check if we have catalog metadata with row counts
+            # Note: This would come from `dbt docs generate`
+            # For now, use a heuristic based on naming
+            if (
+                "fact" in source.identifier.lower()
+                or "events" in source.identifier.lower()
+            ):
+                # Fact tables tend to be larger
+                total_rows += 1000000
+            elif (
+                "dim" in source.identifier.lower()
+                or "lookup" in source.identifier.lower()
+            ):
+                # Dimension tables tend to be smaller
+                total_rows += 10000
+            else:
+                # Default estimate
+                total_rows += 100000
+
+        return total_rows
+
+    def _has_aggregations(self, sql: str) -> bool:
+        """Check if SQL contains aggregations."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " GROUP BY ",
+                " SUM(",
+                " COUNT(",
+                " AVG(",
+                " MIN(",
+                " MAX(",
+                " HAVING ",
+            ]
+        )
+
+    def _has_joins(self, sql: str) -> bool:
+        """Check if SQL contains joins."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " JOIN ",
+                " INNER JOIN ",
+                " LEFT JOIN ",
+                " RIGHT JOIN ",
+                " FULL JOIN ",
+                " CROSS JOIN ",
+            ]
+        )
+
+    def _calculate_complexity(
+        self,
+        source_count: int,
+        connection_count: int,
+        has_aggregations: bool,
+        has_joins: bool,
+    ) -> float:
+        """
+        Calculate query complexity score (0.0 to 1.0).
+
+        :returns: Complexity score
+        """
+        score = 0.0
+
+        # Source count contributes
+        score += min(source_count / 10.0, 0.3)
+
+        # Multiple connections increases complexity
+        score += min(connection_count / 5.0, 0.2)
+
+        # Aggregations add complexity
+        if has_aggregations:
+            score += 0.2
+
+        # Joins add complexity
+        if has_joins:
+            score += 0.3
+
+        return min(score, 1.0)
+
+    def _apply_selection_logic(self, estimate: WorkloadEstimate) -> str:
+        """
+        Apply selection logic based on workload estimate.
+
+        v0.3.0: Selects between spark-local and spark-cluster only.
+
+        :param estimate: WorkloadEstimate
+        :returns: "spark-local" or "spark-cluster"
+        """
+        # Rule 1: Large data → prefer cluster (if available)
+        if estimate.estimated_data_mb > self.cluster_threshold_mb:
+            # Check if cluster is configured
+            if self._cluster_available():
+                return "spark-cluster"
+            else:
+                # Log warning about large data on local
+                # Note: Logging should be done by caller, we just return the engine
+                return "spark-local"
+
+        # Rule 2: Everything else → spark-local (default)
+        # spark-local is excellent for most workloads (<10GB)
+        return "spark-local"
+
+    def _cluster_available(self) -> bool:
+        """
+        Check if a Spark cluster is configured.
+
+        :returns: True if cluster compute engine is available
+        """
+        if not self.compute_registry:
+            return False
+
+        # Check if any cluster computes are registered (not spark-local)
+        clusters = self.compute_registry.list()
+        for cluster in clusters:
+            if cluster.type == "spark" and cluster.name != "spark-local":
+                # Check if it's actually a cluster (not local master)
+                config = cluster.config
+                if "master" in config:
+                    master = config.get("master", "")
+                    if not master.startswith("local"):
+                        return True
+                elif "host" in config or "cluster_id" in config:
+                    # Databricks or other remote cluster
+                    return True
+
+        return False
+
+    def get_recommendation_reason(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Get human-readable explanation for engine selection.
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: Explanation string
+        """
+        estimate = self._estimate_workload(node, analysis_result)
+        engine = self._apply_selection_logic(estimate)
+
+        reasons = []
+
+        if estimate.estimated_data_mb > self.cluster_threshold_mb:
+            reasons.append(
+                f"Large dataset ({estimate.estimated_data_mb:.0f} MB / {estimate.estimated_data_mb / 1024:.1f} GB)"
+            )
+            if engine == "spark-local":
+                reasons.append(
+                    "No cluster configured (consider registering a Spark cluster)"
+                )
+        else:
+            reasons.append(
+                f"Small/medium workload ({estimate.estimated_data_mb:.0f} MB, {estimate.source_count} sources)"
+            )
+
+        if estimate.source_count > 5:
+            reasons.append(f"Many sources ({estimate.source_count})")
+
+        if estimate.complexity_score > 0.7:
+            reasons.append(f"High complexity (score: {estimate.complexity_score:.2f})")
+
+        reason_str = "; ".join(reasons)
+        return f"Selected {engine}: {reason_str}"
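Everything in the selector above keys off the ~100 bytes/row heuristic and the 10,000 MB default threshold, so the cluster path only triggers above roughly 105 million estimated rows, and even then only when _cluster_available() finds a non-local Spark compute registered. A minimal sketch of how SmartComputeSelector might be driven is shown below; it assumes dvt-core 0.52.2 is installed, and the SimpleNamespace stand-ins only mimic the attributes the selector actually reads (real callers would pass a dbt Manifest, a ManifestNode, and a QueryAnalysisResult).

from types import SimpleNamespace

from dbt.compute.smart_selector import SmartComputeSelector

# Stand-in manifest: only .sources.get(<unique_id>).identifier is consulted.
manifest = SimpleNamespace(
    sources={"source.shop.warehouse.fact_orders": SimpleNamespace(identifier="fact_orders")}
)
# Stand-in node and analysis result: only the attributes read by _estimate_workload.
node = SimpleNamespace(
    compiled_code="SELECT customer_id, SUM(amount) FROM fact_orders GROUP BY customer_id"
)
analysis = SimpleNamespace(
    source_refs={"source.shop.warehouse.fact_orders"},
    source_connections={"warehouse"},
)

selector = SmartComputeSelector(manifest=manifest)
print(selector.select_engine(node, analysis))
# -> "spark-local": 1,000,000 estimated rows at ~100 bytes/row is ~95 MB, well under 10,000 MB
print(selector.get_recommendation_reason(node, analysis))
# -> e.g. "Selected spark-local: Small/medium workload (95 MB, 1 sources)"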
dbt/compute/strategies/__init__.py
@@ -0,0 +1,54 @@
+"""
+Spark Connection Strategies
+
+This module provides different strategies for connecting to Spark clusters.
+Uses the strategy pattern for flexible platform support.
+
+v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
+v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+"""
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt.compute.strategies.local import LocalStrategy
+
+# Strategies are imported lazily to avoid import errors when
+# optional dependencies are not installed
+
+
+def get_emr_strategy():
+    """
+    Lazily import and return EMRStrategy.
+
+    :returns: EMRStrategy class
+    """
+    from dbt.compute.strategies.emr import EMRStrategy
+    return EMRStrategy
+
+
+def get_dataproc_strategy():
+    """
+    Lazily import and return DataprocStrategy.
+
+    :returns: DataprocStrategy class
+    """
+    from dbt.compute.strategies.dataproc import DataprocStrategy
+    return DataprocStrategy
+
+
+def get_standalone_strategy():
+    """
+    Lazily import and return StandaloneStrategy.
+
+    :returns: StandaloneStrategy class
+    """
+    from dbt.compute.strategies.standalone import StandaloneStrategy
+    return StandaloneStrategy
+
+
+__all__ = [
+    "BaseConnectionStrategy",
+    "LocalStrategy",
+    "get_emr_strategy",
+    "get_dataproc_strategy",
+    "get_standalone_strategy",
+]
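The lazy getters above exist so that importing dbt.compute.strategies never pulls in the EMR, Dataproc, or standalone-cluster backends until one is actually requested. A small dispatch sketch follows; it assumes dvt-core 0.52.2 is installed, and the resolve_strategy_class helper and its platform mapping are illustrative rather than part of the package.

from dbt.compute import strategies

# Map platform names to the lazy accessors exported above; LocalStrategy is imported eagerly.
_LAZY_GETTERS = {
    "emr": strategies.get_emr_strategy,
    "dataproc": strategies.get_dataproc_strategy,
    "standalone": strategies.get_standalone_strategy,
}


def resolve_strategy_class(platform: str):
    """Return a strategy class, importing optional backends only on demand."""
    if platform == "local":
        return strategies.LocalStrategy
    try:
        return _LAZY_GETTERS[platform]()  # the backend module is imported only here
    except KeyError:
        raise ValueError(f"Unknown Spark platform: {platform}") from None


EMRStrategy = resolve_strategy_class("emr")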
dbt/compute/strategies/base.py
@@ -0,0 +1,165 @@
+"""
+Base Connection Strategy for Spark Engines
+
+Defines the abstract interface for different Spark connection strategies.
+Uses composition over inheritance for flexible platform support.
+
+v0.5.98: Added JAR provisioning and connectivity testing methods.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Set, Tuple
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class BaseConnectionStrategy(ABC):
+    """
+    Abstract base class for Spark connection strategies.
+
+    Different strategies implement different ways to connect to Spark:
+    - LocalStrategy: Embedded PySpark (in-process)
+    - DatabricksStrategy: Databricks Connect (remote cluster)
+    - EMRStrategy: AWS EMR cluster
+    - DataprocStrategy: GCP Dataproc
+    - StandaloneStrategy: Self-managed Spark clusters
+    """
+
+    def __init__(self, config: Dict[str, Any], app_name: str = "DVT-Compute"):
+        """
+        Initialize connection strategy.
+
+        :param config: Strategy-specific configuration
+        :param app_name: Spark application name
+        """
+        self.config = config
+        self.app_name = app_name
+
+    @abstractmethod
+    def get_spark_session(self) -> SparkSession:
+        """
+        Create and return a SparkSession.
+
+        :returns: Initialized SparkSession
+        :raises DbtRuntimeError: If session creation fails
+        """
+        pass
+
+    @abstractmethod
+    def validate_config(self) -> None:
+        """
+        Validate strategy-specific configuration.
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        pass
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for running on this platform.
+
+        Default implementation returns 0.0 (free). Override for cloud platforms.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        return 0.0
+
+    @abstractmethod
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        :param spark: SparkSession to clean up (may be None)
+        """
+        pass
+
+    def get_platform_name(self) -> str:
+        """
+        Get human-readable platform name.
+
+        :returns: Platform name (e.g., "local", "databricks", "emr")
+        """
+        return self.__class__.__name__.replace("Strategy", "").lower()
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark configuration for JDBC JAR provisioning.
+
+        Default implementation returns empty dict. Override in subclasses
+        to provide platform-specific JAR configuration.
+
+        Local platforms use spark.jars (local file paths).
+        Remote platforms use spark.jars.packages (Maven coordinates).
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary of Spark config keys/values (e.g., {"spark.jars": "..."})
+        """
+        return {}
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test basic connectivity to the Spark cluster.
+
+        Creates a session, runs a simple query, and returns status.
+        Override for platform-specific connectivity testing.
+
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+            # Run a simple SQL query to verify connectivity
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Session created and SQL test passed")
+        except Exception as e:
+            return (False, str(e))
+
+    def test_jdbc_connectivity(
+        self,
+        jdbc_url: str,
+        properties: Dict[str, str],
+        table_or_query: str = "(SELECT 1 AS test) AS t",
+    ) -> Tuple[bool, str]:
+        """
+        Test JDBC connectivity through the Spark cluster.
+
+        Creates a session and attempts to read from a JDBC source.
+        This verifies that JDBC drivers are properly configured.
+
+        :param jdbc_url: JDBC connection URL
+        :param properties: JDBC connection properties (user, password, driver)
+        :param table_or_query: Table name or SQL query wrapped in parentheses
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+
+            # Attempt JDBC read
+            df = (
+                spark.read.format("jdbc")
+                .option("url", jdbc_url)
+                .option("dbtable", table_or_query)
+                .options(**properties)
+                .load()
+            )
+
+            # Force evaluation
+            row_count = df.count()
+            return (True, f"JDBC read successful ({row_count} rows)")
+        except Exception as e:
+            error_msg = str(e)
+            # Provide helpful error messages for common issues
+            if "ClassNotFoundException" in error_msg:
+                return (False, f"JDBC driver not found: {error_msg}")
+            elif "No suitable driver" in error_msg:
+                return (False, f"JDBC driver not loaded: {error_msg}")
+            elif "Authentication" in error_msg.lower() or "password" in error_msg.lower():
+                return (False, f"Authentication failed: {error_msg}")
+            else:
+                return (False, f"JDBC test failed: {error_msg}")
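BaseConnectionStrategy leaves only get_spark_session, validate_config, and close abstract; test_connectivity, test_jdbc_connectivity, estimate_cost, and get_jar_provisioning_config are inherited. A minimal concrete subclass might look like the sketch below; it assumes pyspark and dvt-core 0.52.2 are installed, the InProcessStrategy class and its spark_conf config key are hypothetical, and the shipped strategies raise DbtRuntimeError where this sketch uses ValueError.

from typing import Optional

from pyspark.sql import SparkSession

from dbt.compute.strategies.base import BaseConnectionStrategy


class InProcessStrategy(BaseConnectionStrategy):
    """Illustrative strategy: an embedded local[*] session."""

    def validate_config(self) -> None:
        # Shipped strategies raise DbtRuntimeError; ValueError keeps this sketch dependency-free.
        if not isinstance(self.config, dict):
            raise ValueError("config must be a mapping")

    def get_spark_session(self) -> SparkSession:
        builder = SparkSession.builder.appName(self.app_name).master("local[*]")
        # Apply any extra Spark settings passed through the (hypothetical) spark_conf key.
        for key, value in self.config.get("spark_conf", {}).items():
            builder = builder.config(key, value)
        return builder.getOrCreate()

    def close(self, spark: Optional[SparkSession]) -> None:
        if spark is not None:
            spark.stop()


strategy = InProcessStrategy({"spark_conf": {"spark.sql.shuffle.partitions": "4"}})
ok, message = strategy.test_connectivity()  # inherited: builds a session and runs SELECT 1
print(ok, message)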