dvt-core 0.58.6 (dvt_core-0.58.6-cp311-cp311-macosx_10_9_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt/__init__.py +7 -0
- dbt/_pydantic_shim.py +26 -0
- dbt/artifacts/__init__.py +0 -0
- dbt/artifacts/exceptions/__init__.py +1 -0
- dbt/artifacts/exceptions/schemas.py +31 -0
- dbt/artifacts/resources/__init__.py +116 -0
- dbt/artifacts/resources/base.py +67 -0
- dbt/artifacts/resources/types.py +93 -0
- dbt/artifacts/resources/v1/analysis.py +10 -0
- dbt/artifacts/resources/v1/catalog.py +23 -0
- dbt/artifacts/resources/v1/components.py +274 -0
- dbt/artifacts/resources/v1/config.py +277 -0
- dbt/artifacts/resources/v1/documentation.py +11 -0
- dbt/artifacts/resources/v1/exposure.py +51 -0
- dbt/artifacts/resources/v1/function.py +52 -0
- dbt/artifacts/resources/v1/generic_test.py +31 -0
- dbt/artifacts/resources/v1/group.py +21 -0
- dbt/artifacts/resources/v1/hook.py +11 -0
- dbt/artifacts/resources/v1/macro.py +29 -0
- dbt/artifacts/resources/v1/metric.py +172 -0
- dbt/artifacts/resources/v1/model.py +145 -0
- dbt/artifacts/resources/v1/owner.py +10 -0
- dbt/artifacts/resources/v1/saved_query.py +111 -0
- dbt/artifacts/resources/v1/seed.py +41 -0
- dbt/artifacts/resources/v1/semantic_layer_components.py +72 -0
- dbt/artifacts/resources/v1/semantic_model.py +314 -0
- dbt/artifacts/resources/v1/singular_test.py +14 -0
- dbt/artifacts/resources/v1/snapshot.py +91 -0
- dbt/artifacts/resources/v1/source_definition.py +84 -0
- dbt/artifacts/resources/v1/sql_operation.py +10 -0
- dbt/artifacts/resources/v1/unit_test_definition.py +77 -0
- dbt/artifacts/schemas/__init__.py +0 -0
- dbt/artifacts/schemas/base.py +191 -0
- dbt/artifacts/schemas/batch_results.py +24 -0
- dbt/artifacts/schemas/catalog/__init__.py +11 -0
- dbt/artifacts/schemas/catalog/v1/__init__.py +0 -0
- dbt/artifacts/schemas/catalog/v1/catalog.py +59 -0
- dbt/artifacts/schemas/freshness/__init__.py +1 -0
- dbt/artifacts/schemas/freshness/v3/__init__.py +0 -0
- dbt/artifacts/schemas/freshness/v3/freshness.py +158 -0
- dbt/artifacts/schemas/manifest/__init__.py +2 -0
- dbt/artifacts/schemas/manifest/v12/__init__.py +0 -0
- dbt/artifacts/schemas/manifest/v12/manifest.py +211 -0
- dbt/artifacts/schemas/results.py +147 -0
- dbt/artifacts/schemas/run/__init__.py +2 -0
- dbt/artifacts/schemas/run/v5/__init__.py +0 -0
- dbt/artifacts/schemas/run/v5/run.py +184 -0
- dbt/artifacts/schemas/upgrades/__init__.py +4 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest.py +174 -0
- dbt/artifacts/schemas/upgrades/upgrade_manifest_dbt_version.py +2 -0
- dbt/artifacts/utils/validation.py +153 -0
- dbt/cli/__init__.py +1 -0
- dbt/cli/context.py +17 -0
- dbt/cli/exceptions.py +57 -0
- dbt/cli/flags.py +560 -0
- dbt/cli/main.py +2403 -0
- dbt/cli/option_types.py +121 -0
- dbt/cli/options.py +80 -0
- dbt/cli/params.py +844 -0
- dbt/cli/requires.py +490 -0
- dbt/cli/resolvers.py +50 -0
- dbt/cli/types.py +40 -0
- dbt/clients/__init__.py +0 -0
- dbt/clients/checked_load.py +83 -0
- dbt/clients/git.py +164 -0
- dbt/clients/jinja.py +206 -0
- dbt/clients/jinja_static.py +245 -0
- dbt/clients/registry.py +192 -0
- dbt/clients/yaml_helper.py +68 -0
- dbt/compilation.py +876 -0
- dbt/compute/__init__.py +14 -0
- dbt/compute/engines/__init__.py +12 -0
- dbt/compute/engines/spark_engine.cpython-311-darwin.so +0 -0
- dbt/compute/engines/spark_engine.py +642 -0
- dbt/compute/federated_executor.cpython-311-darwin.so +0 -0
- dbt/compute/federated_executor.py +1080 -0
- dbt/compute/filter_pushdown.cpython-311-darwin.so +0 -0
- dbt/compute/filter_pushdown.py +273 -0
- dbt/compute/jar_provisioning.cpython-311-darwin.so +0 -0
- dbt/compute/jar_provisioning.py +255 -0
- dbt/compute/java_compat.cpython-311-darwin.so +0 -0
- dbt/compute/java_compat.py +689 -0
- dbt/compute/jdbc_utils.cpython-311-darwin.so +0 -0
- dbt/compute/jdbc_utils.py +678 -0
- dbt/compute/metadata/__init__.py +40 -0
- dbt/compute/metadata/adapters_registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/adapters_registry.py +370 -0
- dbt/compute/metadata/registry.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/registry.py +674 -0
- dbt/compute/metadata/store.cpython-311-darwin.so +0 -0
- dbt/compute/metadata/store.py +1499 -0
- dbt/compute/smart_selector.cpython-311-darwin.so +0 -0
- dbt/compute/smart_selector.py +377 -0
- dbt/compute/strategies/__init__.py +55 -0
- dbt/compute/strategies/base.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/base.py +165 -0
- dbt/compute/strategies/dataproc.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/dataproc.py +207 -0
- dbt/compute/strategies/emr.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/emr.py +203 -0
- dbt/compute/strategies/local.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/local.py +443 -0
- dbt/compute/strategies/standalone.cpython-311-darwin.so +0 -0
- dbt/compute/strategies/standalone.py +262 -0
- dbt/config/__init__.py +4 -0
- dbt/config/catalogs.py +94 -0
- dbt/config/compute.cpython-311-darwin.so +0 -0
- dbt/config/compute.py +513 -0
- dbt/config/dvt_profile.cpython-311-darwin.so +0 -0
- dbt/config/dvt_profile.py +342 -0
- dbt/config/profile.py +422 -0
- dbt/config/project.py +873 -0
- dbt/config/project_utils.py +28 -0
- dbt/config/renderer.py +231 -0
- dbt/config/runtime.py +553 -0
- dbt/config/selectors.py +208 -0
- dbt/config/utils.py +77 -0
- dbt/constants.py +28 -0
- dbt/context/__init__.py +0 -0
- dbt/context/base.py +745 -0
- dbt/context/configured.py +135 -0
- dbt/context/context_config.py +382 -0
- dbt/context/docs.py +82 -0
- dbt/context/exceptions_jinja.py +178 -0
- dbt/context/macro_resolver.py +195 -0
- dbt/context/macros.py +171 -0
- dbt/context/manifest.py +72 -0
- dbt/context/providers.py +2249 -0
- dbt/context/query_header.py +13 -0
- dbt/context/secret.py +58 -0
- dbt/context/target.py +74 -0
- dbt/contracts/__init__.py +0 -0
- dbt/contracts/files.py +413 -0
- dbt/contracts/graph/__init__.py +0 -0
- dbt/contracts/graph/manifest.py +1904 -0
- dbt/contracts/graph/metrics.py +97 -0
- dbt/contracts/graph/model_config.py +70 -0
- dbt/contracts/graph/node_args.py +42 -0
- dbt/contracts/graph/nodes.py +1806 -0
- dbt/contracts/graph/semantic_manifest.py +232 -0
- dbt/contracts/graph/unparsed.py +811 -0
- dbt/contracts/project.py +417 -0
- dbt/contracts/results.py +53 -0
- dbt/contracts/selection.py +23 -0
- dbt/contracts/sql.py +85 -0
- dbt/contracts/state.py +68 -0
- dbt/contracts/util.py +46 -0
- dbt/deprecations.py +348 -0
- dbt/deps/__init__.py +0 -0
- dbt/deps/base.py +152 -0
- dbt/deps/git.py +195 -0
- dbt/deps/local.py +79 -0
- dbt/deps/registry.py +130 -0
- dbt/deps/resolver.py +149 -0
- dbt/deps/tarball.py +120 -0
- dbt/docs/source/_ext/dbt_click.py +119 -0
- dbt/docs/source/conf.py +32 -0
- dbt/env_vars.py +64 -0
- dbt/event_time/event_time.py +40 -0
- dbt/event_time/sample_window.py +60 -0
- dbt/events/__init__.py +15 -0
- dbt/events/base_types.py +36 -0
- dbt/events/core_types_pb2.py +2 -0
- dbt/events/logging.py +108 -0
- dbt/events/types.py +2516 -0
- dbt/exceptions.py +1486 -0
- dbt/flags.py +89 -0
- dbt/graph/__init__.py +11 -0
- dbt/graph/cli.py +249 -0
- dbt/graph/graph.py +172 -0
- dbt/graph/queue.py +214 -0
- dbt/graph/selector.py +374 -0
- dbt/graph/selector_methods.py +975 -0
- dbt/graph/selector_spec.py +222 -0
- dbt/graph/thread_pool.py +18 -0
- dbt/hooks.py +21 -0
- dbt/include/README.md +49 -0
- dbt/include/__init__.py +3 -0
- dbt/include/data/adapters_registry.duckdb +0 -0
- dbt/include/data/build_registry.py +242 -0
- dbt/include/data/csv/adapter_queries.csv +33 -0
- dbt/include/data/csv/syntax_rules.csv +9 -0
- dbt/include/data/csv/type_mappings_bigquery.csv +28 -0
- dbt/include/data/csv/type_mappings_databricks.csv +30 -0
- dbt/include/data/csv/type_mappings_mysql.csv +40 -0
- dbt/include/data/csv/type_mappings_oracle.csv +30 -0
- dbt/include/data/csv/type_mappings_postgres.csv +56 -0
- dbt/include/data/csv/type_mappings_redshift.csv +33 -0
- dbt/include/data/csv/type_mappings_snowflake.csv +38 -0
- dbt/include/data/csv/type_mappings_sqlserver.csv +35 -0
- dbt/include/starter_project/.gitignore +4 -0
- dbt/include/starter_project/README.md +15 -0
- dbt/include/starter_project/__init__.py +3 -0
- dbt/include/starter_project/analyses/.gitkeep +0 -0
- dbt/include/starter_project/dbt_project.yml +36 -0
- dbt/include/starter_project/macros/.gitkeep +0 -0
- dbt/include/starter_project/models/example/my_first_dbt_model.sql +27 -0
- dbt/include/starter_project/models/example/my_second_dbt_model.sql +6 -0
- dbt/include/starter_project/models/example/schema.yml +21 -0
- dbt/include/starter_project/seeds/.gitkeep +0 -0
- dbt/include/starter_project/snapshots/.gitkeep +0 -0
- dbt/include/starter_project/tests/.gitkeep +0 -0
- dbt/internal_deprecations.py +26 -0
- dbt/jsonschemas/__init__.py +3 -0
- dbt/jsonschemas/jsonschemas.py +309 -0
- dbt/jsonschemas/project/0.0.110.json +4717 -0
- dbt/jsonschemas/project/0.0.85.json +2015 -0
- dbt/jsonschemas/resources/0.0.110.json +2636 -0
- dbt/jsonschemas/resources/0.0.85.json +2536 -0
- dbt/jsonschemas/resources/latest.json +6773 -0
- dbt/links.py +4 -0
- dbt/materializations/__init__.py +0 -0
- dbt/materializations/incremental/__init__.py +0 -0
- dbt/materializations/incremental/microbatch.py +236 -0
- dbt/mp_context.py +8 -0
- dbt/node_types.py +37 -0
- dbt/parser/__init__.py +23 -0
- dbt/parser/analysis.py +21 -0
- dbt/parser/base.py +548 -0
- dbt/parser/common.py +266 -0
- dbt/parser/docs.py +52 -0
- dbt/parser/fixtures.py +51 -0
- dbt/parser/functions.py +30 -0
- dbt/parser/generic_test.py +100 -0
- dbt/parser/generic_test_builders.py +333 -0
- dbt/parser/hooks.py +118 -0
- dbt/parser/macros.py +137 -0
- dbt/parser/manifest.py +2204 -0
- dbt/parser/models.py +573 -0
- dbt/parser/partial.py +1178 -0
- dbt/parser/read_files.py +445 -0
- dbt/parser/schema_generic_tests.py +422 -0
- dbt/parser/schema_renderer.py +111 -0
- dbt/parser/schema_yaml_readers.py +935 -0
- dbt/parser/schemas.py +1466 -0
- dbt/parser/search.py +149 -0
- dbt/parser/seeds.py +28 -0
- dbt/parser/singular_test.py +20 -0
- dbt/parser/snapshots.py +44 -0
- dbt/parser/sources.py +558 -0
- dbt/parser/sql.py +62 -0
- dbt/parser/unit_tests.py +621 -0
- dbt/plugins/__init__.py +20 -0
- dbt/plugins/contracts.py +9 -0
- dbt/plugins/exceptions.py +2 -0
- dbt/plugins/manager.py +163 -0
- dbt/plugins/manifest.py +21 -0
- dbt/profiler.py +20 -0
- dbt/py.typed +1 -0
- dbt/query_analyzer.cpython-311-darwin.so +0 -0
- dbt/query_analyzer.py +410 -0
- dbt/runners/__init__.py +2 -0
- dbt/runners/exposure_runner.py +7 -0
- dbt/runners/no_op_runner.py +45 -0
- dbt/runners/saved_query_runner.py +7 -0
- dbt/selected_resources.py +8 -0
- dbt/task/__init__.py +0 -0
- dbt/task/base.py +503 -0
- dbt/task/build.py +197 -0
- dbt/task/clean.py +56 -0
- dbt/task/clone.py +161 -0
- dbt/task/compile.py +150 -0
- dbt/task/compute.cpython-311-darwin.so +0 -0
- dbt/task/compute.py +458 -0
- dbt/task/debug.py +505 -0
- dbt/task/deps.py +280 -0
- dbt/task/docs/__init__.py +3 -0
- dbt/task/docs/api/__init__.py +23 -0
- dbt/task/docs/api/catalog.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/catalog.py +204 -0
- dbt/task/docs/api/lineage.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/lineage.py +234 -0
- dbt/task/docs/api/profile.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/profile.py +204 -0
- dbt/task/docs/api/spark.cpython-311-darwin.so +0 -0
- dbt/task/docs/api/spark.py +186 -0
- dbt/task/docs/generate.py +947 -0
- dbt/task/docs/index.html +250 -0
- dbt/task/docs/serve.cpython-311-darwin.so +0 -0
- dbt/task/docs/serve.py +174 -0
- dbt/task/dvt_output.py +362 -0
- dbt/task/dvt_run.py +204 -0
- dbt/task/freshness.py +322 -0
- dbt/task/function.py +121 -0
- dbt/task/group_lookup.py +46 -0
- dbt/task/init.cpython-311-darwin.so +0 -0
- dbt/task/init.py +604 -0
- dbt/task/java.cpython-311-darwin.so +0 -0
- dbt/task/java.py +316 -0
- dbt/task/list.py +236 -0
- dbt/task/metadata.cpython-311-darwin.so +0 -0
- dbt/task/metadata.py +804 -0
- dbt/task/printer.py +175 -0
- dbt/task/profile.cpython-311-darwin.so +0 -0
- dbt/task/profile.py +1307 -0
- dbt/task/profile_serve.py +615 -0
- dbt/task/retract.py +438 -0
- dbt/task/retry.py +175 -0
- dbt/task/run.py +1387 -0
- dbt/task/run_operation.py +141 -0
- dbt/task/runnable.py +758 -0
- dbt/task/seed.py +103 -0
- dbt/task/show.py +149 -0
- dbt/task/snapshot.py +56 -0
- dbt/task/spark.cpython-311-darwin.so +0 -0
- dbt/task/spark.py +414 -0
- dbt/task/sql.py +110 -0
- dbt/task/target_sync.cpython-311-darwin.so +0 -0
- dbt/task/target_sync.py +766 -0
- dbt/task/test.py +464 -0
- dbt/tests/fixtures/__init__.py +1 -0
- dbt/tests/fixtures/project.py +620 -0
- dbt/tests/util.py +651 -0
- dbt/tracking.py +529 -0
- dbt/utils/__init__.py +3 -0
- dbt/utils/artifact_upload.py +151 -0
- dbt/utils/utils.py +408 -0
- dbt/version.py +270 -0
- dvt_cli/__init__.py +72 -0
- dvt_core-0.58.6.dist-info/METADATA +288 -0
- dvt_core-0.58.6.dist-info/RECORD +324 -0
- dvt_core-0.58.6.dist-info/WHEEL +5 -0
- dvt_core-0.58.6.dist-info/entry_points.txt +2 -0
- dvt_core-0.58.6.dist-info/top_level.txt +2 -0
dbt/compute/smart_selector.cpython-311-darwin.so: Binary file

dbt/compute/smart_selector.py:
@@ -0,0 +1,377 @@
+"""
+Smart Compute Engine Selector
+
+Selects compute engine based on DVT compute rules (NOT size-based).
+
+v0.56.0: Refactored to follow DVT compute rules:
+1. CLI --target-compute override (highest priority)
+2. Model-level config {{ config(compute='...') }}
+3. Default from computes.yml target_compute
+4. Pushdown when model and all inputs are in same target (no Spark needed)
+
+Selection is deterministic based on configuration, not data characteristics.
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Optional, Set
+
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import ManifestNode
+from dbt.query_analyzer import QueryAnalysisResult
+from dbt_common.exceptions import DbtRuntimeError
+
+
+class ExecutionStrategy(Enum):
+    """Execution strategy for a node."""
+
+    PUSHDOWN = "pushdown"  # Execute directly on target adapter (same connection)
+    FEDERATED = "federated"  # Execute via Spark for cross-target queries
+
+
+@dataclass
+class WorkloadEstimate:
+    """Estimated workload characteristics for a query."""
+
+    estimated_rows: int  # Estimated total rows to process
+    source_count: int  # Number of source tables
+    connection_count: int  # Number of different connections
+    has_aggregations: bool  # Query contains GROUP BY or aggregations
+    has_joins: bool  # Query contains JOIN operations
+    complexity_score: float  # 0.0 to 1.0, higher = more complex
+
+    @property
+    def estimated_data_mb(self) -> float:
+        """Rough estimate of data size in MB (assuming ~100 bytes/row)."""
+        return (self.estimated_rows * 100) / (1024 * 1024)
+
+
+class SmartComputeSelector:
+    """
+    Selects compute engine based on DVT compute rules.
+
+    v0.56.0: Rule-based selection (NO size-based logic).
+
+    Selection hierarchy (highest to lowest priority):
+    1. CLI --target-compute override
+    2. Model config: {{ config(compute='spark-cluster') }}
+    3. Default from computes.yml target_compute
+
+    Execution strategy:
+    - PUSHDOWN: When model and all inputs are in same target
+    - FEDERATED: When sources span multiple targets (requires Spark)
+    """
+
+    def __init__(
+        self,
+        manifest: Manifest,
+        compute_registry: Optional[Any] = None,
+        cli_target_compute: Optional[str] = None,
+    ):
+        """
+        Initialize smart selector.
+
+        :param manifest: The dbt manifest
+        :param compute_registry: ComputeRegistry instance for compute configuration
+        :param cli_target_compute: CLI --target-compute override (highest priority)
+        """
+        self.manifest = manifest
+        self.compute_registry = compute_registry
+        self.cli_target_compute = cli_target_compute
+
+    def select_engine(
+        self,
+        node: ManifestNode,
+        analysis_result: QueryAnalysisResult,
+        cli_override: Optional[str] = None,
+    ) -> str:
+        """
+        Select compute engine based on DVT rules.
+
+        v0.56.0: Rule-based selection (no size-based logic).
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param analysis_result: Query analysis result
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name (e.g., "spark-local", "spark-cluster")
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        # Determine execution strategy first
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        # For pushdown, no Spark compute needed
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "pushdown"
+
+        # For federated execution, select compute engine
+        return self._select_compute_for_federation(node, cli_override)
+
+    def _determine_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Determine whether to use pushdown or federation.
+
+        DVT Rule: Pushdown when model and ALL inputs are in same target.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy (PUSHDOWN or FEDERATED)
+        """
+        # Get target connection for this node
+        node_target = self._get_node_target(node)
+
+        # Get all source connections
+        source_connections = analysis_result.source_connections
+
+        # If no sources, can use pushdown (pure computation)
+        if not source_connections:
+            return ExecutionStrategy.PUSHDOWN
+
+        # Check if all sources are in the same connection as the target
+        if len(source_connections) == 1:
+            source_connection = next(iter(source_connections))
+            if source_connection == node_target:
+                # Same connection - use pushdown
+                return ExecutionStrategy.PUSHDOWN
+
+        # Multiple connections or different target - must federate
+        return ExecutionStrategy.FEDERATED
+
+    def _get_node_target(self, node: ManifestNode) -> str:
+        """
+        Get the target connection for a node.
+
+        :param node: The manifest node
+        :returns: Target connection name
+        """
+        # Check if node has explicit target config
+        if hasattr(node, "config") and hasattr(node.config, "target"):
+            if node.config.target:
+                return node.config.target
+
+        # Otherwise, use default target from manifest
+        # Note: In DVT, this comes from profiles.yml default target
+        return "default"
+
+    def _select_compute_for_federation(
+        self, node: ManifestNode, cli_override: Optional[str] = None
+    ) -> str:
+        """
+        Select compute engine for federated execution.
+
+        Priority:
+        1. cli_override parameter (passed at call time)
+        2. self.cli_target_compute (passed at init time)
+        3. Model config: {{ config(compute='...') }}
+        4. Default from computes.yml target_compute
+
+        :param node: The node to execute
+        :param cli_override: CLI --target-compute override
+        :returns: Compute engine name
+        :raises DbtRuntimeError: If specified compute doesn't exist
+        """
+        compute_name = None
+
+        # Priority 1: CLI override (call-time)
+        if cli_override:
+            compute_name = cli_override
+
+        # Priority 2: CLI override (init-time)
+        elif self.cli_target_compute:
+            compute_name = self.cli_target_compute
+
+        # Priority 3: Model-level config
+        elif hasattr(node, "config") and hasattr(node.config, "compute"):
+            if node.config.compute:
+                compute_name = node.config.compute
+
+        # Priority 4: Default from computes.yml
+        elif self.compute_registry:
+            compute_name = self.compute_registry.target_compute
+
+        # Fallback if no registry
+        if not compute_name:
+            compute_name = "spark-local"
+
+        # Validate the compute engine exists
+        if self.compute_registry and not self.compute_registry.exists(compute_name):
+            available = [c.name for c in self.compute_registry.list()]
+            raise DbtRuntimeError(
+                f"Compute engine '{compute_name}' not found. "
+                f"Available engines: {', '.join(available)}"
+            )
+
+        return compute_name
+
+    def _estimate_workload(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> WorkloadEstimate:
+        """
+        Estimate workload characteristics for a node.
+
+        Note: Used for informational purposes only, NOT for compute selection.
+
+        :param node: The node to analyze
+        :param analysis_result: Query analysis result
+        :returns: WorkloadEstimate
+        """
+        # Count sources
+        source_count = len(analysis_result.source_refs)
+        connection_count = len(analysis_result.source_connections)
+
+        # Estimate row count (informational only)
+        estimated_rows = self._estimate_row_count(analysis_result.source_refs)
+
+        # Analyze SQL for complexity (informational only)
+        sql = node.compiled_code if hasattr(node, "compiled_code") else node.raw_code
+        has_aggregations = self._has_aggregations(sql)
+        has_joins = self._has_joins(sql)
+
+        # Calculate complexity score (informational only)
+        complexity_score = self._calculate_complexity(
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+        )
+
+        return WorkloadEstimate(
+            estimated_rows=estimated_rows,
+            source_count=source_count,
+            connection_count=connection_count,
+            has_aggregations=has_aggregations,
+            has_joins=has_joins,
+            complexity_score=complexity_score,
+        )
+
+    def _estimate_row_count(self, source_refs: set) -> int:
+        """
+        Estimate total row count from source tables.
+
+        Note: Used for informational purposes only.
+
+        :param source_refs: Set of source unique_ids
+        :returns: Estimated row count
+        """
+        total_rows = 0
+
+        for source_id in source_refs:
+            source = self.manifest.sources.get(source_id)
+            if not source:
+                total_rows += 100000
+                continue
+
+            # Heuristic based on naming (informational only)
+            if (
+                "fact" in source.identifier.lower()
+                or "events" in source.identifier.lower()
+            ):
+                total_rows += 1000000
+            elif (
+                "dim" in source.identifier.lower()
+                or "lookup" in source.identifier.lower()
+            ):
+                total_rows += 10000
+            else:
+                total_rows += 100000
+
+        return total_rows
+
+    def _has_aggregations(self, sql: str) -> bool:
+        """Check if SQL contains aggregations."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " GROUP BY ",
+                " SUM(",
+                " COUNT(",
+                " AVG(",
+                " MIN(",
+                " MAX(",
+                " HAVING ",
+            ]
+        )
+
+    def _has_joins(self, sql: str) -> bool:
+        """Check if SQL contains joins."""
+        sql_upper = sql.upper()
+        return any(
+            keyword in sql_upper
+            for keyword in [
+                " JOIN ",
+                " INNER JOIN ",
+                " LEFT JOIN ",
+                " RIGHT JOIN ",
+                " FULL JOIN ",
+                " CROSS JOIN ",
+            ]
+        )
+
+    def _calculate_complexity(
+        self,
+        source_count: int,
+        connection_count: int,
+        has_aggregations: bool,
+        has_joins: bool,
+    ) -> float:
+        """Calculate query complexity score (0.0 to 1.0)."""
+        score = 0.0
+        score += min(source_count / 10.0, 0.3)
+        score += min(connection_count / 5.0, 0.2)
+        if has_aggregations:
+            score += 0.2
+        if has_joins:
+            score += 0.3
+        return min(score, 1.0)
+
+    def get_execution_strategy(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> ExecutionStrategy:
+        """
+        Get the execution strategy for a node (public API).
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: ExecutionStrategy enum
+        """
+        return self._determine_execution_strategy(node, analysis_result)
+
+    def get_recommendation_reason(
+        self, node: ManifestNode, analysis_result: QueryAnalysisResult
+    ) -> str:
+        """
+        Get human-readable explanation for engine selection.
+
+        :param node: The node
+        :param analysis_result: Query analysis result
+        :returns: Explanation string
+        """
+        strategy = self._determine_execution_strategy(node, analysis_result)
+
+        if strategy == ExecutionStrategy.PUSHDOWN:
+            return "Pushdown: All sources in same target connection - executing directly"
+
+        # Federated execution
+        engine = self._select_compute_for_federation(node)
+        estimate = self._estimate_workload(node, analysis_result)
+
+        reasons = []
+        reasons.append(f"Cross-target query ({estimate.connection_count} connections)")
+
+        if self.cli_target_compute:
+            reasons.append(f"CLI override: --target-compute {self.cli_target_compute}")
+        elif hasattr(node, "config") and hasattr(node.config, "compute") and node.config.compute:
+            reasons.append(f"Model config: compute='{node.config.compute}'")
+        else:
+            reasons.append("Using default from computes.yml")
+
+        reason_str = "; ".join(reasons)
+        return f"Federated ({engine}): {reason_str}"
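
The smart_selector module above exposes the whole selection flow through two calls: get_execution_strategy() decides pushdown vs. federation, and select_engine() resolves the compute name through the four-level priority chain. A minimal sketch of driving it, assuming the dvt-core wheel is installed; the SimpleNamespace objects below are hypothetical duck-typed stand-ins for DVT's real ManifestNode and QueryAnalysisResult, which the selector only reads attributes from:

    from types import SimpleNamespace

    from dbt.compute.smart_selector import ExecutionStrategy, SmartComputeSelector

    # Stand-in analysis result: two source connections means a cross-target query.
    analysis = SimpleNamespace(
        source_connections={"postgres_prod", "snowflake_dw"},
        source_refs={"source.proj.pg.orders", "source.proj.sf.customers"},
    )
    # Stand-in node whose model config pins compute='spark-cluster'.
    node = SimpleNamespace(
        config=SimpleNamespace(target=None, compute="spark-cluster"),
        raw_code="select * from a join b on a.id = b.id",
    )

    # manifest is untouched on this code path, so None suffices for the sketch.
    selector = SmartComputeSelector(manifest=None, compute_registry=None)

    # Two connections -> FEDERATED, so a Spark compute must be chosen.
    assert selector.get_execution_strategy(node, analysis) is ExecutionStrategy.FEDERATED
    print(selector.select_engine(node, analysis))                 # 'spark-cluster' (model config)
    print(selector.select_engine(node, analysis, "spark-local"))  # 'spark-local' (CLI override wins)

With no compute_registry configured, an unset model config would fall back to "spark-local", matching the fallback branch in _select_compute_for_federation.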
dbt/compute/strategies/__init__.py:
@@ -0,0 +1,55 @@
+"""
+Spark Connection Strategies
+
+This module provides different strategies for connecting to Spark clusters.
+Uses the strategy pattern for flexible platform support.
+
+v0.5.98: Added EMRStrategy, DataprocStrategy, and StandaloneStrategy.
+v0.51.2: Removed Databricks support (serverless cannot read external JDBC sources).
+"""
+
+from dbt.compute.strategies.base import BaseConnectionStrategy
+from dbt.compute.strategies.local import LocalStrategy, cleanup_all_spark_sessions
+
+# Strategies are imported lazily to avoid import errors when
+# optional dependencies are not installed
+
+
+def get_emr_strategy():
+    """
+    Lazily import and return EMRStrategy.
+
+    :returns: EMRStrategy class
+    """
+    from dbt.compute.strategies.emr import EMRStrategy
+    return EMRStrategy
+
+
+def get_dataproc_strategy():
+    """
+    Lazily import and return DataprocStrategy.
+
+    :returns: DataprocStrategy class
+    """
+    from dbt.compute.strategies.dataproc import DataprocStrategy
+    return DataprocStrategy
+
+
+def get_standalone_strategy():
+    """
+    Lazily import and return StandaloneStrategy.
+
+    :returns: StandaloneStrategy class
+    """
+    from dbt.compute.strategies.standalone import StandaloneStrategy
+    return StandaloneStrategy
+
+
+__all__ = [
+    "BaseConnectionStrategy",
+    "LocalStrategy",
+    "cleanup_all_spark_sessions",
+    "get_emr_strategy",
+    "get_dataproc_strategy",
+    "get_standalone_strategy",
+]
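
The lazy accessors in this __init__ keep heavyweight platform dependencies out of the import path until a remote platform is actually requested. A sketch of how a caller might resolve them, assuming dvt-core plus the relevant optional extras are installed and that subclasses keep the base constructor signature (config, app_name); the ACCESSORS mapping and build_strategy helper are illustrative, not part of the package:

    from dbt.compute import strategies

    # Hypothetical mapping from a platform key to its lazy accessor.
    ACCESSORS = {
        "emr": strategies.get_emr_strategy,
        "dataproc": strategies.get_dataproc_strategy,
        "standalone": strategies.get_standalone_strategy,
    }

    def build_strategy(platform: str, config: dict):
        """Resolve a strategy class; platform imports happen here, not at startup."""
        if platform == "local":
            return strategies.LocalStrategy(config)  # no optional extras needed
        strategy_cls = ACCESSORS[platform]()  # lazy import of the platform module
        return strategy_cls(config)

A missing optional dependency therefore surfaces as an ImportError only for the platform that needs it, instead of breaking every dbt invocation at import time.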
dbt/compute/strategies/base.cpython-311-darwin.so: Binary file

dbt/compute/strategies/base.py:
@@ -0,0 +1,165 @@
+"""
+Base Connection Strategy for Spark Engines
+
+Defines the abstract interface for different Spark connection strategies.
+Uses composition over inheritance for flexible platform support.
+
+v0.5.98: Added JAR provisioning and connectivity testing methods.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Set, Tuple
+
+try:
+    from pyspark.sql import SparkSession
+
+    PYSPARK_AVAILABLE = True
+except ImportError:
+    PYSPARK_AVAILABLE = False
+    SparkSession = None
+
+
+class BaseConnectionStrategy(ABC):
+    """
+    Abstract base class for Spark connection strategies.
+
+    Different strategies implement different ways to connect to Spark:
+    - LocalStrategy: Embedded PySpark (in-process)
+    - DatabricksStrategy: Databricks Connect (remote cluster)
+    - EMRStrategy: AWS EMR cluster
+    - DataprocStrategy: GCP Dataproc
+    - StandaloneStrategy: Self-managed Spark clusters
+    """
+
+    def __init__(self, config: Dict[str, Any], app_name: str = "DVT-Compute"):
+        """
+        Initialize connection strategy.
+
+        :param config: Strategy-specific configuration
+        :param app_name: Spark application name
+        """
+        self.config = config
+        self.app_name = app_name
+
+    @abstractmethod
+    def get_spark_session(self) -> SparkSession:
+        """
+        Create and return a SparkSession.
+
+        :returns: Initialized SparkSession
+        :raises DbtRuntimeError: If session creation fails
+        """
+        pass
+
+    @abstractmethod
+    def validate_config(self) -> None:
+        """
+        Validate strategy-specific configuration.
+
+        :raises DbtRuntimeError: If configuration is invalid
+        """
+        pass
+
+    def estimate_cost(self, duration_minutes: float) -> float:
+        """
+        Estimate cost for running on this platform.
+
+        Default implementation returns 0.0 (free). Override for cloud platforms.
+
+        :param duration_minutes: Estimated query duration in minutes
+        :returns: Estimated cost in USD
+        """
+        return 0.0
+
+    @abstractmethod
+    def close(self, spark: Optional[SparkSession]) -> None:
+        """
+        Clean up Spark session.
+
+        :param spark: SparkSession to clean up (may be None)
+        """
+        pass
+
+    def get_platform_name(self) -> str:
+        """
+        Get human-readable platform name.
+
+        :returns: Platform name (e.g., "local", "databricks", "emr")
+        """
+        return self.__class__.__name__.replace("Strategy", "").lower()
+
+    def get_jar_provisioning_config(self, adapter_types: Set[str]) -> Dict[str, str]:
+        """
+        Get Spark configuration for JDBC JAR provisioning.
+
+        Default implementation returns empty dict. Override in subclasses
+        to provide platform-specific JAR configuration.
+
+        Local platforms use spark.jars (local file paths).
+        Remote platforms use spark.jars.packages (Maven coordinates).
+
+        :param adapter_types: Set of adapter types that need JDBC drivers
+        :returns: Dictionary of Spark config keys/values (e.g., {"spark.jars": "..."})
+        """
+        return {}
+
+    def test_connectivity(self) -> Tuple[bool, str]:
+        """
+        Test basic connectivity to the Spark cluster.
+
+        Creates a session, runs a simple query, and returns status.
+        Override for platform-specific connectivity testing.
+
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+            # Run a simple SQL query to verify connectivity
+            spark.sql("SELECT 1 AS test").collect()
+            return (True, "Session created and SQL test passed")
+        except Exception as e:
+            return (False, str(e))
+
+    def test_jdbc_connectivity(
+        self,
+        jdbc_url: str,
+        properties: Dict[str, str],
+        table_or_query: str = "(SELECT 1 AS test) AS t",
+    ) -> Tuple[bool, str]:
+        """
+        Test JDBC connectivity through the Spark cluster.
+
+        Creates a session and attempts to read from a JDBC source.
+        This verifies that JDBC drivers are properly configured.
+
+        :param jdbc_url: JDBC connection URL
+        :param properties: JDBC connection properties (user, password, driver)
+        :param table_or_query: Table name or SQL query wrapped in parentheses
+        :returns: Tuple of (success, message)
+        """
+        try:
+            spark = self.get_spark_session()
+
+            # Attempt JDBC read
+            df = (
+                spark.read.format("jdbc")
+                .option("url", jdbc_url)
+                .option("dbtable", table_or_query)
+                .options(**properties)
+                .load()
+            )
+
+            # Force evaluation
+            row_count = df.count()
+            return (True, f"JDBC read successful ({row_count} rows)")
+        except Exception as e:
+            error_msg = str(e)
+            # Provide helpful error messages for common issues
+            if "ClassNotFoundException" in error_msg:
+                return (False, f"JDBC driver not found: {error_msg}")
+            elif "No suitable driver" in error_msg:
+                return (False, f"JDBC driver not loaded: {error_msg}")
+            elif "Authentication" in error_msg.lower() or "password" in error_msg.lower():
+                return (False, f"Authentication failed: {error_msg}")
+            else:
+                return (False, f"JDBC test failed: {error_msg}")
dbt/compute/strategies/dataproc.cpython-311-darwin.so: Binary file
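
To round out the strategy interface defined in base.py above, here is a minimal concrete subclass, assuming pyspark is installed; InProcessStrategy is a hypothetical illustration, not DVT's shipped LocalStrategy. It implements the three abstract methods and inherits test_connectivity() unchanged:

    from pyspark.sql import SparkSession

    from dbt.compute.strategies.base import BaseConnectionStrategy

    class InProcessStrategy(BaseConnectionStrategy):
        """Hypothetical strategy running Spark in local[*] mode."""

        def validate_config(self) -> None:
            pass  # nothing required for an in-process session

        def get_spark_session(self) -> SparkSession:
            builder = SparkSession.builder.appName(self.app_name).master("local[*]")
            # Apply any JAR provisioning config (empty dict by default in the base class).
            for key, value in self.get_jar_provisioning_config(set()).items():
                builder = builder.config(key, value)
            return builder.getOrCreate()

        def close(self, spark) -> None:
            if spark is not None:
                spark.stop()

    ok, message = InProcessStrategy(config={}).test_connectivity()
    print(ok, message)  # True, "Session created and SQL test passed" when Spark starts

Because test_connectivity() and test_jdbc_connectivity() live on the base class, every platform strategy gets uniform health checks for free; only session creation, validation, and teardown vary per platform.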